Some Useful Functions for Plotting Quantitative Variables
Picture by Marguerite Siboni
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Read the cereal table and add a derived column: calories divided by cups.
# NOTE(review): presumably `calories` is per serving and `cups` is the serving
# size in cups, making this a per-volume figure — confirm against the data set.
df = pd.read_csv("./80-cereals/cereal.csv")
df["cal_per_cup"] = df["calories"].div(df["cups"])
In [10]:
# Summary statistics (count, mean, std, quartiles, extremes) of the derived column.
df.cal_per_cup.describe()
Out[10]:
count     77.000000
mean     143.473469
std       60.224368
min       50.000000
25%      110.000000
50%      134.328358
75%      160.000000
max      440.000000
Name: cal_per_cup, dtype: float64
In [15]:
# Vertical box plot of calories per cup whose whiskers span the full data
# range, so no point is drawn as an outlier.
plt.figure(figsize=(4, 8))
sns.boxplot(
    # Pass the vector by keyword: newer seaborn reserves the first positional
    # argument for `data` and warns about positional x/y vectors.
    y=df.cal_per_cup,
    orient="v",
    # The string "range" is no longer accepted by Matplotlib; the percentile
    # pair (0, 100) puts the whiskers at the minimum and maximum instead.
    whis=(0, 100),
    color="#BFD0FE",
)
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals")
Out[15]:
Text(0.5, 1.0, 'Distribution of Calories in Cereals')
In [16]:
# Vertical box plot of calories per cup with seaborn's default whiskers, so
# points beyond them are drawn individually as outliers.
plt.figure(figsize=(4, 8))
# Pass the vector by keyword: newer seaborn reserves the first positional
# argument for `data` and warns about positional x/y vectors.
sns.boxplot(y=df.cal_per_cup, orient="v", color="#F6A6A0")
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals")
Out[16]:
Text(0.5, 1.0, 'Distribution of Calories in Cereals')
In [24]:
# One box per shelf, comparing calories per cup across shelf placements.
plt.figure(figsize=(15, 8))
sns.boxplot(
    # Group by the shelf column directly; seaborn needs x and y of equal
    # length, not a list of labels paired with a list of per-shelf Series.
    x=df.shelf,
    y=df.cal_per_cup,
    # Pin the category order so the tick labels below cannot be mismatched.
    order=[1, 2, 3],
    orient="v",
    color="#F6A6A0",
)
# Shelf codes 1, 2, 3 are drawn at tick positions 0, 1, 2.
plt.xticks([0, 1, 2], ["Bottom Shelf", "Middle Shelf", "Top Shelf"])
plt.xlabel("Cereal Placement")
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals by Shelf Placement", fontsize=20)
In [27]:
df[df.shelf == 3].cal_per_cup.describe()
Out[27]:
count     36.000000
mean     167.803282
std       77.216977
min       50.000000
25%      112.727273
50%      150.000000
75%      202.238806
max      440.000000
Name: cal_per_cup, dtype: float64
In [28]:
# Histogram of calories per cup using 20 bins.
plt.figure(figsize=(8, 4))
plt.hist(df.cal_per_cup, bins=20, color="#BDFCC8", edgecolor="#1F8F50")
# The measured variable is on the x-axis; the y-axis holds the bin counts.
plt.xlabel("Calories Per Cup")
plt.ylabel("Number of Cereals")
plt.title("Distribution of Calories in Cereals")
Out[28]:
Text(0.5, 1.0, 'Distribution of Calories in Cereals')
In [29]:
# Scatter plot of fiber against calories per cup, with each variable's
# marginal distribution drawn along its axis.
sns.jointplot(data=df, x="cal_per_cup", y="fiber", kind="scatter", color="#1F8F50")
c:\users\you\anaconda3\lib\site-packages\seaborn\distributions.py:218: MatplotlibDeprecationWarning: 
The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  color=hist_color, **hist_kws)
Out[29]:
<seaborn.axisgrid.JointGrid at 0x2a5316adeb8>
In [30]:
# Vertical violin plot showing the distribution of calories per cup.
plt.figure(figsize=(4, 8))
# Pass the vector by keyword: newer seaborn reserves the first positional
# argument for `data` and warns about positional x/y vectors.
sns.violinplot(y=df.cal_per_cup, color="#F0BFFF", orient="v")
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals")
Out[30]:
Text(0.5, 1.0, 'Distribution of Calories in Cereals')
In [32]:
# One violin per shelf, comparing calories per cup across shelf placements.
plt.figure(figsize=(15, 8))
sns.violinplot(
    x=df.shelf,
    y=df.cal_per_cup,
    # Pin the category order so the hard-coded tick labels below always line
    # up with shelf codes 1, 2, 3 instead of relying on inferred ordering.
    order=[1, 2, 3],
    color="#F0BFFF",
)
# Shelf codes 1, 2, 3 are drawn at tick positions 0, 1, 2.
plt.xticks([0, 1, 2], ["Bottom Shelf", "Middle Shelf", "Top Shelf"])
plt.ylabel("Calories Per Cup", fontsize=16)
plt.xlabel("Cereal Placement", fontsize=16)
plt.title("Distribution of Calories in Cereals by Shelf Placement", fontsize=20)
plt.tight_layout()
In [34]:
# Split violins per shelf: each half shows one of the two manufacturers being
# compared, so their calorie distributions can be read side by side.
plt.figure(figsize=(15, 8))
# Filter once instead of repeating the same boolean mask for x, y and hue.
kg_cereals = df[df.mfr.isin(["K", "G"])]
plot = sns.violinplot(
    x=kg_cereals.shelf,
    y=kg_cereals.cal_per_cup,
    hue=kg_cereals.mfr,
    # Pin the shelf order so the tick labels below match codes 1, 2, 3.
    order=[1, 2, 3],
    # Pin the hue order so handles[0] is "K" and handles[1] is "G"; otherwise
    # it depends on which code happens to appear first in the data.
    hue_order=["K", "G"],
    split=True,
    color="#F0BFFF",
)
handles, labels = plot.get_legend_handles_labels()
plt.xticks([0, 1, 2], ["Bottom Shelf", "Middle Shelf", "Top Shelf"])
plt.ylabel("Calories Per Cup", fontsize=16)
plt.xlabel("Cereal Placement", fontsize=16)
# NOTE(review): assumes mfr code "K" is Kelloggs and "G" is General Mills, as
# the label order here implies — confirm against the data set's documentation.
plt.legend(
    [handles[0], handles[1]], ["Kelloggs", "General Mills"], title="Manufacturer"
)
plt.title(
    "Distribution of Calories in Cereals \nby Shelf Placement and Manufacturer",
    fontsize=20,
)
plt.tight_layout()