# Picture by Marguerite Siboni
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the cereal data set and add a derived column: calories normalised by
# the serving size in cups.
df = pd.read_csv("./80-cereals/cereal.csv")
df["cal_per_cup"] = df["calories"].div(df["cups"])
# Summary statistics for the new column (the value is only displayed when
# this runs in an interactive session; as a script the result is discarded).
df["cal_per_cup"].describe()
# Box plot of calories per cup over all cereals, with whiskers spanning the
# full range of the data so that no point is drawn as an outlier.
plt.figure(figsize=(4, 8))
# The series is passed as `y=` so the plot is vertical no matter how the
# installed seaborn interprets a bare positional argument.
# `whis=(0, 100)` places the whiskers at the 0th and 100th percentiles
# (min and max); it replaces the "range" string, which newer matplotlib
# releases no longer accept.
sns.boxplot(y=df.cal_per_cup, whis=(0, 100), color="#BFD0FE")
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals")
# Box plot of calories per cup over all cereals using seaborn's default
# whisker rule, so points beyond the whiskers are drawn individually.
plt.figure(figsize=(4, 8))
# Passing the series as `y=` keeps the plot vertical regardless of how the
# installed seaborn treats a bare positional argument.
sns.boxplot(y=df.cal_per_cup, color="#F6A6A0")
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals")
# Box plot of calories per cup, one box per shelf position.
plt.figure(figsize=(15, 8))
# Group by the `shelf` column directly instead of passing hand-built lists
# of labels and per-shelf series, which is not a valid seaborn call.
# `order` pins the category order so the tick labels below always line up
# with shelves 1, 2 and 3.
sns.boxplot(
    x=df.shelf,
    y=df.cal_per_cup,
    order=[1, 2, 3],
    color="#F6A6A0",
)
# Replace the numeric shelf codes with readable names.
plt.xticks([0, 1, 2], ["Bottom Shelf", "Middle Shelf", "Top Shelf"])
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals by Shelf Placement", fontsize=20)
# Summary statistics for shelf 3 only (displayed only in an interactive
# session; as a script the result is discarded).
df[df.shelf == 3].cal_per_cup.describe()
# Histogram of calories per cup over all cereals, in 20 bins.
plt.figure(figsize=(8, 4))
plt.hist(df.cal_per_cup, bins=20, color="#BDFCC8", edgecolor="#1F8F50")
# The binned values run along the x-axis; the y-axis is the number of rows
# (cereals) falling in each bin, so the labels are assigned accordingly.
plt.xlabel("Calories Per Cup")
plt.ylabel("Number of Cereals")
plt.title("Distribution of Calories in Cereals")
# Scatter plot of fiber against calories per cup, with the marginal
# distribution of each variable drawn along the matching axis.
sns.jointplot(
    data=df,
    x="cal_per_cup",
    y="fiber",
    kind="scatter",
    color="#1F8F50",
)
# Violin plot of calories per cup over all cereals.
plt.figure(figsize=(4, 8))
# Passing the series as `y=` keeps the violin vertical regardless of how the
# installed seaborn treats a bare positional argument.
sns.violinplot(y=df.cal_per_cup, color="#F0BFFF")
plt.ylabel("Calories Per Cup")
plt.title("Distribution of Calories in Cereals")
# Violin plot of calories per cup, one violin per shelf position.
plt.figure(figsize=(15, 8))
sns.violinplot(data=df, x="shelf", y="cal_per_cup", color="#F0BFFF")
# Swap the numeric shelf codes on the x-axis for readable names.
shelf_names = ["Bottom shelf", "Middle Shelf", "Top Shelf"]
plt.xticks([0, 1, 2], shelf_names)
plt.title("Distribution of Calories in Cereals by Shelf Placement", fontsize=20)
plt.xlabel("Cereal Placement", fontsize=16)
plt.ylabel("Calories Per Cup", fontsize=16)
plt.tight_layout()
# Split violin plot of calories per cup by shelf position, comparing the two
# manufacturers coded "K" and "G" side by side within each violin.
plt.figure(figsize=(15, 8))
# Filter once rather than rebuilding the same boolean mask for x, y and hue.
kg_cereals = df[df.mfr.isin(["K", "G"])]
plot = sns.violinplot(
    x=kg_cereals.shelf,
    y=kg_cereals.cal_per_cup,
    hue=kg_cereals.mfr,
    # Pin the category and hue order so the hard-coded tick and legend
    # labels below cannot end up on the wrong shelf or manufacturer.
    order=[1, 2, 3],
    hue_order=["K", "G"],
    split=True,
    color="#F0BFFF",
)
handles, labels = plot.get_legend_handles_labels()
plt.xticks([0, 1, 2], ["Bottom Shelf", "Middle Shelf", "Top Shelf"])
plt.ylabel("Calories Per Cup", fontsize=16)
plt.xlabel("Cereal Placement", fontsize=16)
# Reuse seaborn's legend handles but show full manufacturer names instead of
# the one-letter codes; the order matches `hue_order` above.
plt.legend(
    [handles[0], handles[1]], ["Kelloggs", "General Mills"], title="Manufacturer"
)
plt.title(
    "Distribution of Calories in Cereals \nby Shelf Placement and Manufacturer",
    fontsize=20,
)
plt.tight_layout()