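# Spaces: Sleeping
# NOTE(review): the line above looks like status-banner residue from the page
# this file was copied from; it is not part of the program.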
import math | |
import numpy as np | |
import pandas as pd | |
import requests | |
import seaborn as sns | |
import streamlit as st | |
from matplotlib import pyplot as plt | |
# Columns of the per-category dataframe that are shown to the user in the
# "Show data" expander. Kept as a list because it is used directly as a
# ``.loc`` column selector.
COLUMN_DISPLAY = [
    "code",
    "shape",
    "number_of_units",
    "material",
    "weight",
    "is_main_component",
    "weight_all_units",
    "quantity_per_unit",
    "product_quantity",
]
def get_category_names():
    """Download the category taxonomy and map each category ID to a display name.

    Returns:
        A dict whose keys are the taxonomy IDs and whose values are the
        English names of the categories. When an entry has no English name
        (or no ``name`` mapping at all), the ID itself is used as the name.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    response = requests.get(
        "https://static.openfoodfacts.org/data/taxonomies/categories.json",
        # Without a timeout, requests waits indefinitely on a stalled server.
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return {
        id_: (item.get("name") or {}).get("en", id_)
        for id_, item in response.json().items()
    }
def get_base_df(drop_uncomplete: bool = True) -> pd.DataFrame:
    """Download the packaging dataset and derive the per-element weight columns.

    Rows without ``categories_tags`` are always discarded. The returned frame
    has these columns added:

    * ``weight``: ``weight_specified`` when present, else ``weight_measured``.
    * ``weight_per_100g_of_product``: ``weight * 100 / product_quantity``.
    * ``weight_all_units``: ``weight * number_of_units``.
    * ``is_main_component``: True for the element(s) whose ``weight_all_units``
      equals the largest value among rows sharing the same ``code``.
    * ``percent_total_weight``: share (in percent) of ``weight_all_units`` in
      the sum over all rows sharing the same ``code``.

    The ``weight_measured`` and ``weight_specified`` columns are removed.

    Args:
        drop_uncomplete: when True, every row of a product (rows sharing a
            ``code``) is dropped as soon as one of its rows lacks a weight or
            a number of units. When False, only the rows lacking a number of
            units are dropped.

    Returns:
        The prepared dataframe.
    """
    df = pd.read_csv(
        "https://world.openfoodfacts.org/data/packagings.packagings-with-weights.csv",
        delimiter="\t",
        dtype={"categories_tags": str, "code": str},
    )
    # Don't keep elements with unknown categories
    df = df[df.categories_tags.notna()].copy()

    # Fetch weight specified by the producer first and fallback on user
    # measured weight otherwise
    df["weight"] = df.weight_specified.fillna(df.weight_measured)

    if drop_uncomplete:
        # Drop incomplete products, that don't have weights or number of
        # units for all elements: one incomplete row discards every row
        # sharing its code.
        row_incomplete = df.number_of_units.isna() | df.weight.isna()
        incomplete_codes = df.loc[row_incomplete, "code"].unique()
        df = df[~df.code.isin(incomplete_codes)].copy()
    else:
        df = df[df.number_of_units.notna()].copy()

    # We compute the packaging weight per 100g of product
    df["weight_per_100g_of_product"] = df.weight * 100 / df.product_quantity

    # We don't need weight_measured and weight_specified anymore.
    # DataFrame.drop returns a new frame, so the result must be kept.
    df = df.drop(columns=["weight_measured", "weight_specified"])

    # weight_all_units is the combined weight of all units of this element
    df["weight_all_units"] = df.weight * df.number_of_units

    weight_by_product = df.groupby("code")["weight_all_units"]
    # Max over a group of NaN produces NaN, replace by -1 to prevent setting
    # as main element an element with NaN weight
    max_weight = weight_by_product.transform("max").fillna(-1)
    total_weight = weight_by_product.transform("sum")

    # This is used to find the main element of the product (the one with the
    # largest weight)
    df["is_main_component"] = df.weight_all_units == max_weight
    df["percent_total_weight"] = df.weight_all_units * 100 / total_weight
    return df
def reset_plotting_context():
    """Set seaborn's plotting context to "paper" with a font scale of 0.8."""
    sns.set_context(context="paper", font_scale=0.8)
def display_ratio_charts(
    df: pd.DataFrame, group_name: str = "parent_material", display_by_row: bool = False
):
    """Plot the packaging weight per 100g of product, one facet per group value.

    Rows without a ``weight_per_100g_of_product`` value are ignored. The number
    of remaining rows is written to the Streamlit page. Each facet shows a
    swarm plot coloured by ``shape`` with a light violin plot drawn on the
    same axes.

    Args:
        df: dataframe holding at least ``weight_per_100g_of_product``,
            ``shape`` and the ``group_name`` column.
        group_name: column whose values define the facets; facets are ordered
            by decreasing number of rows.
        display_by_row: stack the facets vertically (each with its own x axis)
            instead of laying them out side by side.

    Returns:
        The seaborn ``FacetGrid`` holding the figure.
    """
    ratio_column = "weight_per_100g_of_product"
    ratio_df = df[df[ratio_column].notnull()]
    st.markdown(f"{len(ratio_df)} products.")

    # Most frequent group / shape first.
    group_order = ratio_df[group_name].value_counts().index
    shape_order = ratio_df["shape"].value_counts().index

    if display_by_row:
        facet_kwargs = {"row": group_name, "row_order": group_order, "sharex": False}
    else:
        facet_kwargs = {"col": group_name, "col_order": group_order}

    with sns.plotting_context("paper", font_scale=0.6):
        grid = sns.FacetGrid(
            ratio_df, hue="shape", hue_order=shape_order, **facet_kwargs
        )
        grid.map(sns.swarmplot, ratio_column, size=2.5, alpha=0.7)

        # grid.axes is indexed [row][col]: take the first column when facets
        # are stacked in rows, the first row otherwise.
        if display_by_row:
            facet_axes = [row_axes[0] for row_axes in grid.axes]
        else:
            facet_axes = grid.axes[0]

        for facet_ax, group_value in zip(facet_axes, group_order):
            rows_in_group = ratio_df[ratio_df[group_name] == group_value]
            sns.violinplot(
                data=rows_in_group,
                x=ratio_column,
                color=".9",
                inner=None,
                ax=facet_ax,
            )
            facet_ax.set_xlabel(None)
            facet_ax.xaxis.set_major_formatter(lambda x, pos: f"{x:.2f} g")
        grid.add_legend()
    return grid
def _new_axes():
    """Create a fresh single-axes matplotlib figure and return ``(fig, ax)``."""
    fig = plt.figure()
    return fig, fig.add_subplot(1, 1, 1)


def _show_figure(fig):
    """Render *fig* in the Streamlit page, then close it."""
    st.pyplot(fig=fig, clear_figure=True)
    # pyplot keeps a reference to every figure it created until it is closed;
    # without this, figures pile up each time the script is rerun.
    plt.close(fig)


def _count_main_elements(main_component_df, column):
    """Count main elements per value of *column* and add each value's share in percent."""
    count_df = (
        main_component_df.loc[:, ["code", column]]
        .groupby(column, as_index=False)
        .count()
        .sort_values("code", ascending=False)
        .rename({"code": "count"}, axis="columns")
    )
    # The share is relative to this table's own total, so it sums to 100.
    count_df["percent"] = count_df["count"] * 100 / count_df["count"].sum()
    return count_df


def _show_count_barplot(count_df, column, xlabel, ylabel):
    """Draw a horizontal bar chart of ``percent`` per *column* value, labelled with counts."""
    fig, ax = _new_axes()
    sns.barplot(
        data=count_df, y=column, x="percent", palette="pastel", orient="h", ax=ax
    )
    ax.bar_label(ax.containers[0], labels=count_df["count"], label_type="center")
    ax.set(xlabel=xlabel, ylabel=ylabel)
    _show_figure(fig)


def _show_weight_by_group(data, group_column, xlabel, ylabel, move_legend=False):
    """Draw a violin plot with a swarm plot on top of ``weight`` per *group_column* value."""
    fig, ax = _new_axes()
    sns.violinplot(
        data=data,
        y=group_column,
        x="weight",
        ax=ax,
        inner=None,
        orient="h",
        color="k",
    )
    sns.swarmplot(
        data=data,
        y=group_column,
        x="weight",
        ax=ax,
        palette="colorblind",
        hue="shape",
    )
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if move_legend:
        sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    _show_figure(fig)


def display_charts(df: pd.DataFrame, target_category: str):
    """Render the packaging analysis of one category in the Streamlit page.

    The rows of *df* whose ``categories_tags`` contain *target_category* are
    selected; when there are none, "No items" is written and nothing else is
    drawn. Otherwise the page shows, in order: the distribution of the total
    packaging weight per product, the shape and material breakdown of the main
    elements, their weight distributions (absolute and per 100g of product),
    a plastic-specific section when any main element has ``en:plastic`` as
    parent material, the share of the main element in the total packaging
    weight, the distribution of the number of packaging elements, and finally
    the underlying dataframes inside an expander.

    Args:
        df: dataframe with the columns produced by ``get_base_df`` plus
            ``shape``, ``material`` and ``parent_material``.
        target_category: category ID to look for in ``categories_tags``.
    """
    st.markdown(
        """---
In this analysis, we drop all packaging elements with an unknown number of units."""
    )
    # The category ID is a literal, not a pattern: disable regex matching so
    # that characters such as "." or "+" in an ID cannot change the match.
    in_category = df.categories_tags.str.contains(
        target_category, regex=False, na=False
    )
    filtered_df = df.loc[in_category.values, :]
    if filtered_df.empty:
        st.markdown("No items")
        return

    # The mask comes from the filtered frame itself so that both sides share
    # the same index.
    main_component_df = filtered_df.loc[filtered_df.is_main_component, :]

    st.markdown(
        """The graph below shows the distribution of the total packaging weight,
by summing the weights of all packaging components (the number of units is taken into account)."""
    )
    sns.set_theme()
    reset_plotting_context()
    fig, ax = _new_axes()
    # weight_all_units is weight * number_of_units, which is what the text
    # above describes; summing the per-unit weight would ignore the units.
    weight_sum_df = (
        filtered_df.loc[:, ["code", "weight_all_units"]].groupby("code").sum()
    )
    ax = sns.histplot(weight_sum_df, x="weight_all_units", ax=ax)
    ax.set(
        xlabel=f"Packaging weight (all elements) [n={len(weight_sum_df)}]",
        ylabel="Count",
    )
    _show_figure(fig)

    st.markdown(
        """## Main packaging element
The main element is the packaging element with the largest weight, obtained
by multiplying the element weight by the number of units."""
    )
    st.markdown(f"{len(main_component_df.code.unique())} products.")

    shape_count_df = _count_main_elements(main_component_df, "shape")
    _show_count_barplot(
        shape_count_df,
        "shape",
        xlabel="Percentage of main elements with shape (%)",
        ylabel="Shape",
    )

    material_count_df = _count_main_elements(main_component_df, "parent_material")
    _show_count_barplot(
        material_count_df,
        "parent_material",
        xlabel="Percentage of main elements with material (%)",
        ylabel="Material",
    )

    _show_weight_by_group(
        main_component_df,
        "parent_material",
        xlabel="Main element weight - one unit (g)",
        ylabel="Material",
    )

    st.markdown(
        """### Weight of the main packaging element per 100g of product
We divide the weight of the main packaging element with the product weight
and multiply it by 100, to get the weight per 100 g of product.
We ignore in this analysis products without product weight."""
    )
    g = display_ratio_charts(main_component_df, display_by_row=True)
    _show_figure(g.figure)

    is_plastic = main_component_df.parent_material == "en:plastic"
    if is_plastic.any():
        plastic_df = main_component_df[is_plastic]
        st.markdown(
            """## Plastic-specific analysis
This product category contains main elements with plastic, so we display an analysis of the type of plastic of the main element."""
        )
        _show_weight_by_group(
            plastic_df,
            "material",
            xlabel="Main element weight (plastic only) - one unit (g)",
            ylabel="Plastic material",
            move_legend=True,
        )
        st.markdown("### Weight of the main packaging element per 100g of product")
        g = display_ratio_charts(
            plastic_df, group_name="material", display_by_row=True
        )
        _show_figure(g.figure)

    fig, ax = _new_axes()
    sns.swarmplot(
        data=main_component_df,
        y="parent_material",
        x="percent_total_weight",
        palette="colorblind",
        hue="shape",
        ax=ax,
    )
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    ax.set(xlabel="Percent of total packaging weight (%)", ylabel="Material")
    _show_figure(fig)

    st.markdown("## Number of packaging elements")
    total_num_units_df = (
        filtered_df.loc[:, ["code", "number_of_units"]].groupby("code").sum()
    )
    fig, ax = _new_axes()
    sns.histplot(total_num_units_df, x="number_of_units", ax=ax)
    ax.set(
        xlabel="Distribution of the total number of packaging elements", ylabel="Count"
    )
    _show_figure(fig)

    with st.expander("Show data"):
        st.markdown("Weight sum dataframe")
        st.write(weight_sum_df)
        st.markdown(f"Dataframe: category={target_category}")
        st.write(filtered_df.loc[:, COLUMN_DISPLAY])
        st.markdown("Number of units dataframe")
        st.write(total_num_units_df)
# --- Page script: runs top to bottom each time Streamlit executes the file ---
st.title("Packaging analysis")
st.markdown(
    """You can explore the Open Food Facts packaging data using this demo.
Start by providing a category to analyze."""
)

category_names = get_category_names()
category_ids = list(category_names.keys())

# Preselect yogurts when the taxonomy contains it; fall back to the first
# entry instead of raising ValueError if that ID is ever missing.
default_category_index = (
    category_ids.index("en:yogurts") if "en:yogurts" in category_names else 0
)
category = st.selectbox(
    "Category",
    options=category_ids,
    format_func=lambda x: category_names[x],
    help="Category to analyze",
    index=default_category_index,
)
drop_uncomplete = st.checkbox(
    "Drop uncomplete",
    value=True,
    help="Drop all products that don't have complete packaging "
    "(i.e weight and number of units for all elements)",
)

df = get_base_df(drop_uncomplete=drop_uncomplete)
display_charts(df, category)