Raphaël Bournhonesque
committed on
Commit
•
71990fd
1
Parent(s):
075b7dd
first commit
Browse files- README.md +4 -4
- app.py +346 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
title: Packaging Analysis
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
|
|
1 |
---
|
2 |
title: Packaging Analysis
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.25.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
app.py
ADDED
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
import requests
|
6 |
+
import seaborn as sns
|
7 |
+
import streamlit as st
|
8 |
+
from matplotlib import pyplot as plt
|
9 |
+
|
10 |
+
# Columns shown in the "Show data" table of the per-category dataframe.
# Kept as a list because it is used as a `.loc` column selector.
COLUMN_DISPLAY = [
    # identification and description of the packaging element
    "code", "shape", "number_of_units", "material",
    # weights and main-element flag
    "weight", "is_main_component", "weight_all_units",
    # product quantities
    "quantity_per_unit", "product_quantity",
]
|
21 |
+
|
22 |
+
|
23 |
+
@st.cache_data
def get_category_names():
    """Fetch the category taxonomy and map each category ID to a display name.

    The English name of a category is used when the taxonomy provides one;
    otherwise the category ID itself is used as the name.

    Returns:
        dict: category ID -> display name.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        "https://static.openfoodfacts.org/data/taxonomies/categories.json",
        # Without a timeout, `requests` waits forever on a stalled connection
        timeout=30,
    )
    # Fail with an explicit HTTP error rather than trying to parse an error page
    response.raise_for_status()
    return {
        id_: item["name"].get("en", id_) for id_, item in response.json().items()
    }
|
32 |
+
|
33 |
+
|
34 |
+
@st.cache_data
def get_base_df(drop_uncomplete: bool = True):
    """Download the packaging dataset and derive the columns used by the charts.

    Rows without `categories_tags` are always discarded. The following columns
    are added:

    - `weight`: `weight_specified`, or `weight_measured` where the former is
      missing
    - `weight_per_100g_of_product`: `weight * 100 / product_quantity`
    - `weight_all_units`: `weight * number_of_units`
    - `is_main_component`: True for the element(s) whose `weight_all_units`
      equals the maximum of their product (ties flag several elements)
    - `percent_total_weight`: share of `weight_all_units` in the product total

    `weight_measured` and `weight_specified` are removed from the result.

    Args:
        drop_uncomplete: if True, drop every product (`code`) that has at least
            one element without weight or number of units. If False, only the
            elements without number of units are dropped.

    Returns:
        pd.DataFrame: one row per remaining packaging element.
    """
    df = pd.read_csv(
        "https://world.openfoodfacts.org/data/packagings.packagings-with-weights.csv",
        delimiter="\t",
        dtype={"categories_tags": str, "code": str},
    )
    # Don't keep elements with unknown categories
    df = df.drop(df.categories_tags[df.categories_tags.isnull()].index)

    # Use the weight specified by the producer first and fall back on the
    # user-measured weight otherwise
    df["weight"] = df.weight_specified.fillna(df.weight_measured)

    if drop_uncomplete:
        # Drop incomplete products, that don't have weights or number of units
        # for all elements
        df["missing_data"] = df.number_of_units.isna() | df.weight.isna()
        missing_data_group_by = (
            df.loc[:, ["code", "missing_data"]].groupby("code").any()
        )
        # Broadcast the per-product flag back onto every element of the product
        df["missing_data"] = missing_data_group_by.loc[
            df.code.values, "missing_data"
        ].values
        df = df.drop(df.missing_data[df.missing_data].index).drop(
            "missing_data", axis=1
        )
    else:
        df = df.drop(df.number_of_units[df.number_of_units.isna()].index)

    # We compute the packaging weight per 100g of product
    df["weight_per_100g_of_product"] = df.weight * 100 / df.product_quantity
    # We don't need weight_measured and weight_specified anymore.
    # `DataFrame.drop` returns a new frame, so the result must be reassigned.
    df = df.drop(["weight_measured", "weight_specified"], axis="columns")

    # weight_all_units is the combined weight of all units of this element
    df["weight_all_units"] = df.weight * df.number_of_units
    # This is used to find the main element of the product (the one with the
    # largest weight)
    max_group_by = (
        df.loc[:, ["code", "weight_all_units"]]
        .groupby("code")
        .max()
        # Max over a group of NaN produces NaN, replace by -1 to prevent
        # setting as main element an element with NaN weight
        .fillna(-1)
    )
    max_weight = max_group_by.loc[df.code.values, "weight_all_units"].values
    df["is_main_component"] = df.weight_all_units == max_weight
    sum_group_by = df.loc[:, ["code", "weight_all_units"]].groupby("code").sum()
    df["percent_total_weight"] = (
        df.weight_all_units
        * 100
        / sum_group_by.loc[df.code.values, "weight_all_units"].values
    )
    return df
|
92 |
+
|
93 |
+
|
94 |
+
def reset_plotting_context():
    """Set the seaborn plotting context to "paper" with a 0.8 font scale."""
    sns.set_context(context="paper", font_scale=0.8)
|
96 |
+
|
97 |
+
|
98 |
+
def display_ratio_charts(
    df: pd.DataFrame, group_name: str = "parent_material", display_by_row: bool = False
):
    """Plot the packaging weight per 100 g of product, one facet per group.

    Rows without `weight_per_100g_of_product` are ignored. Each facet holds a
    swarm plot coloured by `shape`, with a light violin plot of the same data
    drawn on the same axes. Facets are ordered by decreasing group size. The
    number of plotted rows is written to the page.

    Args:
        df: dataframe with `weight_per_100g_of_product`, `shape` and
            `group_name` columns.
        group_name: column whose values define the facets.
        display_by_row: stack facets vertically (one per row, independent x
            axes) instead of laying them out as columns.

    Returns:
        The seaborn `FacetGrid`; the caller is responsible for rendering it.
    """
    ratio_column = "weight_per_100g_of_product"
    ratio_df = df[df[ratio_column].notna()]
    st.markdown(f"{len(ratio_df)} products.")

    # Most frequent values first, both for facets and for hue levels
    group_order = ratio_df[group_name].value_counts().index
    shape_order = ratio_df["shape"].value_counts().index

    if display_by_row:
        facet_kwargs = {"row": group_name, "row_order": group_order, "sharex": False}
    else:
        facet_kwargs = {"col": group_name, "col_order": group_order}

    def format_grams(value, pos):
        return f"{value:.2f} g"

    with sns.plotting_context("paper", font_scale=0.6):
        grid = sns.FacetGrid(
            ratio_df, hue="shape", hue_order=shape_order, **facet_kwargs
        )
        grid.map(sns.swarmplot, ratio_column, size=2.5, alpha=0.7)

        # `grid.axes` is 2D: take the single column of each row, or the single row
        if display_by_row:
            facet_axes = [row_axes[0] for row_axes in grid.axes]
        else:
            facet_axes = grid.axes[0]

        for facet_ax, facet_value in zip(facet_axes, group_order):
            facet_rows = ratio_df[ratio_df[group_name] == facet_value]
            sns.violinplot(
                data=facet_rows,
                x=ratio_column,
                color=".9",
                inner=None,
                ax=facet_ax,
            )
            facet_ax.set_xlabel(None)
            facet_ax.xaxis.set_major_formatter(format_grams)
        grid.add_legend()
    return grid
|
130 |
+
|
131 |
+
|
132 |
+
def _count_main_elements_by(main_component_df, column):
    """Count main elements per value of `column` and add their share in percent."""
    count_df = (
        main_component_df.loc[:, ["code", column]]
        .groupby(column, as_index=False)
        .count()
        .sort_values("code", ascending=False)
        .rename({"code": "count"}, axis="columns")
    )
    # The share is relative to this table's own total
    count_df["percent"] = count_df["count"] * 100 / count_df["count"].sum()
    return count_df


def _display_count_bars(count_df, column, xlabel, ylabel):
    """Render a horizontal bar chart of `percent` per `column`, labelled with counts."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.barplot(
        data=count_df, y=column, x="percent", palette="pastel", orient="h", ax=ax
    )
    ax.bar_label(ax.containers[0], labels=count_df["count"], label_type="center")
    ax.set(xlabel=xlabel, ylabel=ylabel)
    st.pyplot(fig=fig, clear_figure=True)


def _display_weight_distribution(data, column, xlabel, ylabel, move_legend=False):
    """Render a violin plot of `weight` per `column` overlaid with a swarm plot by shape."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.violinplot(
        data=data,
        y=column,
        x="weight",
        ax=ax,
        inner=None,
        orient="h",
        color="k",
    )
    sns.swarmplot(
        data=data,
        y=column,
        x="weight",
        ax=ax,
        palette="colorblind",
        hue="shape",
    )
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if move_legend:
        sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    st.pyplot(fig=fig, clear_figure=True)


def display_charts(df, target_category):
    """Render every chart of the packaging analysis for one category.

    Rows of `df` whose `categories_tags` contain `target_category` are kept.
    If none match, "No items" is written and nothing else is rendered.
    Otherwise the page shows, in order: the distribution of the total packaging
    weight per product, the shape and material of the main packaging element,
    its weight (absolute and per 100 g of product), a plastic-specific section
    when a main element has `en:plastic` as parent material, the main
    element's share of total packaging weight, the number of packaging
    elements per product, and the underlying dataframes.

    Args:
        df: dataframe with the columns produced by `get_base_df`.
        target_category: category ID searched for in `categories_tags`.
    """
    st.markdown(
        """---

In this analysis, we drop all packaging elements with an unknown number of units."""
    )
    # The category ID is matched literally, not as a regular expression
    filtered_df = df.loc[
        df.categories_tags.str.contains(target_category, regex=False).values, :
    ]

    if filtered_df.empty:
        st.markdown("No items")
        return

    # Build the mask from the filtered frame so mask and frame share one index
    main_component_df = filtered_df.loc[filtered_df.is_main_component, :]

    st.markdown(
        """The graph below shows the distribution of the total packaging weight,
by summing the weights of all packaging components (the number of units is taken into account)."""
    )

    sns.set_theme()
    reset_plotting_context()

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # Sum `weight_all_units` (weight * number of units), not the per-unit
    # `weight`, so that the number of units really is taken into account
    weight_sum_df = (
        filtered_df.loc[:, ["code", "weight_all_units"]].groupby("code").sum()
    )
    ax = sns.histplot(weight_sum_df, x="weight_all_units", ax=ax)
    ax.set(
        xlabel=f"Packaging weight (all elements) [n={len(weight_sum_df)}]",
        ylabel="Count",
    )
    st.pyplot(fig, clear_figure=True)

    st.markdown(
        """## Main packaging element

The main element is the packaging element with the largest weight, obtained
by multiplying the element weight by the number of units."""
    )
    st.markdown(f"{len(main_component_df.code.unique())} products.")

    shape_count_df = _count_main_elements_by(main_component_df, "shape")
    _display_count_bars(
        shape_count_df,
        "shape",
        xlabel="Percentage of main elements with shape (%)",
        ylabel="Shape",
    )

    material_count_df = _count_main_elements_by(main_component_df, "parent_material")
    _display_count_bars(
        material_count_df,
        "parent_material",
        xlabel="Percentage of main elements with material (%)",
        ylabel="Material",
    )

    _display_weight_distribution(
        main_component_df,
        "parent_material",
        xlabel="Main element weight - one unit (g)",
        ylabel="Material",
    )

    st.markdown(
        """### Weight of the main packaging element per 100g of product

We divide the weight of the main packaging element with the product weight
and multiply it by 100, to get the weight per 100 g of product.

We ignore in this analysis products without product weight."""
    )
    g = display_ratio_charts(main_component_df, display_by_row=True)
    st.pyplot(fig=g.figure, clear_figure=True)

    is_plastic = main_component_df.parent_material == "en:plastic"
    if is_plastic.any():
        plastic_df = main_component_df[is_plastic]
        st.markdown(
            """## Plastic-specific analysis
This product category contains main elements with plastic, so we display an analysis of the type of plastic of the main element."""
        )

        _display_weight_distribution(
            plastic_df,
            "material",
            xlabel="Main element weight (plastic only) - one unit (g)",
            ylabel="Plastic material",
            move_legend=True,
        )

        st.markdown("### Weight of the main packaging element per 100g of product")
        g = display_ratio_charts(plastic_df, group_name="material", display_by_row=True)
        st.pyplot(fig=g.figure, clear_figure=True)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.swarmplot(
        data=main_component_df,
        y="parent_material",
        x="percent_total_weight",
        palette="colorblind",
        hue="shape",
        ax=ax,
    )
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    ax.set(xlabel="Percent of total packaging weight (%)", ylabel="Material")
    st.pyplot(fig=fig, clear_figure=True)

    st.markdown("## Number of packaging elements")
    total_num_units_df = (
        filtered_df.loc[:, ["code", "number_of_units"]].groupby("code").sum()
    )
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.histplot(total_num_units_df, x="number_of_units", ax=ax)
    ax.set(
        xlabel="Distribution of the total number of packaging elements", ylabel="Count"
    )
    st.pyplot(fig=fig, clear_figure=True)

    with st.expander("Show data"):
        st.markdown("Weight sum dataframe")
        st.write(weight_sum_df)
        st.markdown(f"Dataframe: category={target_category}")
        st.write(filtered_df.loc[:, COLUMN_DISPLAY])
        st.markdown("Number of units dataframe")
        st.write(total_num_units_df)
|
321 |
+
|
322 |
+
|
323 |
+
# Page header
st.title("Packaging analysis")
st.markdown(
    """You can explore the Open Food Facts packaging data using this demo.
Start by providing a category to analyze."""
)

# Category picker: options are category IDs, shown with their display names
category_names = get_category_names()
category_ids = list(category_names)

category = st.selectbox(
    label="Category",
    options=category_ids,
    index=category_ids.index("en:yogurts"),
    format_func=category_names.__getitem__,
    help="Category to analyze",
)

# Whether products with incomplete packaging data are excluded from the analysis
drop_uncomplete = st.checkbox(
    label="Drop uncomplete",
    value=True,
    help=(
        "Drop all products that don't have complete packaging "
        "(i.e weight and number of units for all elements)"
    ),
)

df = get_base_df(drop_uncomplete=drop_uncomplete)
display_charts(df, category)
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
requests==2.28.1
|
2 |
+
streamlit==1.25.0
|
3 |
+
pandas==2.0.3
|
4 |
+
numpy==1.25.1
|
5 |
+
matplotlib==3.7.2
|
6 |
+
seaborn==0.12.2
|