Raphaël Bournhonesque committed on
Commit
71990fd
1 Parent(s): 075b7dd

first commit

Browse files
Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +346 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: Packaging Analysis
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: streamlit
7
- sdk_version: 1.26.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
  title: Packaging Analysis
3
+ emoji:
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.25.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import requests
6
+ import seaborn as sns
7
+ import streamlit as st
8
+ from matplotlib import pyplot as plt
9
+
10
# Columns of the per-category dataframe shown in the "Show data" expander.
# Kept as a list: it is passed to `DataFrame.loc` as a column selector.
COLUMN_DISPLAY = [
    # identification of the packaging element
    "code", "shape", "number_of_units", "material",
    # weights and main-element flag
    "weight", "is_main_component", "weight_all_units",
    # product quantities
    "quantity_per_unit", "product_quantity",
]
21
+
22
+
23
@st.cache_data
def get_category_names():
    """Fetch the Open Food Facts category taxonomy and map each ID to a display name.

    Returns:
        A dict mapping every category ID of the taxonomy to its English name,
        or to the ID itself when the entry has no "en" name.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    response = requests.get(
        "https://static.openfoodfacts.org/data/taxonomies/categories.json",
        # Without a timeout a stalled connection would block the app forever
        timeout=30,
    )
    # Surface HTTP failures explicitly instead of as an opaque JSON decoding error
    response.raise_for_status()
    return {
        id_: item["name"].get("en", id_) for id_, item in response.json().items()
    }
32
+
33
+
34
@st.cache_data
def get_base_df(drop_uncomplete: bool = True):
    """Download the Open Food Facts packaging export and derive the analysis columns.

    Rows without `categories_tags` are always discarded. Every remaining row
    (one packaging element) gets these additional columns:

    - `weight`: `weight_specified`, or `weight_measured` when the former is missing
    - `weight_per_100g_of_product`: `weight * 100 / product_quantity`
    - `weight_all_units`: `weight * number_of_units`
    - `is_main_component`: whether the row has the largest `weight_all_units`
      among the rows sharing its `code`
    - `percent_total_weight`: share (in %) of the row's `weight_all_units` in the
      total over the rows sharing its `code`

    Args:
        drop_uncomplete: when true, drop every row of any `code` that has at least
            one row with a missing `number_of_units` or `weight`; when false, only
            drop the rows whose `number_of_units` is missing.

    Returns:
        The prepared DataFrame, without the `weight_measured` and
        `weight_specified` columns.
    """
    df = pd.read_csv(
        "https://world.openfoodfacts.org/data/packagings.packagings-with-weights.csv",
        delimiter="\t",
        dtype={"categories_tags": str, "code": str},
    )
    # Don't keep elements with unknown categories
    df = df.drop(index=df.index[df.categories_tags.isnull()])

    # Use the weight specified by the producer first and fall back on the
    # user-measured weight otherwise
    df["weight"] = df.weight_specified.fillna(df.weight_measured)

    if drop_uncomplete:
        # Drop incomplete products, i.e. products that don't have a weight or a
        # number of units for all of their elements
        row_incomplete = df.number_of_units.isnull() | df.weight.isnull()
        product_incomplete = row_incomplete.groupby(df.code).transform("any")
        df = df.drop(index=df.index[product_incomplete.to_numpy()])
    else:
        df = df.drop(index=df.index[df.number_of_units.isnull()])

    # We compute the packaging weight per 100g of product
    df["weight_per_100g_of_product"] = df.weight * 100 / df.product_quantity
    # We don't need weight_measured and weight_specified anymore.
    # `drop` returns a new frame, so the result has to be assigned back.
    df = df.drop(columns=["weight_measured", "weight_specified"])

    # weight_all_units is the combined weight of all units of this element
    df["weight_all_units"] = df.weight * df.number_of_units
    weight_by_product = df.groupby("code")["weight_all_units"]

    # This is used to find the main element of the product (the one with the
    # largest weight). Max over a group of NaN produces NaN, replace it by -1 to
    # prevent setting as main element an element with NaN weight.
    max_weight = weight_by_product.transform("max").fillna(-1)
    df["is_main_component"] = df.weight_all_units == max_weight

    df["percent_total_weight"] = (
        df.weight_all_units * 100 / weight_by_product.transform("sum")
    )
    return df
92
+
93
+
94
def reset_plotting_context():
    """Set the seaborn plotting context to "paper" with a 0.8 font scale."""
    sns.set_context(context="paper", font_scale=0.8)
96
+
97
+
98
def display_ratio_charts(
    df: pd.DataFrame, group_name: str = "parent_material", display_by_row: bool = False
):
    """Build one facet per `group_name` value showing packaging weight per 100 g.

    Rows without `weight_per_100g_of_product` are ignored, and the number of
    remaining rows is written to the Streamlit page. Each facet holds a swarm
    plot colored by `shape`, overlaid with a light violin plot of the same values.

    Args:
        df: dataframe with `weight_per_100g_of_product`, `shape` and
            `group_name` columns.
        group_name: column whose values define the facets; facets are ordered
            by decreasing frequency of the value.
        display_by_row: stack the facets vertically (each with its own x axis)
            instead of laying them out side by side.

    Returns:
        The seaborn `FacetGrid` holding the charts.
    """
    ratio_column = "weight_per_100g_of_product"
    ratio_df = df[df[ratio_column].notnull()]
    st.markdown(f"{len(ratio_df)} products.")

    # Most frequent values first, both for the facets and for the hue levels
    facet_order = ratio_df[group_name].value_counts().index
    shape_order = ratio_df["shape"].value_counts().index

    if display_by_row:
        facet_kwargs = {"row": group_name, "row_order": facet_order, "sharex": False}
    else:
        facet_kwargs = {"col": group_name, "col_order": facet_order}

    with sns.plotting_context("paper", font_scale=0.6):
        grid = sns.FacetGrid(
            ratio_df, hue="shape", hue_order=shape_order, **facet_kwargs
        )
        grid.map(sns.swarmplot, ratio_column, size=2.5, alpha=0.7)

        # The grid axes are 2D: take the single column (row layout) or single row
        facet_axes = grid.axes[:, 0] if display_by_row else grid.axes[0]
        for facet_ax, facet_value in zip(facet_axes, facet_order):
            facet_rows = ratio_df[ratio_df[group_name] == facet_value]
            sns.violinplot(
                data=facet_rows,
                x=ratio_column,
                color=".9",
                inner=None,
                ax=facet_ax,
            )
            facet_ax.set_xlabel(None)
            facet_ax.xaxis.set_major_formatter(lambda x, pos: f"{x:.2f} g")
        grid.add_legend()
    return grid
130
+
131
+
132
def _count_with_percent(df, column):
    """Count the rows of `df` per value of `column` and add each count's share in %.

    Returns a dataframe with the columns `column`, `count` and `percent`,
    sorted by decreasing count.
    """
    count_df = (
        df.loc[:, ["code", column]]
        .groupby(column, as_index=False)
        .count()
        .sort_values("code", ascending=False)
        .rename({"code": "count"}, axis="columns")
    )
    count_df["percent"] = count_df["count"] * 100 / count_df["count"].sum()
    return count_df


def display_charts(df, target_category):
    """Render all charts and data tables of the analysis for one category.

    Keeps the rows of `df` whose `categories_tags` contains `target_category`
    and writes the following to the Streamlit page: the distribution of the
    total packaging weight per product, the shape / material / weight charts
    of the main packaging element, a plastic-specific section when some main
    elements have `en:plastic` as parent material, the distribution of the
    number of packaging units per product, and the underlying dataframes.
    Displays "No items" and stops when no row matches the category.

    Args:
        df: dataframe as returned by `get_base_df`.
        target_category: category ID to look for in `categories_tags`.
    """
    st.markdown(
        """---

In this analysis, we drop all packaging elements with an unknown number of units."""
    )
    # The category ID is matched literally (regex=False), not as a pattern.
    # NOTE(review): this is still a substring match, so an ID that is a prefix of
    # another ID also selects the latter - confirm whether that is intended.
    category_mask = df.categories_tags.str.contains(target_category, regex=False)
    filtered_df = df.loc[category_mask.values, :]

    if filtered_df.empty:
        st.markdown("No items")
        return

    # Index with the mask of the filtered frame itself rather than relying on
    # the alignment of a mask built from the unfiltered frame
    main_component_df = filtered_df.loc[filtered_df.is_main_component, :]

    st.markdown(
        """The graph below shows the distribution of the total packaging weight,
by summing the weights of all packaging components (the number of units is taken into account)."""
    )

    sns.set_theme()
    reset_plotting_context()

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # Sum weight_all_units (weight * number_of_units) so that the number of
    # units really is taken into account, as stated in the text above
    weight_sum_df = (
        filtered_df.groupby("code")["weight_all_units"]
        .sum()
        .rename("weight")
        .to_frame()
    )
    ax = sns.histplot(weight_sum_df, x="weight", ax=ax)
    ax.set(
        xlabel=f"Packaging weight (all elements) [n={len(weight_sum_df)}]",
        ylabel="Count",
    )
    st.pyplot(fig, clear_figure=True)

    st.markdown(
        """## Main packaging element

The main element is the packaging element with the largest weight, obtained
by multiplying the element weight by the number of units."""
    )
    st.markdown(f"{len(main_component_df.code.unique())} products.")

    shape_count_df = _count_with_percent(main_component_df, "shape")
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.barplot(
        data=shape_count_df, y="shape", x="percent", palette="pastel", orient="h", ax=ax
    )
    ax.bar_label(ax.containers[0], labels=shape_count_df["count"], label_type="center")
    ax.set(xlabel="Percentage of main elements with shape (%)", ylabel="Shape")
    st.pyplot(fig=fig, clear_figure=True)

    # Percentages are relative to the material counts themselves (and not to
    # the shape counts, whose total differs when shapes or materials are missing)
    material_count_df = _count_with_percent(main_component_df, "parent_material")
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.barplot(
        data=material_count_df,
        ax=ax,
        y="parent_material",
        x="percent",
        palette="pastel",
        orient="h",
    )
    ax.bar_label(
        ax.containers[0], labels=material_count_df["count"], label_type="center"
    )
    ax.set(xlabel="Percentage of main elements with material (%)", ylabel="Material")
    st.pyplot(fig=fig, clear_figure=True)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.violinplot(
        data=main_component_df,
        y="parent_material",
        x="weight",
        ax=ax,
        inner=None,
        orient="h",
        color="k",
    )
    sns.swarmplot(
        data=main_component_df,
        y="parent_material",
        x="weight",
        ax=ax,
        palette="colorblind",
        hue="shape",
    )
    ax.set_xlabel("Main element weight - one unit (g)")
    ax.set_ylabel("Material")
    st.pyplot(fig=fig, clear_figure=True)

    st.markdown(
        """### Weight of the main packaging element per 100g of product

We divide the weight of the main packaging element with the product weight
and multiply it by 100, to get the weight per 100 g of product.

We ignore in this analysis products without product weight."""
    )
    g = display_ratio_charts(main_component_df, display_by_row=True)
    st.pyplot(fig=g.figure, clear_figure=True)

    is_plastic = main_component_df.parent_material == "en:plastic"
    if is_plastic.any():
        plastic_df = main_component_df[is_plastic]
        st.markdown(
            """## Plastic-specific analysis
This product category contains main elements with plastic, so we display an analysis of the type of plastic of the main element."""
        )

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        sns.violinplot(
            data=plastic_df,
            y="material",
            x="weight",
            ax=ax,
            inner=None,
            orient="h",
            color="k",
        )
        sns.swarmplot(
            data=plastic_df,
            y="material",
            x="weight",
            ax=ax,
            palette="colorblind",
            hue="shape",
        )
        ax.set_xlabel("Main element weight (plastic only) - one unit (g)")
        ax.set_ylabel("Plastic material")
        sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
        st.pyplot(fig=fig, clear_figure=True)

        st.markdown("### Weight of the main packaging element per 100g of product")
        g = display_ratio_charts(plastic_df, group_name="material", display_by_row=True)
        st.pyplot(fig=g.figure, clear_figure=True)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.swarmplot(
        data=main_component_df,
        y="parent_material",
        x="percent_total_weight",
        palette="colorblind",
        hue="shape",
        ax=ax,
    )
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    ax.set(xlabel="Percent of total packaging weight (%)", ylabel="Material")
    st.pyplot(fig=fig, clear_figure=True)

    st.markdown("## Number of packaging elements")
    total_num_units_df = (
        filtered_df.loc[:, ["code", "number_of_units"]].groupby("code").sum()
    )
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    sns.histplot(total_num_units_df, x="number_of_units", ax=ax)
    ax.set(
        xlabel="Distribution of the total number of packaging elements", ylabel="Count"
    )
    st.pyplot(fig=fig, clear_figure=True)

    with st.expander("Show data"):
        st.markdown("Weight sum dataframe")
        st.write(weight_sum_df)
        st.markdown(f"Dataframe: category={target_category}")
        st.write(filtered_df.loc[:, COLUMN_DISPLAY])
        st.markdown("Number of units dataframe")
        st.write(total_num_units_df)
321
+
322
+
323
# Page entry point: Streamlit re-runs these statements top to bottom on each interaction.
st.title("Packaging analysis")
st.markdown(
    """You can explore the Open Food Facts packaging data using this demo.
Start by providing a category to analyze."""
)

category_names = get_category_names()
category_ids = list(category_names.keys())

# Preselect yogurts, falling back to the first category instead of crashing
# with a ValueError if that ID is missing from the fetched taxonomy
default_category_index = (
    category_ids.index("en:yogurts") if "en:yogurts" in category_names else 0
)

category = st.selectbox(
    "Category",
    options=category_ids,
    format_func=lambda x: category_names[x],
    help="Category to analyze",
    index=default_category_index,
)
drop_uncomplete = st.checkbox(
    "Drop uncomplete",
    value=True,
    help="Drop all products that don't have complete packaging "
    "(i.e weight and number of units for all elements)",
)
df = get_base_df(drop_uncomplete=drop_uncomplete)
display_charts(df, category)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ requests==2.28.1
2
+ streamlit==1.25.0
3
+ pandas==2.0.3
4
+ numpy==1.25.1
5
+ matplotlib==3.7.2
6
+ seaborn==0.12.2