Spaces:

openfoodfacts
/

packaging-analysis

Running

App Files Files Community

Raphaël Bournhonesque committed on Sep 4, 2023

Commit

71990fd

•

1 Parent(s): 075b7dd

first commit

Browse files

Files changed (3) hide show

README.md +4 -4
app.py +346 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
 title: Packaging Analysis
-emoji: 🚀
-colorFrom: green
-colorTo: indigo
 sdk: streamlit
-sdk_version: 1.26.0
 app_file: app.py
 pinned: false
 ---

 ---
 title: Packaging Analysis
+emoji: ⚡
+colorFrom: yellow
+colorTo: green
 sdk: streamlit
+sdk_version: 1.25.0
 app_file: app.py
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,346 @@

+import math
+import numpy as np
+import pandas as pd
+import requests
+import seaborn as sns
+import streamlit as st
+from matplotlib import pyplot as plt
+# Columns selected from the filtered dataframe when it is written out in the
+# "Show data" expander of display_charts. Must stay a list: it is used as a
+# column selector in `filtered_df.loc[:, COLUMN_DISPLAY]`.
+COLUMN_DISPLAY = [
+    "code",
+    "shape",
+    "number_of_units",
+    "material",
+    "weight",
+    "is_main_component",
+    "weight_all_units",
+    "quantity_per_unit",
+    "product_quantity",
+]
+@st.cache_data
+def get_category_names():
+    category_names = {}
+    items = requests.get(
+        "https://static.openfoodfacts.org/data/taxonomies/categories.json"
+    ).json()
+    for id_, item in items.items():
+        category_names[id_] = item["name"].get("en", id_)
+    return category_names
+@st.cache_data
+def get_base_df(drop_uncomplete: bool = True):
+    df = pd.read_csv(
+        "https://world.openfoodfacts.org/data/packagings.packagings-with-weights.csv",
+        delimiter="\t",
+        dtype={"categories_tags": str, "code": str},
+    )
+    # Don't keep elements with unknown categories
+    df = df.drop(df.categories_tags[df.categories_tags.isnull()].index)
+    # Fetch weight specified by the producer first and fallback on user measured weight otherwise
+    df["weight"] = [
+        (weight_specified if not math.isnan(weight_specified) else weight_measured)
+        for (weight_specified, weight_measured) in zip(
+            df.weight_specified, df.weight_measured
+        )
+    ]
+    if drop_uncomplete:
+        # Drop incomplete products, that don't have weights or number of units for all elements
+        df["missing_data"] = np.isnan(df.number_of_units) | np.isnan(df.weight)
+        missing_data_group_by = (
+            df.loc[:, ["code", "missing_data"]].groupby("code").any()
+        )
+        df["missing_data"] = missing_data_group_by.loc[
+            df.code.values, "missing_data"
+        ].values
+        df = df.drop(df.missing_data[df.missing_data].index).drop(
+            "missing_data", axis=1
+        )
+    else:
+        df = df.drop(df.number_of_units[np.isnan(df.number_of_units)].index)
+    # We compute the packaging weight per 100g of product
+    df["weight_per_100g_of_product"] = df.weight * 100 / df.product_quantity
+    # We don't need weight_measured and weight_specified anymore
+    df.drop(["weight_measured", "weight_specified"], axis="columns")
+    # weight_all_units is the combined weight of all units of this element
+    df["weight_all_units"] = df.weight * df.number_of_units
+    # This is used to find the main element of the product (the one with the largest weight)
+    max_group_by = (
+        df.loc[:, ["code", "weight_all_units"]]
+        .groupby("code")
+        .max()
+        # Max over a a group of NaN produces NaN, replace by -1 to prevent setting
+        # as main element an element with NaN weight
+        .fillna(-1)
+    )
+    max_weight = max_group_by.loc[df.code.values, "weight_all_units"].values
+    df["is_main_component"] = df.weight_all_units == max_weight
+    sum_group_by = df.loc[:, ["code", "weight_all_units"]].groupby("code").sum()
+    df["percent_total_weight"] = (
+        df.weight_all_units
+        * 100
+        / sum_group_by.loc[df.code.values, "weight_all_units"].values
+    )
+    return df
+def reset_plotting_context():
+    sns.set_context("paper", font_scale=0.8)
+def display_ratio_charts(
+    df: pd.DataFrame, group_name: str = "parent_material", display_by_row: bool = False
+):
+    df_with_weight_ratio = df[~df.weight_per_100g_of_product.isnull()]
+    st.markdown(f"{len(df_with_weight_ratio)} products.")
+    order = df_with_weight_ratio[group_name].value_counts().index
+    hue_order = df_with_weight_ratio["shape"].value_counts().index
+    kwargs = (
+        {"row": group_name, "row_order": order, "sharex": False}
+        if display_by_row
+        else {"col": group_name, "col_order": order}
+    )
+    with sns.plotting_context("paper", font_scale=0.6):
+        g = sns.FacetGrid(
+            df_with_weight_ratio, hue="shape", hue_order=hue_order, **kwargs
+        )
+        g.map(sns.swarmplot, "weight_per_100g_of_product", size=2.5, alpha=0.7)
+        axes = [ax[0] for ax in g.axes] if display_by_row else g.axes[0]
+        for ax, plot_name in zip(axes, order):
+            sns.violinplot(
+                data=df_with_weight_ratio[
+                    df_with_weight_ratio[group_name] == plot_name
+                ],
+                x="weight_per_100g_of_product",
+                color=".9",
+                inner=None,
+                ax=ax,
+            )
+            ax.set_xlabel(None)
+            ax.xaxis.set_major_formatter(lambda x, pos: f"{x:.2f} g")
+        g.add_legend()
+    return g
+def display_charts(df, target_category):
+    st.markdown(
+        """---
+In this analysis, we drop all packaging elements with an unknown number of units."""
+    )
+    filtered_df = df.loc[df.categories_tags.str.contains(target_category).values, :]
+    if len(filtered_df) == 0:
+        st.markdown("No items")
+        return
+    main_component_df = filtered_df.loc[df.is_main_component, :]
+    st.markdown(
+        """The graph below shows the distribution of the total packaging weight,
+by summing the weights of all packaging components (the number of units is taken into account)."""
+    )
+    sns.set_theme()
+    reset_plotting_context()
+    fig = plt.figure()
+    ax = fig.add_subplot(1, 1, 1)
+    weight_sum_df = filtered_df.loc[:, ["code", "weight"]].groupby("code").sum()
+    ax = sns.histplot(weight_sum_df, x="weight", ax=ax)
+    ax.set(
+        xlabel=f"Packaging weight (all elements) [n={len(weight_sum_df)}]",
+        ylabel="Count",
+    )
+    st.pyplot(fig, clear_figure=True)
+    st.markdown(
+        """## Main packaging element
+The main element is the packaging element with the largest weight, obtained
+by multiplying the element weight by the number of units."""
+    )
+    st.markdown(f"{len(main_component_df.code.unique())} products.")
+    shape_count_df = (
+        main_component_df.loc[:, ["code", "shape"]]
+        .groupby("shape", as_index=False)
+        .count()
+        .sort_values("code", ascending=False)
+        .rename({"code": "count"}, axis="columns")
+    )
+    shape_count_df["percent"] = (
+        shape_count_df["count"] * 100 / shape_count_df["count"].sum()
+    )
+    fig = plt.figure()
+    ax = fig.add_subplot(1, 1, 1)
+    sns.barplot(
+        data=shape_count_df, y="shape", x="percent", palette="pastel", orient="h", ax=ax
+    )
+    ax.bar_label(ax.containers[0], labels=shape_count_df["count"], label_type="center")
+    ax.set(xlabel="Percentage of main elements with shape (%)", ylabel="Shape")
+    st.pyplot(fig=fig, clear_figure=True)
+    material_count_df = (
+        main_component_df.loc[:, ["code", "parent_material"]]
+        .groupby("parent_material", as_index=False)
+        .count()
+        .sort_values("code", ascending=False)
+        .rename({"code": "count"}, axis="columns")
+    )
+    material_count_df["percent"] = (
+        material_count_df["count"] * 100 / shape_count_df["count"].sum()
+    )
+    fig = plt.figure()
+    ax = fig.add_subplot(1, 1, 1)
+    sns.barplot(
+        data=material_count_df,
+        ax=ax,
+        y="parent_material",
+        x="percent",
+        palette="pastel",
+        orient="h",
+    )
+    ax.bar_label(
+        ax.containers[0], labels=material_count_df["count"], label_type="center"
+    )
+    ax.set(xlabel="Percentage of main elements with material (%)", ylabel="Material")
+    st.pyplot(fig=fig, clear_figure=True)
+    fig = plt.figure()
+    ax = fig.add_subplot(1, 1, 1)
+    sns.violinplot(
+        data=main_component_df,
+        y="parent_material",
+        x="weight",
+        ax=ax,
+        inner=None,
+        orient="h",
+        color="k",
+    )
+    sns.swarmplot(
+        data=main_component_df,
+        y="parent_material",
+        x="weight",
+        ax=ax,
+        palette="colorblind",
+        hue="shape",
+    )
+    ax.set_xlabel("Main element weight - one unit (g)")
+    ax.set_ylabel("Material")
+    st.pyplot(fig=fig, clear_figure=True)
+    st.markdown(
+        """### Weight of the main packaging element per 100g of product
+We divide the weight of the main packaging element with the product weight
+and multiply it by 100, to get the weight per 100 g of product.
+We ignore in this analysis products without product weight."""
+    )
+    g = display_ratio_charts(main_component_df, display_by_row=True)
+    st.pyplot(fig=g.figure, clear_figure=True)
+    if (main_component_df.parent_material == "en:plastic").any():
+        plastic_df = main_component_df[
+            main_component_df.parent_material == "en:plastic"
+        ]
+        st.markdown(
+            """## Plastic-specific analysis
+This product category contains main elements with plastic, so we display an analysis of the type of plastic of the main element."""
+        )
+        fig = plt.figure()
+        ax = fig.add_subplot(1, 1, 1)
+        sns.violinplot(
+            data=plastic_df,
+            y="material",
+            x="weight",
+            ax=ax,
+            inner=None,
+            orient="h",
+            color="k",
+        )
+        sns.swarmplot(
+            data=plastic_df,
+            y="material",
+            x="weight",
+            ax=ax,
+            palette="colorblind",
+            hue="shape",
+        )
+        ax.set_xlabel("Main element weight (plastic only) - one unit (g)")
+        ax.set_ylabel("Plastic material")
+        sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
+        st.pyplot(fig=fig, clear_figure=True)
+        st.markdown("### Weight of the main packaging element per 100g of product")
+        g = display_ratio_charts(plastic_df, group_name="material", display_by_row=True)
+        st.pyplot(fig=g.figure, clear_figure=True)
+    fig = plt.figure()
+    ax = fig.add_subplot(1, 1, 1)
+    sns.swarmplot(
+        data=main_component_df,
+        y="parent_material",
+        x="percent_total_weight",
+        palette="colorblind",
+        hue="shape",
+        ax=ax,
+    )
+    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
+    ax.set(xlabel="Percent of total packaging weight (%)", ylabel="Material")
+    st.pyplot(fig=fig, clear_figure=True)
+    st.markdown("## Number of packaging elements")
+    total_num_units_df = (
+        filtered_df.loc[:, ["code", "number_of_units"]].groupby("code").sum()
+    )
+    fig = plt.figure()
+    ax = fig.add_subplot(1, 1, 1)
+    sns.histplot(total_num_units_df, x="number_of_units", ax=ax)
+    ax.set(
+        xlabel="Distribution of the total number of packaging elements", ylabel="Count"
+    )
+    st.pyplot(fig=fig, clear_figure=True)
+    with st.expander("Show data"):
+        st.markdown("Weight sum dataframe")
+        st.write(weight_sum_df)
+        st.markdown(f"Dataframe: category={target_category}")
+        st.write(filtered_df.loc[:, COLUMN_DISPLAY])
+        st.markdown("Number of units dataframe")
+        st.write(total_num_units_df)
+st.title("Packaging analysis")
+st.markdown(
+    """You can explore the Open Food Facts packaging data using this demo.
+Start by providing a category to analyze."""
+)
+category_names = get_category_names()
+category_ids = list(category_names.keys())
+category = st.selectbox(
+    "Category",
+    options=category_ids,
+    format_func=lambda x: category_names[x],
+    help="Category to analyze",
+    index=category_ids.index("en:yogurts"),
+)
+drop_uncomplete = st.checkbox(
+    "Drop uncomplete",
+    value=True,
+    help="Drop all products that don't have complete packaging "
+    "(i.e weight and number of units for all elements)",
+)
+df = get_base_df(drop_uncomplete=drop_uncomplete)
+display_charts(df, category)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+requests==2.28.1
+streamlit==1.25.0
+pandas==2.0.3
+numpy==1.25.1
+matplotlib==3.7.2
+seaborn==0.12.2