Marylene committed on
Commit
7c6ee39
·
verified ·
1 Parent(s): 7622cd6

Try Space push

Browse files
Files changed (5) hide show
  1. .gitignore +11 -0
  2. README.md +8 -12
  3. app.py +5 -0
  4. quick_deploy_agent.py +322 -0
  5. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.ipynb_checkpoints/
4
+ .cache/
5
+ data/
6
+ models/
7
+ outputs/
8
+ node_modules/
9
+ *.pt
10
+ *.bin
11
+ *.ckpt
README.md CHANGED
@@ -1,12 +1,8 @@
1
- ---
2
- title: OpenFoodFactsAgent POC
3
- emoji: 💻
4
- colorFrom: gray
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.46.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: OpenFoodFactsAgent (COICOP)
3
+ emoji: 🧀
4
+ sdk: gradio
5
+ app_file: app.py
6
+ python_version: "3.10"
7
+ pinned: false
8
+ ---
 
 
 
 
app.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from smolagents import GradioUI
2
+ from quick_deploy_agent import build_agent
3
+
4
+ agent = build_agent()
5
+ demo = GradioUI(agent).create_app()
quick_deploy_agent.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # quickstart_agent.py
2
+ from __future__ import annotations
3
+ import json, re, unicodedata, ast
4
+ from typing import List, Dict, Any, Optional
5
+ import requests
6
+ from smolagents import Tool, CodeAgent, InferenceClientModel
7
+ from sentence_transformers import SentenceTransformer, util
8
+
9
+ # ---- Mini référentiel COICOP (démo) ----
10
+ COICOP_ITEMS = [
11
+ {"code": "01.1.4.5.1", "label": "Laits caillés, fromage blanc, petites crèmes fromagères"},
12
+ {"code": "01.1.4.5.2", "label": "Fromage à pâte molle et à pâte persillée"},
13
+ {"code": "01.1.4.5.3", "label": "Fromage à pâte pressée"},
14
+ {"code": "01.1.4.5.4", "label": "Fromage de chèvre"},
15
+ {"code": "01.1.4.5.5", "label": "Fromages fondus, râpés, portions"},
16
+ {"code": "01.1.1.4", "label": "Pain"},
17
+ {"code": "01.1.1.1", "label": "Riz"},
18
+ {"code": "01.1.1.3", "label": "Pâtes, couscous et produits similaires"},
19
+ ]
20
+
21
+ def normalize_txt(s: str) -> str:
22
+ if not s: return ""
23
+ s = s.upper()
24
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
25
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
26
+ s = re.sub(r"\s+", " ", s).strip()
27
+ return s
28
+
29
+ def ean_check_digit_ok(ean: str) -> bool:
30
+ digits = re.sub(r"\D", "", ean)
31
+ if len(digits) not in (8, 12, 13, 14): return False
32
+ total = 0
33
+ for i, ch in enumerate(reversed(digits[:-1]), start=1):
34
+ n = int(ch); total += n * (3 if i % 2 == 1 else 1)
35
+ check = (10 - (total % 10)) % 10
36
+ return check == int(digits[-1])
37
+
38
+ # ---- ValidateEANTool : tout en local dans forward ----
39
+ class ValidateEANTool(Tool):
40
+ name, description = "validate_ean", "Valide un EAN/GTIN (clé GS1)."
41
+ inputs = {"ean": {"type": "string", "description": "Code EAN/GTIN (8/12/13/14 chiffres)."}}
42
+ output_type = "string"
43
+
44
+ def forward(self, ean: str) -> str:
45
+ import json, re
46
+ digits = re.sub(r"\D", "", ean or "")
47
+ if len(digits) not in (8, 12, 13, 14):
48
+ return json.dumps({"valid": False, "normalized": digits})
49
+ total = 0
50
+ for i, ch in enumerate(reversed(digits[:-1]), start=1):
51
+ n = int(ch); total += n * (3 if i % 2 == 1 else 1)
52
+ check = (10 - (total % 10)) % 10
53
+ return json.dumps({"valid": check == int(digits[-1]), "normalized": digits})
54
+
55
+
56
+ # ---- OFFByEAN : imports internes + requirements ----
57
+ # ---- OFFByEAN : robuste (retries + v2 + fallback .net) ----
58
+ class OFFByEAN(Tool):
59
+ name = "openfoodfacts_product_by_ean"
60
+ description = "Open Food Facts /api/v0|v2/product/{ean} (name, brands, categories...)."
61
+ inputs = {"ean": {"type": "string", "description": "EAN à interroger sur l'API OFF."}}
62
+ output_type = "string"
63
+ requirements = ["requests"] # urllib3 est une dépendance de requests
64
+
65
+ def forward(self, ean: str) -> str:
66
+ import json, re, requests
67
+ from requests.adapters import HTTPAdapter
68
+ from urllib3.util.retry import Retry
69
+
70
+ code = re.sub(r"\D", "", ean or "")
71
+ if not code:
72
+ return json.dumps({"status": 0, "code": "", "error": "EAN vide"})
73
+
74
+ # Session HTTP avec retries (429/5xx) et UA explicite
75
+ sess = requests.Session()
76
+ sess.headers.update({
77
+ "User-Agent": "insee-coicop-agent/1.0",
78
+ "Accept": "application/json",
79
+ })
80
+ retry = Retry(
81
+ total=3,
82
+ backoff_factor=0.5,
83
+ status_forcelist=[429, 500, 502, 503, 504],
84
+ allowed_methods=frozenset(["GET"]),
85
+ raise_on_status=False,
86
+ )
87
+ sess.mount("https://", HTTPAdapter(max_retries=retry))
88
+
89
+ # On tente v0, puis v2 (fields limités), puis le miroir .net
90
+ urls = [
91
+ f"https://world.openfoodfacts.org/api/v0/product/{code}.json",
92
+ f"https://world.openfoodfacts.org/api/v2/product/{code}"
93
+ "?fields=product_name_fr,product_name,brands,categories_tags,"
94
+ "ingredients_text_fr,ingredients_text,stores,status,status_verbose",
95
+ f"https://world.openfoodfacts.net/api/v0/product/{code}.json",
96
+ ]
97
+
98
+ last_err = None
99
+ for u in urls:
100
+ try:
101
+ r = sess.get(u, timeout=15)
102
+ if not r.ok:
103
+ last_err = f"HTTP {r.status_code}"
104
+ continue
105
+ data = r.json()
106
+ # v0 : status==1 ; v2 : parfois status absent mais product présent
107
+ product = data.get("product")
108
+ status = data.get("status", 1 if product else 0)
109
+ if status == 1 or product:
110
+ p = product or {}
111
+ out = {
112
+ "status": status,
113
+ "code": code,
114
+ "product_name": p.get("product_name_fr") or p.get("product_name"),
115
+ "brands": p.get("brands"),
116
+ "categories_tags": (
117
+ p.get("categories_tags")
118
+ or p.get("categories_tags_fr")
119
+ or p.get("categories")
120
+ ),
121
+ "ingredients_text": p.get("ingredients_text_fr") or p.get("ingredients_text"),
122
+ "stores": p.get("stores"),
123
+ }
124
+ return json.dumps(out)
125
+ except Exception as e:
126
+ last_err = str(e)
127
+
128
+ return json.dumps({"status": 0, "code": code, "error": last_err or "not found"})
129
+
130
+
131
+
132
+
133
+ # ---- RegexCOICOP : normalisation locale + regex précompilées ----
134
+ class RegexCOICOP(Tool):
135
+ name, description = "coicop_regex_rules", "Règles regex → candidats COICOP."
136
+ inputs = {"text": {"type": "string", "description": "Libellé produit (texte libre) à analyser."}}
137
+ output_type = "string"
138
+
139
+ # précompile ici pour rester autonome
140
+ import re as _re
141
+ SOFT = _re.compile(r"(?:\b|^)(?:CAMEMB(?:ERT)?|BRIE|COULOMMI(?:ERS?)?|BLEU|ROQUEFORT|GORGONZ(?:OLA)?|REBLOCHON|MUNSTER)(?:\b|$)")
142
+ PRESS = _re.compile(r"(?:\b|^)(EMMENTAL|COMTE|CANTAL|MIMOLETTE|GOUDA|EDAM|BEAUFORT|ABONDANCE|SALERS|TOMME|TOME)(?:\b|$)")
143
+ GOAT = _re.compile(r"(?:\b|^)(CHEVRE|STE MAURE|CROTTIN|BUCHE|PICODON|PELARDON|BANON)(?:\b|$)")
144
+ PROC = _re.compile(r"(?:\b|^)(FONDU(?:ES?)?|FROMAGE FONDU|TOASTINETTES?|VACHE QUI RIT|KIRI|CARRE FRAIS|CARR[ÉE] FRAIS|PORTIONS?)(?:\b|$)|\bRAP[ÉE]?\b")
145
+
146
+ @staticmethod
147
+ def _normalize_txt(s: str) -> str:
148
+ import unicodedata, re
149
+ if not s: return ""
150
+ s = s.upper()
151
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
152
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
153
+ return re.sub(r"\s+", " ", s).strip()
154
+
155
+ def forward(self, text: str) -> str:
156
+ import json, re
157
+ s = self._normalize_txt(text); c=[]
158
+ if self.SOFT.search(s): c.append({"code":"01.1.4.5.2","why":"pâte molle/persillée","score":0.95})
159
+ if self.PRESS.search(s): c.append({"code":"01.1.4.5.3","why":"pâte pressée","score":0.90})
160
+ if self.GOAT.search(s): c.append({"code":"01.1.4.5.4","why":"chèvre","score":0.90})
161
+ if self.PROC.search(s): c.append({"code":"01.1.4.5.5","why":"fondu/râpé/portions","score":0.85})
162
+ if not c and re.search(r"\bFROMAGE\b", s): c.append({"code":"01.1.4.5","why":"générique fromage/laits caillés","score":0.6})
163
+ return json.dumps({"candidates": c})
164
+
165
+
166
+ # ---- OFFtoCOICOP : normalisation locale + regex règles ----
167
+ class OFFtoCOICOP(Tool):
168
+ name, description = "map_off_to_coicop", "Mappe catégories OFF vers COICOP."
169
+ inputs = {
170
+ "product_name": {"type":"string", "description":"Nom produit OFF (fr/en).", "nullable": True},
171
+ "categories_tags": {"type":"array", "description":"Liste OFF categories_tags.", "nullable": True},
172
+ "ingredients_text":{"type":"string","description":"Texte ingrédients.", "nullable": True},
173
+ }
174
+ output_type="string"
175
+ import re as _re
176
+ RULES = [
177
+ (_re.compile(r"\b(CAMEMBERT|BRIE|COULOMMIERS|BLUE CHEESE|ROQUEFORT|GORGONZOLA|MUNSTER|REBLOCHON)\b"), ("01.1.4.5.2",0.95,"OFF: pâte molle/persillée")),
178
+ (_re.compile(r"\b(EMMENTAL|COMTE|CANTAL|MIMOLETTE|GOUDA|EDAM|BEAUFORT|ABONDANCE|SALERS|TOMME|TOME)\b"), ("01.1.4.5.3",0.90,"OFF: pâte pressée")),
179
+ (_re.compile(r"\b(CHEVRE|STE MAURE|CROTTIN|BUCHE|PICODON|PELARDON|BANON)\b"), ("01.1.4.5.4",0.90,"OFF: chèvre")),
180
+ (_re.compile(r"\b(FONDU|FONDUES?|RAPE|RÂPE|PORTIONS?|KIRI|VACHE QUI RIT|CARRE FRAIS|CARR[ÉE] FRAIS)\b"), ("01.1.4.5.5",0.85,"OFF: fondu/rapé/portions")),
181
+ ]
182
+
183
+ @staticmethod
184
+ def _normalize_txt(s: str) -> str:
185
+ import unicodedata, re
186
+ if not s: return ""
187
+ s = s.upper()
188
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
189
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
190
+ return re.sub(r"\s+", " ", s).strip()
191
+
192
+ def forward(self, product_name=None, categories_tags=None, ingredients_text=None) -> str:
193
+ import json
194
+ text = " ".join([t for t in [
195
+ self._normalize_txt(product_name or ""),
196
+ self._normalize_txt(" ".join(categories_tags or [])),
197
+ self._normalize_txt(ingredients_text or "")
198
+ ] if t])
199
+ c=[]
200
+ for rx,(code,score,why) in self.RULES:
201
+ if rx.search(text): c.append({"code":code,"why":why,"score":score})
202
+ return json.dumps({"candidates": c})
203
+
204
+
205
+ # ---- SemSim : COICOP embarqué + import lazy du modèle ----
206
+ class SemSim(Tool):
207
+ name, description = "coicop_semantic_similarity", "Embeddings → top-k COICOP."
208
+ inputs = {"text":{"type":"string","description":"Texte libellé"},
209
+ "topk":{"type":"integer","description":"Nombre de candidats (défaut 5)","nullable":True}}
210
+ output_type = "string"
211
+ # packages nécessaires côté Hub
212
+ requirements = ["sentence_transformers", "torch"]
213
+
214
+ # mini référentiel embarqué pour l'export Hub
215
+ COICOP_ITEMS = [
216
+ {"code": "01.1.4.5.1", "label": "Laits caillés, fromage blanc, petites crèmes fromagères"},
217
+ {"code": "01.1.4.5.2", "label": "Fromage à pâte molle et à pâte persillée"},
218
+ {"code": "01.1.4.5.3", "label": "Fromage à pâte pressée"},
219
+ {"code": "01.1.4.5.4", "label": "Fromage de chèvre"},
220
+ {"code": "01.1.4.5.5", "label": "Fromages fondus, râpés, portions"},
221
+ {"code": "01.1.1.4", "label": "Pain"},
222
+ {"code": "01.1.1.1", "label": "Riz"},
223
+ {"code": "01.1.1.3", "label": "Pâtes, couscous et produits similaires"},
224
+ ]
225
+
226
+ @staticmethod
227
+ def _normalize_txt(s: str) -> str:
228
+ import unicodedata, re
229
+ if not s: return ""
230
+ s = s.upper()
231
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
232
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
233
+ return re.sub(r"\s+", " ", s).strip()
234
+
235
+ def forward(self, text: str, topk: int = 5) -> str:
236
+ import json
237
+ from sentence_transformers import SentenceTransformer, util
238
+ # lazy init pour la compat hub
239
+ if not hasattr(self, "_model"):
240
+ self._model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
241
+ q = self._normalize_txt(text)
242
+ q_emb = self._model.encode([q], normalize_embeddings=True)
243
+ labels = [f"{it['code']} {it['label']}" for it in self.COICOP_ITEMS]
244
+ L = self._model.encode(labels, normalize_embeddings=True)
245
+ sims = util.cos_sim(q_emb, L).tolist()[0]
246
+ ranked = sorted(
247
+ [{"code": self.COICOP_ITEMS[i]["code"], "label": self.COICOP_ITEMS[i]["label"], "score": float(sims[i])}
248
+ for i in range(len(self.COICOP_ITEMS))],
249
+ key=lambda x: x["score"], reverse=True
250
+ )
251
+ return json.dumps({"candidates": ranked[:max(1,int(topk))]})
252
+
253
+
254
+ # ---- Resolve : import local json ----
255
+ class Resolve(Tool):
256
+ name, description = "resolve_coicop_candidates", "Fusionne candidats → choix final + alternatives + explication."
257
+ inputs = {"json_lists": {"type":"array","description":"Liste de JSON (str) d'autres tools."},
258
+ "topn":{"type":"integer","description":"Nb d'alternatives (défaut 3)","nullable":True}}
259
+ output_type = "string"
260
+
261
+ def forward(self, json_lists, topn: int = 3) -> str:
262
+ import json
263
+ from typing import Dict, Any
264
+ bucket: Dict[str, Dict[str, Any]] = {}
265
+ for s in json_lists:
266
+ data = json.loads(s) if s else {}
267
+ for c in data.get("candidates", []):
268
+ code = c["code"]; score = float(c.get("score", 0.0))
269
+ why = c.get("why", "") or c.get("label", "")
270
+ if code not in bucket:
271
+ bucket[code] = {"code":code,"score":score,"votes":1,"evidences":[why] if why else []}
272
+ else:
273
+ bucket[code]["score"] = max(bucket[code]["score"], score)
274
+ bucket[code]["votes"] += 1
275
+ if why: bucket[code]["evidences"].append(why)
276
+ for v in bucket.values():
277
+ v["score_final"] = v["score"] + 0.05*(v["votes"]-1)
278
+ ranked = sorted(bucket.values(), key=lambda x: x["score_final"], reverse=True)
279
+ if not ranked: return json.dumps({"final": None, "alternatives": [], "explanation":"Aucun candidat"})
280
+ final = ranked[0]; alts = ranked[1:1+max(0,int(topn))]
281
+ exp = f"Choix {final['code']} (score {final['score_final']:.2f}) – votes={final['votes']} – raisons: {', '.join(sorted(set(final['evidences'])))}"
282
+ return json.dumps({"final": final, "alternatives": alts, "explanation": exp})
283
+
284
+
285
+
286
+ def build_agent(model_id: str | None = None) -> CodeAgent:
287
+ model_id = model_id or "Qwen/Qwen2.5-Coder-7B-Instruct" # léger pour tester
288
+ agent = CodeAgent(
289
+ tools=[ValidateEANTool(), OFFByEAN(), RegexCOICOP(), OFFtoCOICOP(), SemSim(), Resolve()],
290
+ model=InferenceClientModel(model_id=model_id),
291
+ add_base_tools=False,
292
+ max_steps=6,
293
+ verbosity_level=2,
294
+ )
295
+ return agent
296
+
297
+ def parse_result(res):
298
+ if isinstance(res, dict): return res
299
+ try: return ast.literal_eval(res)
300
+ except Exception: return {"raw": res}
301
+
302
+ if __name__ == "__main__":
303
+ # Remplace par les vraies données si possible - uniquement du test
304
+ ean = "3256221112345" # EAN fictif (peut ne pas exister sur OFF)
305
+ label = "Camembert au lait cru AOP 250g - ALDI"
306
+
307
+ agent = build_agent()
308
+ task = f"""
309
+ Classe ce produit en COICOP:
310
+ EAN: {ean}
311
+ Libellé: {label}
312
+ Pipeline:
313
+ 1) validate_ean(ean)
314
+ 2) openfoodfacts_product_by_ean(ean) # si OFF ne trouve pas, on s'appuie sur regex + embeddings
315
+ 3) map_off_to_coicop(product_name, categories_tags, ingredients_text)
316
+ 4) coicop_regex_rules(text=libellé)
317
+ 5) coicop_semantic_similarity(text=libellé, topk=5)
318
+ 6) resolve_coicop_candidates([...], topn=3)
319
+ Attend un JSON final.
320
+ """
321
+ out = agent.run(task)
322
+ print(parse_result(out))
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ smolagents
2
+ huggingface_hub
3
+ gradio
4
+ requests
5
+ sentence-transformers
6
+ torch