Marylene committed on
Commit
7c6ee39
·
verified ·
1 Parent(s): 7622cd6

Try Space push

Browse files
Files changed (5) hide show
  1. .gitignore +11 -0
  2. README.md +8 -12
  3. app.py +5 -0
  4. quick_deploy_agent.py +322 -0
  5. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.ipynb_checkpoints/
4
+ .cache/
5
+ data/
6
+ models/
7
+ outputs/
8
+ node_modules/
9
+ *.pt
10
+ *.bin
11
+ *.ckpt
README.md CHANGED
@@ -1,12 +1,8 @@
1
- ---
2
- title: OpenFoodFactsAgent POC
3
- emoji: 💻
4
- colorFrom: gray
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.46.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: OpenFoodFactsAgent (COICOP)
3
+ emoji: 🧀
4
+ sdk: gradio
5
+ app_file: app.py
6
+ python_version: "3.10"
7
+ pinned: false
8
+ ---
 
 
 
 
app.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from smolagents import GradioUI
2
+ from quick_deploy_agent import build_agent
3
+
4
+ agent = build_agent()
5
+ demo = GradioUI(agent).create_app()
quick_deploy_agent.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # quickstart_agent.py
2
+ from __future__ import annotations
3
+ import json, re, unicodedata, ast
4
+ from typing import List, Dict, Any, Optional
5
+ import requests
6
+ from smolagents import Tool, CodeAgent, InferenceClientModel
7
+ from sentence_transformers import SentenceTransformer, util
8
+
9
+ # ---- Mini référentiel COICOP (démo) ----
10
+ COICOP_ITEMS = [
11
+ {"code": "01.1.4.5.1", "label": "Laits caillés, fromage blanc, petites crèmes fromagères"},
12
+ {"code": "01.1.4.5.2", "label": "Fromage à pâte molle et à pâte persillée"},
13
+ {"code": "01.1.4.5.3", "label": "Fromage à pâte pressée"},
14
+ {"code": "01.1.4.5.4", "label": "Fromage de chèvre"},
15
+ {"code": "01.1.4.5.5", "label": "Fromages fondus, râpés, portions"},
16
+ {"code": "01.1.1.4", "label": "Pain"},
17
+ {"code": "01.1.1.1", "label": "Riz"},
18
+ {"code": "01.1.1.3", "label": "Pâtes, couscous et produits similaires"},
19
+ ]
20
+
21
+ def normalize_txt(s: str) -> str:
22
+ if not s: return ""
23
+ s = s.upper()
24
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
25
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
26
+ s = re.sub(r"\s+", " ", s).strip()
27
+ return s
28
+
29
+ def ean_check_digit_ok(ean: str) -> bool:
30
+ digits = re.sub(r"\D", "", ean)
31
+ if len(digits) not in (8, 12, 13, 14): return False
32
+ total = 0
33
+ for i, ch in enumerate(reversed(digits[:-1]), start=1):
34
+ n = int(ch); total += n * (3 if i % 2 == 1 else 1)
35
+ check = (10 - (total % 10)) % 10
36
+ return check == int(digits[-1])
37
+
38
+ # ---- ValidateEANTool : tout en local dans forward ----
39
+ class ValidateEANTool(Tool):
40
+ name, description = "validate_ean", "Valide un EAN/GTIN (clé GS1)."
41
+ inputs = {"ean": {"type": "string", "description": "Code EAN/GTIN (8/12/13/14 chiffres)."}}
42
+ output_type = "string"
43
+
44
+ def forward(self, ean: str) -> str:
45
+ import json, re
46
+ digits = re.sub(r"\D", "", ean or "")
47
+ if len(digits) not in (8, 12, 13, 14):
48
+ return json.dumps({"valid": False, "normalized": digits})
49
+ total = 0
50
+ for i, ch in enumerate(reversed(digits[:-1]), start=1):
51
+ n = int(ch); total += n * (3 if i % 2 == 1 else 1)
52
+ check = (10 - (total % 10)) % 10
53
+ return json.dumps({"valid": check == int(digits[-1]), "normalized": digits})
54
+
55
+
56
+ # ---- OFFByEAN : imports internes + requirements ----
57
+ # ---- OFFByEAN : robuste (retries + v2 + fallback .net) ----
58
+ class OFFByEAN(Tool):
59
+ name = "openfoodfacts_product_by_ean"
60
+ description = "Open Food Facts /api/v0|v2/product/{ean} (name, brands, categories...)."
61
+ inputs = {"ean": {"type": "string", "description": "EAN à interroger sur l'API OFF."}}
62
+ output_type = "string"
63
+ requirements = ["requests"] # urllib3 est une dépendance de requests
64
+
65
+ def forward(self, ean: str) -> str:
66
+ import json, re, requests
67
+ from requests.adapters import HTTPAdapter
68
+ from urllib3.util.retry import Retry
69
+
70
+ code = re.sub(r"\D", "", ean or "")
71
+ if not code:
72
+ return json.dumps({"status": 0, "code": "", "error": "EAN vide"})
73
+
74
+ # Session HTTP avec retries (429/5xx) et UA explicite
75
+ sess = requests.Session()
76
+ sess.headers.update({
77
+ "User-Agent": "insee-coicop-agent/1.0",
78
+ "Accept": "application/json",
79
+ })
80
+ retry = Retry(
81
+ total=3,
82
+ backoff_factor=0.5,
83
+ status_forcelist=[429, 500, 502, 503, 504],
84
+ allowed_methods=frozenset(["GET"]),
85
+ raise_on_status=False,
86
+ )
87
+ sess.mount("https://", HTTPAdapter(max_retries=retry))
88
+
89
+ # On tente v0, puis v2 (fields limités), puis le miroir .net
90
+ urls = [
91
+ f"https://world.openfoodfacts.org/api/v0/product/{code}.json",
92
+ f"https://world.openfoodfacts.org/api/v2/product/{code}"
93
+ "?fields=product_name_fr,product_name,brands,categories_tags,"
94
+ "ingredients_text_fr,ingredients_text,stores,status,status_verbose",
95
+ f"https://world.openfoodfacts.net/api/v0/product/{code}.json",
96
+ ]
97
+
98
+ last_err = None
99
+ for u in urls:
100
+ try:
101
+ r = sess.get(u, timeout=15)
102
+ if not r.ok:
103
+ last_err = f"HTTP {r.status_code}"
104
+ continue
105
+ data = r.json()
106
+ # v0 : status==1 ; v2 : parfois status absent mais product présent
107
+ product = data.get("product")
108
+ status = data.get("status", 1 if product else 0)
109
+ if status == 1 or product:
110
+ p = product or {}
111
+ out = {
112
+ "status": status,
113
+ "code": code,
114
+ "product_name": p.get("product_name_fr") or p.get("product_name"),
115
+ "brands": p.get("brands"),
116
+ "categories_tags": (
117
+ p.get("categories_tags")
118
+ or p.get("categories_tags_fr")
119
+ or p.get("categories")
120
+ ),
121
+ "ingredients_text": p.get("ingredients_text_fr") or p.get("ingredients_text"),
122
+ "stores": p.get("stores"),
123
+ }
124
+ return json.dumps(out)
125
+ except Exception as e:
126
+ last_err = str(e)
127
+
128
+ return json.dumps({"status": 0, "code": code, "error": last_err or "not found"})
129
+
130
+
131
+
132
+
133
+ # ---- RegexCOICOP : normalisation locale + regex précompilées ----
134
+ class RegexCOICOP(Tool):
135
+ name, description = "coicop_regex_rules", "Règles regex → candidats COICOP."
136
+ inputs = {"text": {"type": "string", "description": "Libellé produit (texte libre) à analyser."}}
137
+ output_type = "string"
138
+
139
+ # précompile ici pour rester autonome
140
+ import re as _re
141
+ SOFT = _re.compile(r"(?:\b|^)(?:CAMEMB(?:ERT)?|BRIE|COULOMMI(?:ERS?)?|BLEU|ROQUEFORT|GORGONZ(?:OLA)?|REBLOCHON|MUNSTER)(?:\b|$)")
142
+ PRESS = _re.compile(r"(?:\b|^)(EMMENTAL|COMTE|CANTAL|MIMOLETTE|GOUDA|EDAM|BEAUFORT|ABONDANCE|SALERS|TOMME|TOME)(?:\b|$)")
143
+ GOAT = _re.compile(r"(?:\b|^)(CHEVRE|STE MAURE|CROTTIN|BUCHE|PICODON|PELARDON|BANON)(?:\b|$)")
144
+ PROC = _re.compile(r"(?:\b|^)(FONDU(?:ES?)?|FROMAGE FONDU|TOASTINETTES?|VACHE QUI RIT|KIRI|CARRE FRAIS|CARR[ÉE] FRAIS|PORTIONS?)(?:\b|$)|\bRAP[ÉE]?\b")
145
+
146
+ @staticmethod
147
+ def _normalize_txt(s: str) -> str:
148
+ import unicodedata, re
149
+ if not s: return ""
150
+ s = s.upper()
151
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
152
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
153
+ return re.sub(r"\s+", " ", s).strip()
154
+
155
+ def forward(self, text: str) -> str:
156
+ import json, re
157
+ s = self._normalize_txt(text); c=[]
158
+ if self.SOFT.search(s): c.append({"code":"01.1.4.5.2","why":"pâte molle/persillée","score":0.95})
159
+ if self.PRESS.search(s): c.append({"code":"01.1.4.5.3","why":"pâte pressée","score":0.90})
160
+ if self.GOAT.search(s): c.append({"code":"01.1.4.5.4","why":"chèvre","score":0.90})
161
+ if self.PROC.search(s): c.append({"code":"01.1.4.5.5","why":"fondu/râpé/portions","score":0.85})
162
+ if not c and re.search(r"\bFROMAGE\b", s): c.append({"code":"01.1.4.5","why":"générique fromage/laits caillés","score":0.6})
163
+ return json.dumps({"candidates": c})
164
+
165
+
166
+ # ---- OFFtoCOICOP : normalisation locale + regex règles ----
167
+ class OFFtoCOICOP(Tool):
168
+ name, description = "map_off_to_coicop", "Mappe catégories OFF vers COICOP."
169
+ inputs = {
170
+ "product_name": {"type":"string", "description":"Nom produit OFF (fr/en).", "nullable": True},
171
+ "categories_tags": {"type":"array", "description":"Liste OFF categories_tags.", "nullable": True},
172
+ "ingredients_text":{"type":"string","description":"Texte ingrédients.", "nullable": True},
173
+ }
174
+ output_type="string"
175
+ import re as _re
176
+ RULES = [
177
+ (_re.compile(r"\b(CAMEMBERT|BRIE|COULOMMIERS|BLUE CHEESE|ROQUEFORT|GORGONZOLA|MUNSTER|REBLOCHON)\b"), ("01.1.4.5.2",0.95,"OFF: pâte molle/persillée")),
178
+ (_re.compile(r"\b(EMMENTAL|COMTE|CANTAL|MIMOLETTE|GOUDA|EDAM|BEAUFORT|ABONDANCE|SALERS|TOMME|TOME)\b"), ("01.1.4.5.3",0.90,"OFF: pâte pressée")),
179
+ (_re.compile(r"\b(CHEVRE|STE MAURE|CROTTIN|BUCHE|PICODON|PELARDON|BANON)\b"), ("01.1.4.5.4",0.90,"OFF: chèvre")),
180
+ (_re.compile(r"\b(FONDU|FONDUES?|RAPE|RÂPE|PORTIONS?|KIRI|VACHE QUI RIT|CARRE FRAIS|CARR[ÉE] FRAIS)\b"), ("01.1.4.5.5",0.85,"OFF: fondu/rapé/portions")),
181
+ ]
182
+
183
+ @staticmethod
184
+ def _normalize_txt(s: str) -> str:
185
+ import unicodedata, re
186
+ if not s: return ""
187
+ s = s.upper()
188
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
189
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
190
+ return re.sub(r"\s+", " ", s).strip()
191
+
192
+ def forward(self, product_name=None, categories_tags=None, ingredients_text=None) -> str:
193
+ import json
194
+ text = " ".join([t for t in [
195
+ self._normalize_txt(product_name or ""),
196
+ self._normalize_txt(" ".join(categories_tags or [])),
197
+ self._normalize_txt(ingredients_text or "")
198
+ ] if t])
199
+ c=[]
200
+ for rx,(code,score,why) in self.RULES:
201
+ if rx.search(text): c.append({"code":code,"why":why,"score":score})
202
+ return json.dumps({"candidates": c})
203
+
204
+
205
+ # ---- SemSim : COICOP embarqué + import lazy du modèle ----
206
+ class SemSim(Tool):
207
+ name, description = "coicop_semantic_similarity", "Embeddings → top-k COICOP."
208
+ inputs = {"text":{"type":"string","description":"Texte libellé"},
209
+ "topk":{"type":"integer","description":"Nombre de candidats (défaut 5)","nullable":True}}
210
+ output_type = "string"
211
+ # packages nécessaires côté Hub
212
+ requirements = ["sentence_transformers", "torch"]
213
+
214
+ # mini référentiel embarqué pour l'export Hub
215
+ COICOP_ITEMS = [
216
+ {"code": "01.1.4.5.1", "label": "Laits caillés, fromage blanc, petites crèmes fromagères"},
217
+ {"code": "01.1.4.5.2", "label": "Fromage à pâte molle et à pâte persillée"},
218
+ {"code": "01.1.4.5.3", "label": "Fromage à pâte pressée"},
219
+ {"code": "01.1.4.5.4", "label": "Fromage de chèvre"},
220
+ {"code": "01.1.4.5.5", "label": "Fromages fondus, râpés, portions"},
221
+ {"code": "01.1.1.4", "label": "Pain"},
222
+ {"code": "01.1.1.1", "label": "Riz"},
223
+ {"code": "01.1.1.3", "label": "Pâtes, couscous et produits similaires"},
224
+ ]
225
+
226
+ @staticmethod
227
+ def _normalize_txt(s: str) -> str:
228
+ import unicodedata, re
229
+ if not s: return ""
230
+ s = s.upper()
231
+ s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
232
+ s = re.sub(r"[^A-Z0-9% ]+", " ", s)
233
+ return re.sub(r"\s+", " ", s).strip()
234
+
235
+ def forward(self, text: str, topk: int = 5) -> str:
236
+ import json
237
+ from sentence_transformers import SentenceTransformer, util
238
+ # lazy init pour la compat hub
239
+ if not hasattr(self, "_model"):
240
+ self._model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
241
+ q = self._normalize_txt(text)
242
+ q_emb = self._model.encode([q], normalize_embeddings=True)
243
+ labels = [f"{it['code']} {it['label']}" for it in self.COICOP_ITEMS]
244
+ L = self._model.encode(labels, normalize_embeddings=True)
245
+ sims = util.cos_sim(q_emb, L).tolist()[0]
246
+ ranked = sorted(
247
+ [{"code": self.COICOP_ITEMS[i]["code"], "label": self.COICOP_ITEMS[i]["label"], "score": float(sims[i])}
248
+ for i in range(len(self.COICOP_ITEMS))],
249
+ key=lambda x: x["score"], reverse=True
250
+ )
251
+ return json.dumps({"candidates": ranked[:max(1,int(topk))]})
252
+
253
+
254
+ # ---- Resolve : import local json ----
255
+ class Resolve(Tool):
256
+ name, description = "resolve_coicop_candidates", "Fusionne candidats → choix final + alternatives + explication."
257
+ inputs = {"json_lists": {"type":"array","description":"Liste de JSON (str) d'autres tools."},
258
+ "topn":{"type":"integer","description":"Nb d'alternatives (défaut 3)","nullable":True}}
259
+ output_type = "string"
260
+
261
+ def forward(self, json_lists, topn: int = 3) -> str:
262
+ import json
263
+ from typing import Dict, Any
264
+ bucket: Dict[str, Dict[str, Any]] = {}
265
+ for s in json_lists:
266
+ data = json.loads(s) if s else {}
267
+ for c in data.get("candidates", []):
268
+ code = c["code"]; score = float(c.get("score", 0.0))
269
+ why = c.get("why", "") or c.get("label", "")
270
+ if code not in bucket:
271
+ bucket[code] = {"code":code,"score":score,"votes":1,"evidences":[why] if why else []}
272
+ else:
273
+ bucket[code]["score"] = max(bucket[code]["score"], score)
274
+ bucket[code]["votes"] += 1
275
+ if why: bucket[code]["evidences"].append(why)
276
+ for v in bucket.values():
277
+ v["score_final"] = v["score"] + 0.05*(v["votes"]-1)
278
+ ranked = sorted(bucket.values(), key=lambda x: x["score_final"], reverse=True)
279
+ if not ranked: return json.dumps({"final": None, "alternatives": [], "explanation":"Aucun candidat"})
280
+ final = ranked[0]; alts = ranked[1:1+max(0,int(topn))]
281
+ exp = f"Choix {final['code']} (score {final['score_final']:.2f}) – votes={final['votes']} – raisons: {', '.join(sorted(set(final['evidences'])))}"
282
+ return json.dumps({"final": final, "alternatives": alts, "explanation": exp})
283
+
284
+
285
+
286
+ def build_agent(model_id: str | None = None) -> CodeAgent:
287
+ model_id = model_id or "Qwen/Qwen2.5-Coder-7B-Instruct" # léger pour tester
288
+ agent = CodeAgent(
289
+ tools=[ValidateEANTool(), OFFByEAN(), RegexCOICOP(), OFFtoCOICOP(), SemSim(), Resolve()],
290
+ model=InferenceClientModel(model_id=model_id),
291
+ add_base_tools=False,
292
+ max_steps=6,
293
+ verbosity_level=2,
294
+ )
295
+ return agent
296
+
297
+ def parse_result(res):
298
+ if isinstance(res, dict): return res
299
+ try: return ast.literal_eval(res)
300
+ except Exception: return {"raw": res}
301
+
302
+ if __name__ == "__main__":
303
+ # Remplace par les vraies données si possible - uniquement du test
304
+ ean = "3256221112345" # EAN fictif (peut ne pas exister sur OFF)
305
+ label = "Camembert au lait cru AOP 250g - ALDI"
306
+
307
+ agent = build_agent()
308
+ task = f"""
309
+ Classe ce produit en COICOP:
310
+ EAN: {ean}
311
+ Libellé: {label}
312
+ Pipeline:
313
+ 1) validate_ean(ean)
314
+ 2) openfoodfacts_product_by_ean(ean) # si OFF ne trouve pas, on s'appuie sur regex + embeddings
315
+ 3) map_off_to_coicop(product_name, categories_tags, ingredients_text)
316
+ 4) coicop_regex_rules(text=libellé)
317
+ 5) coicop_semantic_similarity(text=libellé, topk=5)
318
+ 6) resolve_coicop_candidates([...], topn=3)
319
+ Attend un JSON final.
320
+ """
321
+ out = agent.run(task)
322
+ print(parse_result(out))
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ smolagents
2
+ huggingface_hub
3
+ gradio
4
+ requests
5
+ sentence-transformers
6
+ torch