Spaces:

FabIndy
/

code-education-rag

Running

App Files Files Community

FabIndy committed on Jan 15

Commit

b4a2740

1 Parent(s): 6a852cd

Fix LIST retrieval with robust FAISS fallback and lexical normalization

Browse files

Files changed (1) hide show

src/rag_core.py +85 -97

src/rag_core.py CHANGED Viewed

@@ -3,9 +3,9 @@
 """
 rag_core.py – Modes :
-- LIST     : rapide (FAISS, pas de LLM) — corrigé : score + seuil + garde lexical + refus si non pertinent
 - FULLTEXT : rapide (texte exact depuis JSONL, pas de LLM)
-- EXPLAIN  : rapide -> en réalité une SYNTHÈSE extractive (text mining), pas une explication
 - QA       : présent, mais accéléré (moins de garde-fous, avertissement utilisateur)
 Notes produit :
@@ -34,16 +34,21 @@ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 SNIPPET_CHARS = 260
 # --- LIST (FIABILITÉ) ---
-# On récupère large puis on filtre : score + lexical + dédup
 LIST_K = int(os.environ.get("LIST_K", "30"))
 LIST_MAX_ARTICLES = int(os.environ.get("LIST_MAX_ARTICLES", "8"))
-# NOTE: Avec FAISS LangChain, le "score" est généralement une distance (plus petit = meilleur).
-# À ajuster sur ton corpus. 0.45–0.75 sont des valeurs usuelles selon l’index.
-LIST_SCORE_THRESHOLD = float(os.environ.get("LIST_SCORE_THRESHOLD", "0.60"))
-# Garde lexical : au moins 1 mot-clé significatif doit apparaître dans le doc
 LIST_REQUIRE_LEXICAL_MATCH = os.environ.get("LIST_REQUIRE_LEXICAL_MATCH", "1") == "1"
 LIST_MIN_KEYWORDS = int(os.environ.get("LIST_MIN_KEYWORDS", "1"))
 # --- EXPLAIN (synthèse extractive) ---
 EXTRACT_MAX_SEGMENTS = 5
 EXTRACT_MAX_CHARS_TOTAL = 900
@@ -51,10 +56,10 @@ EXTRACT_MIN_SEG_LEN = 30
 EXTRACT_MAX_SEG_LEN = 420
 # --- QA : accélération ---
-QA_TOP_K_FINAL = int(os.environ.get("QA_TOP_K_FINAL", "2"))          # 1 ou 2 conseillé sur CPU
-QA_DOC_MAX_CHARS = int(os.environ.get("QA_DOC_MAX_CHARS", "700"))   # tronque le contexte envoyé au LLM
-QA_MAX_TOKENS = int(os.environ.get("QA_MAX_TOKENS", "140"))         # court
-QA_TEMPERATURE = float(os.environ.get("QA_TEMPERATURE", "0.1"))     # stable
 ARTICLE_ID_RE = re.compile(
     r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
@@ -67,12 +72,6 @@ EXPLAIN_TRIGGERS = [
     "extraits", "extrait", "résumé extractif", "resume extractif",
 ]
-EXPLAINISH_WORDS = [
-    "explique", "expliquer", "explication",
-    "résume", "resume", "résumé", "reformule", "simplifie",
-    "en termes simples", "vulgarise", "clarifie",
-]
 LIST_TRIGGERS = [
     "quels articles", "quelles dispositions", "articles parlent",
     "articles qui parlent", "articles sur", "donne les articles",
@@ -100,7 +99,7 @@ _QA_WARNING = (
 # ==================== LLM INIT ====================
-# n_ctx réduit pour accélérer QA sur CPU.
 llm = Llama(
     model_path="models/model.gguf",
     n_ctx=1024,
@@ -130,11 +129,6 @@ def extract_article_id(q: str) -> Optional[str]:
     return normalize_article_id(m.group(1)) if m else None
-def safe_snippet(text: str, n: int) -> str:
-    t = " ".join((text or "").split())
-    return t if len(t) <= n else t[:n].rstrip() + "…"
 def load_article_text(article_id: str) -> Optional[str]:
     if not CHUNKS_PATH.exists():
         raise FileNotFoundError(f"Fichier chunks introuvable : {CHUNKS_PATH}")
@@ -161,10 +155,6 @@ def is_fulltext_request(q: str) -> bool:
 def is_explain_synthesis_request(q: str) -> bool:
-    """
-    EXPLAIN = synthèse extractive si le texte contient des marqueurs explicites de synthèse.
-    (Un ID d'article sera exigé dans le routing.)
-    """
     ql = (q or "").lower()
     return any(t in ql for t in EXPLAIN_TRIGGERS)
@@ -186,7 +176,7 @@ def get_vectorstore() -> FAISS:
     return _VS
-# ==================== LIST: KEYWORDS GUARD (FAST) ====================
 _STOPWORDS_FR = {
     "de", "des", "du", "la", "le", "les", "un", "une", "et", "ou", "a", "à",
@@ -196,34 +186,29 @@ _STOPWORDS_FR = {
     "code", "education", "éducation", "l'", "d'", "du", "des"
 }
 def _extract_keywords_for_list(q: str) -> List[str]:
-    """
-    Extraction très simple de mots-clés (sans NLP lourd) :
-    - on retire les triggers usuels de LIST
-    - on garde des tokens alpha-num >= 3
-    - on retire stopwords
-    """
     ql = (q or "").lower()
-    # enlever quelques formulations fréquentes
     for t in LIST_TRIGGERS:
         ql = ql.replace(t, " ")
-    # tokens (lettres + chiffres + -)
     toks = re.findall(r"[a-z0-9àâäçéèêëîïôöùûüÿ\-]{3,}", ql, flags=re.IGNORECASE)
     toks = [t.strip("-") for t in toks if t.strip("-")]
-    # filtre stopwords
     out = []
     for t in toks:
         if t in _STOPWORDS_FR:
             continue
         if len(t) < 3:
             continue
         out.append(t)
-    # dédup en conservant l’ordre
     seen = set()
     uniq = []
     for t in out:
@@ -234,7 +219,7 @@ def _extract_keywords_for_list(q: str) -> List[str]:
     return uniq
-def _lexical_match(doc_text: str, keywords: List[str]) -> bool:
     if not keywords:
         return False
     low = (doc_text or "").lower()
@@ -242,12 +227,12 @@ def _lexical_match(doc_text: str, keywords: List[str]) -> bool:
     for kw in keywords:
         if kw in low:
             hits += 1
-            if hits >= LIST_MIN_KEYWORDS:
                 return True
     return False
-# ==================== EXTRACTIVE SUMMARY (FAST) ====================
 _NORMATIVE_PATTERNS = [
     r"\bdoit\b", r"\bdoivent\b", r"\best\b", r"\bsont\b",
@@ -260,7 +245,6 @@ _NORMATIVE_PATTERNS = [
     r"\bI\.\b", r"\bII\.\b", r"\bIII\.\b", r"\b1°\b", r"\b2°\b", r"\b3°\b",
 ]
 def _split_into_segments(text: str) -> List[str]:
     if not text:
         return []
@@ -274,7 +258,6 @@ def _split_into_segments(text: str) -> List[str]:
             segs.append(ln)
     return segs
 def _score_segment(seg: str) -> int:
     s = 0
     low = seg.lower()
@@ -287,13 +270,7 @@ def _score_segment(seg: str) -> int:
         s -= 1
     return s
 def extractive_summary(article_id: str, article_text: str) -> str:
-    """
-    SYNTHÈSE extractive (rapide) :
-    - sélection de segments clés (extraction)
-    - aucune génération => zéro hallucination
-    """
     segs = _split_into_segments(article_text)
     cleaned: List[str] = []
     for s in segs:
@@ -328,7 +305,7 @@ def extractive_summary(article_id: str, article_text: str) -> str:
     return f"{body}\n\nArticles cités : {article_id}"
-# ==================== QA PROMPT (FAST) ====================
 def _truncate(s: str, n: int) -> str:
     if not s:
@@ -336,7 +313,6 @@ def _truncate(s: str, n: int) -> str:
     s = s.strip()
     return s if len(s) <= n else s[:n].rstrip() + "\n[...]\n"
 def build_qa_prompt_fast(question: str, context: str, sources: List[str]) -> str:
     src = ", ".join(sources)
     return f"""Tu es un assistant qui aide à comprendre le Code de l'éducation (France).
@@ -344,7 +320,7 @@ def build_qa_prompt_fast(question: str, context: str, sources: List[str]) -> str
 CONTRAINTE :
 - Appuie-toi en priorité sur le CONTEXTE fourni.
 - Si l'information n'est pas dans le contexte, dis-le simplement.
-- Réponse courte, pratique, 5-8 phrases max.
 QUESTION :
 {question}
@@ -358,73 +334,86 @@ Indique à la fin : "Sources (articles) : {src}"
 # ==================== CORE ====================
-def answer_query(q: str) -> Dict[str, Any]:
-    q = (q or "").strip()
-    if not q:
-        return {"mode": "QA", "answer": _REFUSAL, "articles": []}
-    article_id = extract_article_id(q)
-    # ---------- FULLTEXT ----------
-    if article_id and is_fulltext_request(q):
-        text = load_article_text(article_id)
-        return {"mode": "FULLTEXT", "answer": text or _REFUSAL, "articles": [article_id]}
-    # ---------- LIST (CORRIGÉ) ----------
-    if is_list_request(q):
-        vs = get_vectorstore()
-        # Keywords pour garde lexical (très rapide)
-        keywords = _extract_keywords_for_list(q)
-        # Petit enrichissement "léger" pour stabiliser les embeddings
-        # (souvent utile sur corpus juridique)
-        list_query = f"articles sur {q}"
-        # Récupération large + score (distance FAISS)
-        scored_docs: List[Tuple[Any, float]] = vs.similarity_search_with_score(list_query, k=LIST_K)
         kept: List[Tuple[str, float]] = []
         for d, score in scored_docs:
             aid = normalize_article_id(d.metadata.get("article_id", ""))
             if not aid:
                 continue
-            # Filtre score : on garde seulement si c'est suffisamment proche
-            if score > LIST_SCORE_THRESHOLD:
                 continue
-            # Filtre lexical : au moins 1 mot clé doit apparaître dans le contenu
-            if LIST_REQUIRE_LEXICAL_MATCH and keywords:
-                if not _lexical_match(d.page_content or "", keywords):
                     continue
             kept.append((aid, score))
-        # Tri par score croissant (meilleur d'abord) + dédup
-        kept_sorted = sorted(kept, key=lambda x: x[1])
         seen = set()
-        articles: List[str] = []
         for aid, _ in kept_sorted:
             if aid in seen:
                 continue
             seen.add(aid)
-            articles.append(aid)
-            if len(articles) >= LIST_MAX_ARTICLES:
                 break
-        if not articles:
             msg = (
-                "Je n’ai pas trouvé d’articles suffisamment pertinents pour ce thème.\n"
-                "Conseil : précise ta demande (ex : « conseil de classe composition », "
-                "« conseil de classe horaires », « conseil de classe bulletin ») "
-                "ou utilise « Texte exact » si tu connais déjà l’article."
             )
-            return {"mode": "LIST", "answer": msg, "articles": []}
-        return {"mode": "LIST", "answer": "", "articles": articles}
-    # ---------- EXPLAIN (SYNTHÈSE extractive) ----------
     if is_explain_synthesis_request(q):
         if not article_id:
             return {"mode": "EXPLAIN", "answer": _EXPLAIN_REFUSAL, "articles": []}
@@ -436,7 +425,7 @@ def answer_query(q: str) -> Dict[str, Any]:
         summary = extractive_summary(article_id, text)
         return {"mode": "EXPLAIN", "answer": summary, "articles": [article_id]}
-    # ---------- QA (FAST) ----------
     vs = get_vectorstore()
     docs = vs.similarity_search(q, k=max(1, QA_TOP_K_FINAL))
     sources = [normalize_article_id(d.metadata.get("article_id", "")) for d in docs]
@@ -448,10 +437,9 @@ def answer_query(q: str) -> Dict[str, Any]:
         ctx_parts.append(f"[{aid}]\n{txt}")
     context = "\n\n".join(ctx_parts).strip()
     prompt = build_qa_prompt_fast(q, context, sources)
-    ans = llm_generate_qa(prompt).strip()
     final = f"{_QA_WARNING}\n\n{ans}"
     return {"mode": "QA", "answer": final, "articles": sources}

 """
 rag_core.py – Modes :
+- LIST     : rapide (FAISS, pas de LLM) — robuste : 2 passes (strict puis fallback)
 - FULLTEXT : rapide (texte exact depuis JSONL, pas de LLM)
+- EXPLAIN  : rapide -> synthèse extractive (text mining), pas une explication
 - QA       : présent, mais accéléré (moins de garde-fous, avertissement utilisateur)
 Notes produit :
 SNIPPET_CHARS = 260
 # --- LIST (FIABILITÉ) ---
 LIST_K = int(os.environ.get("LIST_K", "30"))
 LIST_MAX_ARTICLES = int(os.environ.get("LIST_MAX_ARTICLES", "8"))
+# Seuil sur distance FAISS (plus petit = meilleur).
+# Par défaut : tolérant (sinon LIST tombe à 0 trop facilement).
+LIST_SCORE_THRESHOLD = float(os.environ.get("LIST_SCORE_THRESHOLD", "0.80"))
+# Lexical guard : utile, mais doit être "fallbackable"
 LIST_REQUIRE_LEXICAL_MATCH = os.environ.get("LIST_REQUIRE_LEXICAL_MATCH", "1") == "1"
 LIST_MIN_KEYWORDS = int(os.environ.get("LIST_MIN_KEYWORDS", "1"))
+# Fallback si 0 résultat : on relâche le lexical et/ou le seuil
+LIST_FALLBACK_RELAX_LEXICAL = os.environ.get("LIST_FALLBACK_RELAX_LEXICAL", "1") == "1"
+LIST_FALLBACK_SCORE_THRESHOLD = float(os.environ.get("LIST_FALLBACK_SCORE_THRESHOLD", "1.10"))
 # --- EXPLAIN (synthèse extractive) ---
 EXTRACT_MAX_SEGMENTS = 5
 EXTRACT_MAX_CHARS_TOTAL = 900
 EXTRACT_MAX_SEG_LEN = 420
 # --- QA : accélération ---
+QA_TOP_K_FINAL = int(os.environ.get("QA_TOP_K_FINAL", "2"))
+QA_DOC_MAX_CHARS = int(os.environ.get("QA_DOC_MAX_CHARS", "700"))
+QA_MAX_TOKENS = int(os.environ.get("QA_MAX_TOKENS", "160"))
+QA_TEMPERATURE = float(os.environ.get("QA_TEMPERATURE", "0.2"))
 ARTICLE_ID_RE = re.compile(
     r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
     "extraits", "extrait", "résumé extractif", "resume extractif",
 ]
 LIST_TRIGGERS = [
     "quels articles", "quelles dispositions", "articles parlent",
     "articles qui parlent", "articles sur", "donne les articles",
 # ==================== LLM INIT ====================
 llm = Llama(
     model_path="models/model.gguf",
     n_ctx=1024,
     return normalize_article_id(m.group(1)) if m else None
 def load_article_text(article_id: str) -> Optional[str]:
     if not CHUNKS_PATH.exists():
         raise FileNotFoundError(f"Fichier chunks introuvable : {CHUNKS_PATH}")
 def is_explain_synthesis_request(q: str) -> bool:
     ql = (q or "").lower()
     return any(t in ql for t in EXPLAIN_TRIGGERS)
     return _VS
+# ==================== LIST: KEYWORDS GUARD ====================
 _STOPWORDS_FR = {
     "de", "des", "du", "la", "le", "les", "un", "une", "et", "ou", "a", "à",
     "code", "education", "éducation", "l'", "d'", "du", "des"
 }
+def _simple_singularize(token: str) -> str:
+    # Minimal heuristic: drop one trailing "s" from tokens of 5+ characters (e.g. conseils -> conseil, classes -> classe); shorter tokens are returned unchanged
+    if token.endswith("s") and len(token) >= 5:
+        return token[:-1]
+    return token
 def _extract_keywords_for_list(q: str) -> List[str]:
     ql = (q or "").lower()
     for t in LIST_TRIGGERS:
         ql = ql.replace(t, " ")
     toks = re.findall(r"[a-z0-9àâäçéèêëîïôöùûüÿ\-]{3,}", ql, flags=re.IGNORECASE)
     toks = [t.strip("-") for t in toks if t.strip("-")]
     out = []
     for t in toks:
+        t = _simple_singularize(t)
         if t in _STOPWORDS_FR:
             continue
         if len(t) < 3:
             continue
         out.append(t)
     seen = set()
     uniq = []
     for t in out:
     return uniq
+def _lexical_match(doc_text: str, keywords: List[str], min_hits: int) -> bool:
     if not keywords:
         return False
     low = (doc_text or "").lower()
     for kw in keywords:
         if kw in low:
             hits += 1
+            if hits >= min_hits:
                 return True
     return False
+# ==================== EXTRACTIVE SUMMARY ====================
 _NORMATIVE_PATTERNS = [
     r"\bdoit\b", r"\bdoivent\b", r"\best\b", r"\bsont\b",
     r"\bI\.\b", r"\bII\.\b", r"\bIII\.\b", r"\b1°\b", r"\b2°\b", r"\b3°\b",
 ]
 def _split_into_segments(text: str) -> List[str]:
     if not text:
         return []
             segs.append(ln)
     return segs
 def _score_segment(seg: str) -> int:
     s = 0
     low = seg.lower()
         s -= 1
     return s
 def extractive_summary(article_id: str, article_text: str) -> str:
     segs = _split_into_segments(article_text)
     cleaned: List[str] = []
     for s in segs:
     return f"{body}\n\nArticles cités : {article_id}"
+# ==================== QA PROMPT ====================
 def _truncate(s: str, n: int) -> str:
     if not s:
     s = s.strip()
     return s if len(s) <= n else s[:n].rstrip() + "\n[...]\n"
 def build_qa_prompt_fast(question: str, context: str, sources: List[str]) -> str:
     src = ", ".join(sources)
     return f"""Tu es un assistant qui aide à comprendre le Code de l'éducation (France).
 CONTRAINTE :
 - Appuie-toi en priorité sur le CONTEXTE fourni.
 - Si l'information n'est pas dans le contexte, dis-le simplement.
+- Réponse courte, pratique, 6-10 phrases max.
 QUESTION :
 {question}
 # ==================== CORE ====================
+def _list_articles(theme_query: str) -> Dict[str, Any]:
+    vs = get_vectorstore()
+    keywords = _extract_keywords_for_list(theme_query)
+    # Light enrichment of the text sent to the embedding search: the raw query is prefixed with "articles sur"
+    list_query = f"articles sur {theme_query}"
+    scored_docs: List[Tuple[Any, float]] = vs.similarity_search_with_score(list_query, k=LIST_K)
+    def run_pass(score_threshold: float, require_lexical: bool) -> List[str]:
         kept: List[Tuple[str, float]] = []
         for d, score in scored_docs:
             aid = normalize_article_id(d.metadata.get("article_id", ""))
             if not aid:
                 continue
+            if score > score_threshold:
                 continue
+            if require_lexical and keywords:
+                if not _lexical_match(d.page_content or "", keywords, LIST_MIN_KEYWORDS):
                     continue
             kept.append((aid, score))
+        kept_sorted = sorted(kept, key=lambda x: x[1])  # ascending score: the lowest score is treated as the best match and comes first
         seen = set()
+        out: List[str] = []
         for aid, _ in kept_sorted:
             if aid in seen:
                 continue
             seen.add(aid)
+            out.append(aid)
+            if len(out) >= LIST_MAX_ARTICLES:
                 break
+        return out
+    # Pass 1 (strict): primary score threshold, lexical guard applied as configured
+    articles = run_pass(LIST_SCORE_THRESHOLD, LIST_REQUIRE_LEXICAL_MATCH)
+    # Pass 2 (fallback): looser score threshold with the lexical guard disabled, to avoid returning zero results; only runs when LIST_FALLBACK_RELAX_LEXICAL is enabled
+    if not articles and LIST_FALLBACK_RELAX_LEXICAL:
+        articles = run_pass(LIST_FALLBACK_SCORE_THRESHOLD, False)
+        if articles:
             msg = (
+                "Résultats approximatifs : le thème ne correspond pas textuellement aux passages indexés.\n"
+                "Conseil : précise (ex : « conseil de classe composition » / « vacances scolaires calendrier »), "
+                "puis vérifie via « Texte exact »."
             )
+            return {"mode": "LIST", "answer": msg, "articles": articles}
+    if not articles:
+        msg = (
+            "Je n’ai pas trouvé d’articles suffisamment pertinents pour ce thème.\n"
+            "Conseil : précise ta demande (ex : « conseil de classe composition », "
+            "« vacances scolaires calendrier ») ou utilise « Question (QA) » (plus lent)."
+        )
+        return {"mode": "LIST", "answer": msg, "articles": []}
+    return {"mode": "LIST", "answer": "", "articles": articles}
+def answer_query(q: str) -> Dict[str, Any]:
+    q = (q or "").strip()
+    if not q:
+        return {"mode": "QA", "answer": _REFUSAL, "articles": []}
+    article_id = extract_article_id(q)
+    # FULLTEXT
+    if article_id and is_fulltext_request(q):
+        text = load_article_text(article_id)
+        return {"mode": "FULLTEXT", "answer": text or _REFUSAL, "articles": [article_id]}
+    # LIST
+    if is_list_request(q):
+        return _list_articles(q)
+    # EXPLAIN (synthèse extractive)
     if is_explain_synthesis_request(q):
         if not article_id:
             return {"mode": "EXPLAIN", "answer": _EXPLAIN_REFUSAL, "articles": []}
         summary = extractive_summary(article_id, text)
         return {"mode": "EXPLAIN", "answer": summary, "articles": [article_id]}
+    # QA (FAST)
     vs = get_vectorstore()
     docs = vs.similarity_search(q, k=max(1, QA_TOP_K_FINAL))
     sources = [normalize_article_id(d.metadata.get("article_id", "")) for d in docs]
         ctx_parts.append(f"[{aid}]\n{txt}")
     context = "\n\n".join(ctx_parts).strip()
     prompt = build_qa_prompt_fast(q, context, sources)
+    ans = llm_generate_qa(prompt).strip()
     final = f"{_QA_WARNING}\n\n{ans}"
     return {"mode": "QA", "answer": final, "articles": sources}