"""RAG evaluator for the PaperBrainAI backend (app/rag_evaluator.py).

Scores RAG responses on faithfulness, answer relevancy, context recall and
hallucination using a Hugging Face hosted instruction model as the judge.
"""
import json
import re
import os
from huggingface_hub import InferenceClient
# Hugging Face Inference API configuration.  Both values are read from the
# environment so deployments can swap tokens/models without code changes.
HF_TOKEN = os.getenv("HF_TOKEN", "")
MODEL_NAME = os.getenv("HF_MODEL", "Qwen/Qwen2.5-72B-Instruct")
# Lazily-created process-wide client; populated on first use by _get_client().
_client: InferenceClient | None = None
def _get_client() -> InferenceClient:
    """Return the shared InferenceClient, creating it on first call."""
    global _client
    if _client is not None:
        return _client
    # An empty HF_TOKEN is passed as None so the client falls back to
    # anonymous / ambient credentials.
    _client = InferenceClient(model=MODEL_NAME, token=HF_TOKEN or None)
    return _client
def _call_hf(prompt: str, max_tokens: int = 256, temperature: float = 0.1) -> str:
    """Send *prompt* to the HF text-generation endpoint, return the stripped reply.

    NOTE(review): with do_sample=False the generation is greedy, so
    `temperature` is effectively ignored by the endpoint; it is kept for
    interface compatibility.
    """
    generated = _get_client().text_generation(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=False,  # deterministic for evaluation
        return_full_text=False,
    )
    return generated.strip()
def _extract_score(raw: str) -> float:
try:
cleaned = re.sub(r'```(?:json)?\s*|```', '', raw).strip()
data = json.loads(cleaned)
if isinstance(data, dict):
for key in ["score", "value", "result", "rating"]:
if key in data:
val = float(data[key])
return max(0.0, min(1.0, val if val <= 1.0 else val / 10.0))
except Exception:
pass
matches = re.findall(r'\b(0\.\d+|1\.0|[0-9](?:\.[0-9]+)?)\b', raw)
for m in matches:
val = float(m)
if 0.0 <= val <= 1.0:
return val
if 1.0 < val <= 10.0:
return val / 10.0
raw_lower = raw.lower()
if any(w in raw_lower for w in ["excellent", "perfect", "fully", "completely"]):
return 0.9
if any(w in raw_lower for w in ["good", "mostly", "largely"]):
return 0.7
if any(w in raw_lower for w in ["partial", "somewhat", "moderate"]):
return 0.5
if any(w in raw_lower for w in ["poor", "barely", "little"]):
return 0.3
if any(w in raw_lower for w in ["no", "none", "not", "fail"]):
return 0.1
return 0.5
def _parse_result(raw: str) -> tuple[float, str]:
    """Parse a judge-model reply into a (score rounded to 2 decimals, reason) pair."""
    default_reason = "No reason provided."
    score = _extract_score(raw)
    try:
        # Happy path: fenced or bare JSON carrying a "reason" field.
        stripped = re.sub(r'```(?:json)?\s*|```', '', raw).strip()
        parsed = json.loads(stripped)
        reason = parsed.get("reason", default_reason)
    except Exception:
        # Fallback: pull the quoted reason value straight out of the text.
        quoted = re.search(r'"reason"\s*:\s*"([^"]+)"', raw)
        reason = quoted.group(1) if quoted else default_reason
    return round(score, 2), reason
# ── Evaluation functions ──────────────────────────────────────────────────────
def evaluate_faithfulness(question: str, context: str, answer: str) -> dict:
    """Score how well the answer is grounded in the retrieved context.

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Context/answer are truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Évalue la FIDÉLITÉ de la réponse.
La fidélité mesure si la réponse est entièrement fondée sur le contexte fourni.
Question : {question}
Contexte : {context[:2000]}
Réponse : {answer[:1000]}
Note de 0.0 à 1.0 (1.0 = entièrement fondée sur le contexte, 0.0 = totalement hallucinée).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_answer_relevancy(question: str, answer: str) -> dict:
    """Score whether the answer actually addresses the question.

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Answer is truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Évalue la PERTINENCE DE LA RÉPONSE.
La pertinence mesure si la réponse répond directement à la question posée.
Question : {question}
Réponse : {answer[:1000]}
Note de 0.0 à 1.0 (1.0 = répond parfaitement, 0.0 = hors sujet).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_context_recall(question: str, context: str) -> dict:
    """Score whether the retrieved context contains what is needed to answer.

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Context is truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Évalue le RAPPEL DU CONTEXTE.
Mesure si le contexte récupéré contient les informations nécessaires pour répondre à la question.
Question : {question}
Contexte récupéré : {context[:2000]}
Note de 0.0 à 1.0 (1.0 = contexte idéal, 0.0 = contexte inutile).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_hallucination(question: str, context: str, answer: str) -> dict:
    """Score the absence of hallucinated content (1.0 = none, 0.0 = all made up).

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Context/answer are truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Détecte les HALLUCINATIONS dans la réponse.
Une hallucination = information présente dans la réponse mais ABSENTE du contexte et non-connaissance générale.
Question : {question}
Contexte : {context[:2000]}
Réponse : {answer[:1000]}
Note de 0.0 à 1.0 (1.0 = aucune hallucination, 0.0 = totalement hallucinée).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_rag_response(question: str, context: str, answer: str) -> dict:
    """Run all four RAG metrics and aggregate them into an overall score.

    Returns a dict with "overall_score" (0.0-1.0), a letter "grade" (A-F),
    the per-metric "metrics" dict, and a human-readable "summary".
    A metric whose evaluator raises is recorded with error=True and excluded
    from the aggregate instead of aborting the whole evaluation.
    """
    print(f"[RAG EVAL] Démarrage pour : {question[:80]}")
    results: dict[str, dict] = {}
    for key, fn, args in [
        ("faithfulness", evaluate_faithfulness, (question, context, answer)),
        ("answer_relevancy", evaluate_answer_relevancy, (question, answer)),
        ("context_recall", evaluate_context_recall, (question, context)),
        ("hallucination", evaluate_hallucination, (question, context, answer)),
    ]:
        try:
            results[key] = fn(*args)
            print(f"[RAG EVAL] {key}: {results[key]['score']}")
        except Exception as e:
            # Best effort: keep the failure visible but continue evaluating.
            results[key] = {"score": 0.0, "reason": str(e), "error": True}
    weights = {
        "faithfulness": 0.35,
        "answer_relevancy": 0.30,
        "context_recall": 0.20,
        "hallucination": 0.15,
    }
    # Aggregate only over metrics that succeeded, renormalising by the sum of
    # their weights.  Previously a failed metric was dropped from the sum but
    # its weight was not, so a transient API error silently deflated the
    # overall score (and the grade) even when the remaining metrics were high.
    valid = {k: w for k, w in weights.items() if not results[k].get("error")}
    total_weight = sum(valid.values())
    if total_weight:
        overall = round(
            sum(results[k]["score"] * w for k, w in valid.items()) / total_weight, 2
        )
    else:
        overall = 0.0  # every metric failed
    if overall >= 0.85:
        grade = "A"
    elif overall >= 0.70:
        grade = "B"
    elif overall >= 0.55:
        grade = "C"
    elif overall >= 0.40:
        grade = "D"
    else:
        grade = "F"
    print(f"[RAG EVAL] Overall: {overall} ({grade})")
    return {
        "question": question,
        "overall_score": overall,
        "grade": grade,
        "metrics": results,
        "summary": _generate_summary(overall, results),
    }
def _generate_summary(overall: float, results: dict) -> str:
label_map = {
"faithfulness": "Fidélité",
"answer_relevancy": "Pertinence",
"context_recall": "Rappel contexte",
"hallucination": "Hallucination",
}
weak = [label_map[k] for k, v in results.items() if v["score"] < 0.5 and not v.get("error")]
strong = [label_map[k] for k, v in results.items() if v["score"] >= 0.8 and not v.get("error")]
if overall >= 0.85:
verdict = "Excellente réponse RAG."
elif overall >= 0.70:
verdict = "Bonne réponse avec quelques défauts mineurs."
elif overall >= 0.50:
verdict = "Réponse acceptable — qualité du contexte à améliorer."
else:
verdict = "Réponse insuffisante — uploadez des documents plus pertinents."
parts = []
if strong:
parts.append(f"Points forts : {', '.join(strong)}.")
if weak:
parts.append(f"À améliorer : {', '.join(weak)}.")
return verdict + (" " + " ".join(parts) if parts else "")