"""RAG evaluator for the PaperBrainAI backend (app/rag_evaluator.py).

Scores RAG responses on faithfulness, answer relevancy, context recall and
hallucination using a Hugging Face hosted instruction model as the judge.
"""
import json
import re
import os
from huggingface_hub import InferenceClient
# Hugging Face Inference API configuration.  Both values are read from the
# environment so deployments can swap tokens/models without code changes.
HF_TOKEN = os.getenv("HF_TOKEN", "")
MODEL_NAME = os.getenv("HF_MODEL", "Qwen/Qwen2.5-72B-Instruct")
# Lazily-created process-wide client; populated on first use by _get_client().
_client: InferenceClient | None = None
def _get_client() -> InferenceClient:
    """Return the shared InferenceClient, creating it on first call."""
    global _client
    if _client is not None:
        return _client
    # An empty HF_TOKEN is passed as None so the client falls back to
    # anonymous / ambient credentials.
    _client = InferenceClient(model=MODEL_NAME, token=HF_TOKEN or None)
    return _client
def _call_hf(prompt: str, max_tokens: int = 256, temperature: float = 0.1) -> str:
    """Send *prompt* to the HF text-generation endpoint, return the stripped reply.

    NOTE(review): with do_sample=False the generation is greedy, so
    `temperature` is effectively ignored by the endpoint; it is kept for
    interface compatibility.
    """
    generated = _get_client().text_generation(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=False,  # deterministic for evaluation
        return_full_text=False,
    )
    return generated.strip()
def _extract_score(raw: str) -> float:
try:
cleaned = re.sub(r'```(?:json)?\s*|```', '', raw).strip()
data = json.loads(cleaned)
if isinstance(data, dict):
for key in ["score", "value", "result", "rating"]:
if key in data:
val = float(data[key])
return max(0.0, min(1.0, val if val <= 1.0 else val / 10.0))
except Exception:
pass
matches = re.findall(r'\b(0\.\d+|1\.0|[0-9](?:\.[0-9]+)?)\b', raw)
for m in matches:
val = float(m)
if 0.0 <= val <= 1.0:
return val
if 1.0 < val <= 10.0:
return val / 10.0
raw_lower = raw.lower()
if any(w in raw_lower for w in ["excellent", "perfect", "fully", "completely"]):
return 0.9
if any(w in raw_lower for w in ["good", "mostly", "largely"]):
return 0.7
if any(w in raw_lower for w in ["partial", "somewhat", "moderate"]):
return 0.5
if any(w in raw_lower for w in ["poor", "barely", "little"]):
return 0.3
if any(w in raw_lower for w in ["no", "none", "not", "fail"]):
return 0.1
return 0.5
def _parse_result(raw: str) -> tuple[float, str]:
    """Parse a judge-model reply into a (score rounded to 2 decimals, reason) pair."""
    default_reason = "No reason provided."
    score = _extract_score(raw)
    try:
        # Happy path: fenced or bare JSON carrying a "reason" field.
        stripped = re.sub(r'```(?:json)?\s*|```', '', raw).strip()
        parsed = json.loads(stripped)
        reason = parsed.get("reason", default_reason)
    except Exception:
        # Fallback: pull the quoted reason value straight out of the text.
        quoted = re.search(r'"reason"\s*:\s*"([^"]+)"', raw)
        reason = quoted.group(1) if quoted else default_reason
    return round(score, 2), reason
# ── Evaluation functions ──────────────────────────────────────────────────────
def evaluate_faithfulness(question: str, context: str, answer: str) -> dict:
    """Score how well the answer is grounded in the retrieved context.

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Context/answer are truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Évalue la FIDÉLITÉ de la réponse.
La fidélité mesure si la réponse est entièrement fondée sur le contexte fourni.
Question : {question}
Contexte : {context[:2000]}
Réponse : {answer[:1000]}
Note de 0.0 à 1.0 (1.0 = entièrement fondée sur le contexte, 0.0 = totalement hallucinée).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_answer_relevancy(question: str, answer: str) -> dict:
    """Score whether the answer actually addresses the question.

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Answer is truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Évalue la PERTINENCE DE LA RÉPONSE.
La pertinence mesure si la réponse répond directement à la question posée.
Question : {question}
Réponse : {answer[:1000]}
Note de 0.0 à 1.0 (1.0 = répond parfaitement, 0.0 = hors sujet).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_context_recall(question: str, context: str) -> dict:
    """Score whether the retrieved context contains what is needed to answer.

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Context is truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Évalue le RAPPEL DU CONTEXTE.
Mesure si le contexte récupéré contient les informations nécessaires pour répondre à la question.
Question : {question}
Contexte récupéré : {context[:2000]}
Note de 0.0 à 1.0 (1.0 = contexte idéal, 0.0 = contexte inutile).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_hallucination(question: str, context: str, answer: str) -> dict:
    """Score the absence of hallucinated content (1.0 = none, 0.0 = all made up).

    Returns {"score": float, "reason": str, "raw": str}; "raw" is a truncated
    copy of the model output kept for debugging.
    """
    # Context/answer are truncated to keep the prompt within token budget.
    prompt = f"""<s>[INST] Tu es un évaluateur RAG expert. Détecte les HALLUCINATIONS dans la réponse.
Une hallucination = information présente dans la réponse mais ABSENTE du contexte et non-connaissance générale.
Question : {question}
Contexte : {context[:2000]}
Réponse : {answer[:1000]}
Note de 0.0 à 1.0 (1.0 = aucune hallucination, 0.0 = totalement hallucinée).
Réponds UNIQUEMENT avec : {{"score": <float 0.0-1.0>, "reason": "<une phrase>"}} [/INST]
"""
    model_output = _call_hf(prompt)
    metric_score, metric_reason = _parse_result(model_output)
    return {"score": metric_score, "reason": metric_reason, "raw": model_output[:200]}
def evaluate_rag_response(question: str, context: str, answer: str) -> dict:
    """Run all four RAG metrics and aggregate them into an overall score.

    Returns a dict with "overall_score" (0.0-1.0), a letter "grade" (A-F),
    the per-metric "metrics" dict, and a human-readable "summary".
    A metric whose evaluator raises is recorded with error=True and excluded
    from the aggregate instead of aborting the whole evaluation.
    """
    print(f"[RAG EVAL] Démarrage pour : {question[:80]}")
    results: dict[str, dict] = {}
    for key, fn, args in [
        ("faithfulness", evaluate_faithfulness, (question, context, answer)),
        ("answer_relevancy", evaluate_answer_relevancy, (question, answer)),
        ("context_recall", evaluate_context_recall, (question, context)),
        ("hallucination", evaluate_hallucination, (question, context, answer)),
    ]:
        try:
            results[key] = fn(*args)
            print(f"[RAG EVAL] {key}: {results[key]['score']}")
        except Exception as e:
            # Best effort: keep the failure visible but continue evaluating.
            results[key] = {"score": 0.0, "reason": str(e), "error": True}
    weights = {
        "faithfulness": 0.35,
        "answer_relevancy": 0.30,
        "context_recall": 0.20,
        "hallucination": 0.15,
    }
    # Aggregate only over metrics that succeeded, renormalising by the sum of
    # their weights.  Previously a failed metric was dropped from the sum but
    # its weight was not, so a transient API error silently deflated the
    # overall score (and the grade) even when the remaining metrics were high.
    valid = {k: w for k, w in weights.items() if not results[k].get("error")}
    total_weight = sum(valid.values())
    if total_weight:
        overall = round(
            sum(results[k]["score"] * w for k, w in valid.items()) / total_weight, 2
        )
    else:
        overall = 0.0  # every metric failed
    if overall >= 0.85:
        grade = "A"
    elif overall >= 0.70:
        grade = "B"
    elif overall >= 0.55:
        grade = "C"
    elif overall >= 0.40:
        grade = "D"
    else:
        grade = "F"
    print(f"[RAG EVAL] Overall: {overall} ({grade})")
    return {
        "question": question,
        "overall_score": overall,
        "grade": grade,
        "metrics": results,
        "summary": _generate_summary(overall, results),
    }
def _generate_summary(overall: float, results: dict) -> str:
label_map = {
"faithfulness": "Fidélité",
"answer_relevancy": "Pertinence",
"context_recall": "Rappel contexte",
"hallucination": "Hallucination",
}
weak = [label_map[k] for k, v in results.items() if v["score"] < 0.5 and not v.get("error")]
strong = [label_map[k] for k, v in results.items() if v["score"] >= 0.8 and not v.get("error")]
if overall >= 0.85:
verdict = "Excellente réponse RAG."
elif overall >= 0.70:
verdict = "Bonne réponse avec quelques défauts mineurs."
elif overall >= 0.50:
verdict = "Réponse acceptable — qualité du contexte à améliorer."
else:
verdict = "Réponse insuffisante — uploadez des documents plus pertinents."
parts = []
if strong:
parts.append(f"Points forts : {', '.join(strong)}.")
if weak:
parts.append(f"À améliorer : {', '.join(weak)}.")
return verdict + (" " + " ".join(parts) if parts else "")