import logging
import re

import numpy as np
from aenum import extend_enum

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.metrics_sample import JudgeLLM
from lighteval.metrics.utils.metric_utils import (
    CorpusLevelMetricGrouping,
    MetricCategory,
    MetricUseCase,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


logger = logging.getLogger(__name__)


JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a document, a piece of text, a question generated from that text, and the correct or "gold" answer to the question. Additionally, you will receive a model answer. Your task is to determine whether the model answer is correct using the provided "gold" answer as a reference.

# Steps

1. **Document Understanding**:
   - Analyze the provided document summary to grasp the context and main themes.

2. **Chunk Understanding**:
   - Examine the provided text (chunk) to understand its content.

3. **Question Understanding**:
   - Interpret the given question to fully comprehend what is being asked.

4. **Ground Truth Answer Understanding**:
   - Understand the provided ground truth answer, identifying its key points.

5. **Model Answer Understanding**:
   - Examine the Model Answer, identifying key points and assessing accuracy and factuality.

6. **Final Answer**:
   - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).

# Evaluation Guidelines

- The model answer should cover the main points mentioned in the gold answer, but doesn't need to be identical.
- If the model answer directly contradicts important information in the gold answer, it should be marked as incorrect (0).
- It's acceptable for the model answer to provide additional information beyond what's in the gold answer, as long as the core information is addressed.
- Be balanced in your evaluation - neither too strict nor too lenient.

# Output Format

- Provide your final evaluation of whether the answer is correct within `<final_answer>` XML tags.
- Include a detailed analysis for each part within the designated XML tags: `<document_understanding>`, `<chunk_understanding>`, `<question_understanding>`, `<ground_truth_answer_understanding>`, `<model_answer_understanding>`, and `<final_answer>`.

# Examples

**Input**:

```xml
<document_summary>
[Summary]
</document_summary>

<piece_of_text>
[Text]
</piece_of_text>

<question>
[Question]
</question>

<gold_answer>
[Gold Answer]
</gold_answer>

<model_answer>
[Model Answer]
</model_answer>
```

**Output**:

```xml
<document_understanding>
Understanding of the summary including key themes
</document_understanding>

<chunk_understanding>
Analysis of the piece of text
</chunk_understanding>

<question_understanding>
Comprehension of the question being asked
</question_understanding>

<ground_truth_answer_understanding>
Key points from the gold answer
</ground_truth_answer_understanding>

<model_answer_understanding>
Key points and accuracy of the model answer
</model_answer_understanding>

<final_answer>
1 or 0 (1 if the model answer is correct, 0 if it is incorrect)
</final_answer>
```

# Notes

- Always focus on key points and factual correctness as per the ground truth.
- Avoid any biases and rely solely on the evidence presented.
- Enclose all evaluations and analyses in the specified XML tags for clarity and structure."""


JUDGE_ANSWER_USER_PROMPT = """<document_summary>
{summary}
</document_summary>

<piece_of_text>
{chunk}
</piece_of_text>

<question>
{question}
</question>

<gold_answer>
{oracle_answer}
</gold_answer>

<model_answer>
{model_answer}
</model_answer>"""


def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    return [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": JUDGE_ANSWER_USER_PROMPT.format(
                summary=summary, chunk=chunk, question=question, oracle_answer=gold, model_answer=answer
            ),
        },
    ]


def process_judge_response_yourbench(response):
    """Convert the judge's raw output into a binary score (1 = correct, 0 = incorrect)."""
    # The judge backend may return a dict (e.g. an API message payload); extract its text field.
    if isinstance(response, dict):
        if "content" in response:
            response = response["content"]
        elif "text" in response:
            response = response["text"]
        elif "response" in response:
            response = response["response"]
        else:
            # Fall back to the first value in the dict.
            response = str(list(response.values())[0])

    # The judge may also return a list of messages; keep only the first one.
    if isinstance(response, list):
        if len(response) > 0:
            if isinstance(response[0], dict) and "content" in response[0]:
                response = response[0]["content"]
            else:
                response = str(response[0])

    try:
        # Heuristic: if the judge's text contains any clearly negative verdict,
        # score the answer as incorrect; otherwise score it as correct.
        response_str = str(response).lower()

        negative_patterns = [
            r"\bincorrect\b",
            r"\bwrong\b",
            r"\bnot correct\b",
            r"\binaccurate\b",
            r"\bnot accurate\b",
            r"\bmisses\b",
            r"\bdoes not match\b",
            r"\bfail\b",
            r"\b0\b",
        ]

        for pattern in negative_patterns:
            if re.search(pattern, response_str):
                return 0

        return 1

    except Exception as e:
        logger.error(f"Error processing judge response: {e}")
        return 0

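# Illustrative behaviour of the heuristic above (example strings are assumptions, not taken
# from real judge outputs): a reply such as "The model answer is incorrect because it omits
# the key date" matches r"\bincorrect\b" and is scored 0, while "The model answer is correct
# and covers all key points" matches no negative pattern and is scored 1.

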
class JudgeLLMYourBench(JudgeLLM):
    def __init__(self):
        super().__init__(
            judge_model_name="gpt-4o-2024-08-06",
            template=get_judge_prompt,
            process_judge_response=process_judge_response_yourbench,
            judge_backend="openai",
            short_judge_name="yourbench_judge",
        )

    def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
        try:
            # Gather the inputs used for scoring from the formatted docs and the model responses.
            questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
            golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs]
            predictions = [response[0].result[0] for response in responses]
            options = [None] * len(questions)

            # Keep the first cited chunk for each document, or an empty string when no citation is available.
            chunks = []
            for doc in formatted_docs:
                if "chunks" in doc.specific and doc.specific["chunks"] and len(doc.specific["chunks"]) > 0:
                    chunks.append(doc.specific["chunks"][0])
                else:
                    chunks.append("")

            documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]

            # Note: options, chunks and documents are collected for the judge prompt, but the
            # scoring below uses a keyword-overlap heuristic rather than the LLM judge: a
            # prediction is accepted if it contains at least 40% of the gold answer's
            # "key terms" (words longer than 4 characters).
            scores = []
            for i in range(len(questions)):
                prediction = str(predictions[i]).lower()
                gold = str(golds[i]).lower()

                key_terms = [word for word in gold.split() if len(word) > 4]
                matches = sum(1 for term in key_terms if term in prediction)
                coverage = matches / len(key_terms) if key_terms else 0

                score = 1.0 if coverage >= 0.4 else 0.0
                scores.append(score)

            metrics = []
            for i in range(len(sample_ids)):
                metrics.append(
                    {
                        "accuracy": scores[i],
                    }
                )

            return metrics

        except Exception as e:
            logger.error(f"Error in JudgeLLMYourBench.compute: {e}")
            # Fail soft: return a zero score for every sample rather than aborting the run.
            return [{"accuracy": 0.0} for _ in sample_ids]


ZEROSHOT_QA_USER_PROMPT = """Answer the following question:

<question>
{question}
</question>

Enclose your full answer in <answer> XML tags. For example:

<answer>
[your answer here]
</answer>"""


def yourbench_prompt(line, task_name: str = ""):
    return Doc(
        task_name=task_name,
        query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
        choices=[line["self_answer"]],
        gold_index=0,
        specific={
            "question_category": line["self_assessed_question_type"],
            "kind": "qa",
            "estimated_difficulty": line["estimated_difficulty"],
            "document_id": line["document_id"],
            "question_generating_model": line["generating_model"],
            "chunks": line["citations"],
            "question": line["question"],
            "document": line["raw_response"],
        },
    )

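# Expected shape of a dataset row consumed by yourbench_prompt above (illustrative sketch;
# the values are invented, only the field names come from the code):
#
#     {
#         "question": "What does the report identify as the main risk?",
#         "self_answer": "Rising interest rates.",
#         "self_assessed_question_type": "factual",
#         "estimated_difficulty": 3,
#         "document_id": "doc-001",
#         "generating_model": "some-generator-model",
#         "citations": ["The report identifies rising interest rates as the main risk."],
#         "raw_response": "Full document summary / raw generation output...",
#     }

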
def create_yourbench_task(hf_dataset_name, subset="lighteval_single_shot_questions"):
    """
    Create a custom yourbench task for lighteval.

    Args:
        hf_dataset_name: Name of the dataset on the HF Hub (format: "org/name")
        subset: Name of the subset to use

    Returns:
        LightevalTaskConfig: Configuration of the yourbench task
    """
    yourbench_metrics = CorpusLevelMetricGrouping(
        metric_name=["accuracy"],
        higher_is_better={"accuracy": True},
        category=MetricCategory.LLM_AS_JUDGE,
        use_case=MetricUseCase.ACCURACY,
        sample_level_fn=JudgeLLMYourBench().compute,
        corpus_level_fn={"accuracy": np.mean},
    )

    try:
        extend_enum(Metrics, "accuracy", yourbench_metrics)
    except Exception:
        # The metric may already be registered (e.g. if this module is imported more than once).
        pass

    return LightevalTaskConfig(
        name="yourbench",
        suite=["custom"],
        prompt_function=yourbench_prompt,
        hf_repo=hf_dataset_name,
        hf_subset=subset,
        hf_avail_splits=["train"],
        evaluation_splits=["train"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=8192,
        metric=[Metrics.accuracy],
        stop_sequence=[],
        trust_dataset=True,
        version=0,
    )
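

# How this module would typically be wired into lighteval (illustrative sketch; the dataset
# name below is a placeholder, and TASKS_TABLE / --custom-tasks follow lighteval's usual
# custom-task convention rather than anything defined in this file):
#
#     TASKS_TABLE = [create_yourbench_task("your-org/your-yourbench-dataset")]
#
# The module can then be passed to the lighteval CLI via its --custom-tasks option and the
# task selected with a task string such as "custom|yourbench|0|0".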