Upload llm_as_judge.py with huggingface_hub
llm_as_judge.py +58 -0
llm_as_judge.py
ADDED
@@ -0,0 +1,58 @@
from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge based metric class for evaluating correctness.

    Attributes:
        main_score (str): The main score used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The number of instances processed in each bulk.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference on the judge dataset.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Attach each model prediction (and a fixed rating label) to its task data
        # to form the instances the judge recipe will render into prompts.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]

        # Build the judge dataset from the recipe, run the judge model on it,
        # and score its verdicts with the unitxt meta-metric.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
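For orientation, below is a rough sketch of how this metric might be instantiated and called. The import path, the HFPipelineBasedInferenceEngine engine, the model name, and the recipe string are illustrative assumptions and are not part of this commit; they would need to match whatever the local .inference module and your unitxt catalog actually provide.

# Sketch only: all names below are assumptions, not defined by this file.
from unitxt.inference import HFPipelineBasedInferenceEngine  # assumed engine; any InferenceEngine subclass should work
from unitxt.llm_as_judge import LLMAsJudge  # assumed import path for the uploaded module

# Hypothetical judge model and recipe string; substitute a real unitxt recipe
# (card, template, format) that renders rating prompts for the judge.
inference_model = HFPipelineBasedInferenceEngine(
    model_name="google/flan-t5-large", max_new_tokens=32
)
metric = LLMAsJudge(
    recipe="card=cards.example_judge_card,template=templates.example_rating_template",
    inference_model=inference_model,
)

# Inputs follow the BulkInstanceMetric.compute() contract shown in the file above.
scores = metric.compute(
    references=[["Paris"]],
    predictions=["The capital of France is Paris."],
    task_data=[{"question": "What is the capital of France?"}],
)
print(scores)  # e.g. [{"llm_as_judge": <judge-derived score>}]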