Elron committed
Commit cb336b5
1 Parent(s): 0870ca2

Upload metrics.py with huggingface_hub

Files changed (1):
  1. metrics.py +265 -35
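The commit message indicates the file was pushed programmatically with the huggingface_hub client rather than through the web UI. As a rough sketch only (the repo id and token handling below are placeholders, not details taken from this commit page), such an upload typically looks like:

from huggingface_hub import HfApi

# Hypothetical sketch: repo_id is a placeholder, not taken from this page.
api = HfApi()  # picks up the token saved by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="metrics.py",   # local file to push
    path_in_repo="metrics.py",      # destination path inside the repo
    repo_id="<namespace>/<repo>",   # placeholder target repository
    commit_message="Upload metrics.py with huggingface_hub",
)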
metrics.py CHANGED
@@ -1,3 +1,4 @@
+import itertools
 import re
 import string
 import uuid
@@ -49,8 +50,6 @@ def abstract_field():
 
 
 def nan_mean(x):
-    import warnings
-
     with warnings.catch_warnings():
         # final mean should be mean of scores, ignoring NaN, hence nanmean
         # but if the group function values is NaN for ALL values, nanmean throws a
@@ -70,7 +69,6 @@ class UpdateStream(StreamInstanceOperator):
         return instance
 
 
-# TODO: currently we have two classes with this name. metric.Metric and matrics.Metric...
 class Metric(Artifact):
     @property
     @abstractmethod
@@ -115,10 +113,6 @@ class Metric(Artifact):
     def disable_confidence_interval_calculation(self):
         pass
 
-    @abstractmethod
-    def set_n_resamples(self, n_resample):
-        pass
-
 
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
@@ -135,12 +129,7 @@ class MetricWithConfidenceInterval(Metric):
         return np.random.default_rng(hash(get_seed()) & _max_32bit)
 
     def disable_confidence_interval_calculation(self):
-        n = self.n_resamples
         self.n_resamples = None
-        return n
-
-    def set_n_resamples(self, n_resamples):
-        self.n_resamples = n_resamples
 
     def _can_compute_confidence_intervals(self, num_predictions):
         return (
@@ -161,6 +150,17 @@ class MetricWithConfidenceInterval(Metric):
             [instance["score"]["instance"][score_name] for instance in instances]
         )
 
+    @staticmethod
+    def _all_instance_scores_equal(instances, score_name):
+        instance_scores = [
+            instance["score"]["instance"][score_name] for instance in instances
+        ]
+        non_nan_instance_scores = [
+            score for score in instance_scores if score is not np.nan
+        ]
+        num_unique_scores = len(set(non_nan_instance_scores))
+        return num_unique_scores == 1
+
     def score_based_confidence_interval(
         self,
         instances: List[dict],
@@ -197,6 +197,11 @@ class MetricWithConfidenceInterval(Metric):
             # that is, re-form the groups, calculate the function, and take the mean of the group scores
             aggregation_func = self.average_item_scores
         for score_name in score_names:
+            # If all computed instance level scores are the same, there is no point in computing
+            # confidence intervals. So skip to the next score.
+            if self._all_instance_scores_equal(instances, score_name):
+                continue
+
             # need to redefine the statistic function within the loop because score_name is a loop variable
             def statistic(arr, axis, score_name=score_name):
                 # arr is a 2d array where each row is a resampling, so we
@@ -300,13 +305,18 @@ class MetricWithConfidenceInterval(Metric):
         num_predictions = len(predictions)
         if self._can_compute_confidence_intervals(num_predictions=num_predictions):
             identifiers = list(range(num_predictions))
-            ci = bootstrap(
-                (identifiers,),
-                statistic=statistic,
-                n_resamples=self.n_resamples,
-                confidence_level=self.confidence_level,
-                random_state=random_gen,
-            ).confidence_interval
+
+            with warnings.catch_warnings():
+                # Avoid RuntimeWarning in bootstrap computation. This happens on small datasets where
+                # the value of the computed global metric is the same on all resamplings.
+                warnings.simplefilter("ignore", category=RuntimeWarning)
+                ci = bootstrap(
+                    (identifiers,),
+                    statistic=statistic,
+                    n_resamples=self.n_resamples,
+                    confidence_level=self.confidence_level,
+                    random_state=random_gen,
+                ).confidence_interval
             result["score_ci_low"] = ci.low
             result["score_ci_high"] = ci.high
             result[f"{score_name}_ci_low"] = ci.low
@@ -553,7 +563,7 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
     - an 'agg_func' field with value being a 3-element list where
         - 1st element is a string name of the aggregation function (used in naming the CI report)
        - 2nd element is the callable aggregation function
-        - 3rd element is a Boolean indicator of whether, during boostrap CI calculation, the groups are to be sampled as single units.
+        - 3rd element is a Boolean indicator of whether, during bootstrap CI calculation, the groups are to be sampled as single units.
          If True, the group scores are calculated and then resampled. This treats the group units as the unit of
          interest for which the CI is being compared.
          If False, the instances are resampled individually, and the groups determined
@@ -903,11 +913,7 @@ class MetricPipeline(MultiStreamOperator, Metric):
     metric: Metric = None
 
     def disable_confidence_interval_calculation(self):
-        return self.metric.disable_confidence_interval_calculation()
-
-    def set_n_resamples(self, n_resample):
-        if isinstance(self.metric, MetricWithConfidenceInterval):
-            self.metric.set_n_resamples(n_resample)
+        self.metric.disable_confidence_interval_calculation()
 
     def verify(self):
         assert self.main_score is not None, "main_score is not set"
@@ -1092,6 +1098,11 @@ class F1(GlobalMetric):
         self.id_to_str[id] = str
         return self.str_to_id[str]
 
+    def _labels_match_average_format(
+        self, references: List[List[str]], predictions: List[str]
+    ):
+        return True
+
     def compute(
         self,
         references: List[List[str]],
@@ -1101,6 +1112,9 @@ class F1(GlobalMetric):
         assert all(
             len(reference) == 1 for reference in references
         ), "Only a single reference per prediction is allowed in F1 metric"
+        if not self._labels_match_average_format(references, predictions):
+            return {self.main_score: np.nan}
+
         self.str_to_id = {}
         self.id_to_str = {}
         formatted_references = [
@@ -1111,18 +1125,21 @@ class F1(GlobalMetric):
             self.get_str_id(prediction) for prediction in predictions
         ]
         labels = list(set(formatted_references))
+
         result = self._metric.compute(
             predictions=formatted_predictions,
             references=formatted_references,
             labels=labels,
             average=self.average,
         )
-        if isinstance(result["f1"], numpy.ndarray):
-            final_result = {self.main_score: mean(result["f1"])}
+        if isinstance(result[self.metric], numpy.ndarray):
+            final_result = {self.main_score: mean(result[self.metric])}
             for i, label in enumerate(labels):
-                final_result["f1_" + self.id_to_str[label]] = result["f1"][i]
+                final_result[f"{self.metric}_" + self.id_to_str[label]] = result[
+                    self.metric
+                ][i]
         else:
-            final_result = {self.main_score: result["f1"]}
+            final_result = {self.main_score: result[self.metric]}
         return final_result
 
 
@@ -1131,6 +1148,40 @@ class F1Micro(F1):
     average = "micro"
 
 
+class F1Binary(F1):
+    process_single_instances = False
+    main_score = "f1_binary"
+    average = "binary"
+    pos_classes = {"1", "1.0", "yes", "true"}
+
+    def get_str_id(self, str):
+        if str.lower() in self.pos_classes:
+            return 1
+        return 0
+
+    # References and predictions must include up to 2 unique values, one of them in pos_classes
+    def _labels_match_average_format(
+        self, references: List[List[str]], predictions: List[str]
+    ):
+        classes = set(predictions + list(itertools.chain(*references)))
+        n_classes = len(classes)
+        if n_classes > 2:
+            return False
+        if n_classes == 2 and len(set(classes).difference(self.pos_classes)) == 0:
+            return False
+        return True
+
+
+class RecallBinary(F1Binary):
+    main_score = "recall_binary"
+    metric = "recall"
+
+
+class PrecisionBinary(F1Binary):
+    main_score = "precision_binary"
+    metric = "precision"
+
+
 class F1Macro(F1):
     main_score = "f1_macro"
 
@@ -1442,8 +1493,10 @@ class RocAuc(GlobalMetric):
         references = [to_float_or_default(r) for r in references]
         predictions = [to_float_or_default(p) for p in predictions]
 
-        fpr, tpr, thrs = self.roc_curve(y_true=references, y_score=predictions)
-        roc_auc = self.auc(fpr, tpr)
+        false_positive_rates, true_positive_rates, _ = self.roc_curve(
+            y_true=references, y_score=predictions
+        )
+        roc_auc = self.auc(false_positive_rates, true_positive_rates)
         return {self.main_score: roc_auc}
 
 
@@ -1525,7 +1578,7 @@ class CustomF1(GlobalMetric):
 
         assert len(references) == len(predictions), (
             f"references size ({len(references)})"
-            f" doesn't mach predictions sise ({len(references)})."
+            f" doesn't match predictions size ({len(predictions)})."
         )
 
         if self.groups is None:
@@ -1700,7 +1753,7 @@ class SentenceBert(BulkInstanceMetric):
 
     model_name: str
 
-    _requirements_list: List[str] = ["sentence_transformers"]
+    _requirements_list: List[str] = ["sentence_transformers", "torch", "transformers"]
 
     def prepare(self):
         super().prepare()
@@ -1751,7 +1804,7 @@ class Reward(BulkInstanceMetric):
 
    model_name: str
 
-    _requirements_list: List[str] = ["transformers"]
+    _requirements_list: List[str] = ["transformers", "torch"]
 
     def prepare(self):
        super().prepare()
@@ -1782,6 +1835,134 @@ class Reward(BulkInstanceMetric):
         return self.pipe(inputs, batch_size=self.batch_size)
 
 
+class LlamaIndexCorrectness(InstanceMetric):
+    """LlamaIndex based metric class for evaluating correctness.
+
+    Attributes:
+        reduction_map (dict): A dictionary specifying the reduction method for the metric.
+        main_score (str): The main score used for evaluation.
+        _requirements_list (List[str]): A list specifying any additional requirements for the metric.
+
+    Methods:
+        prepare(self): Initialization method for the metric.
+        compute(self, references, predictions, additional_inputs): Method to compute the metric.
+
+    Usage:
+        metric = LlamaIndexCorrectnessMetric()
+        scores = metric.compute(references, prediction, additional_inputs)
+    """
+
+    model_name: str = ""
+    main_score: str = ""
+
+    reduction_map: Dict[str, List[str]] = None
+    openai_models: List[str] = ["gpt-3.5-turbo"]
+    anthropic_models: List[
+        str
+    ] = []  # this is here for the sake of documentation for future models
+    mock_models: List[str] = ["mock"]
+    external_api_models = openai_models + anthropic_models
+
+    _requirements_list: List[str] = ["llama_index"]
+
+    @staticmethod
+    def _custom_parser(eval_response: str):
+        """Default parser function for evaluation response.
+
+        Args:
+            eval_response (str): The response string from the evaluation.
+
+        Returns:
+            Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
+        """
+        score_str = eval_response.split("\n")[0]
+        reasoning_str = "\n".join(eval_response.split("\n")[1:])
+        score = float(score_str)
+        reasoning = reasoning_str.lstrip("\n")
+        return score, reasoning
+
+    def _model_using_external_api(self):
+        return self.model_name in self.external_api_models
+
+    def prepare(self):
+        """Initialization method for the metric. Initializes the CorrectnessEvaluator with the OpenAI model."""
+        super().prepare()
+
+        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
+        self.main_score: str = (
+            f"correctness_llama_index_by_{self.model_name_normalized}_judge"
+        )
+
+        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
+
+        from llama_index.core.evaluation import CorrectnessEvaluator
+
+        if self.model_name in self.openai_models:
+            from llama_index.llms.openai import OpenAI
+
+            llm = OpenAI("gpt-3.5-turbo")
+        elif self.model_name in self.mock_models:
+            from llama_index.core.llms.mock import MockLLM
+
+            llm = MockLLM(system_prompt="5")  # perfect score
+        else:
+            raise NotImplementedError(
+                f"LlamaIndexCorrectnessMetric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
+            )
+
+        self.evaluator = CorrectnessEvaluator(
+            llm=llm, parser_function=self._custom_parser
+        )
+
+    def compute(
+        self,
+        references: List[str],
+        prediction: str,
+        task_data: Dict,
+    ) -> Dict[str, Any]:
+        """Method to compute the correctness metric.
+
+        Args:
+            references (List[str]): List of reference instances.
+            prediction (str): List of predicted instances.
+            task_data (Dict): List of additional input data.
+
+        Returns:
+            Dict[str, Any]: List of computed scores and feedback.
+
+        Raises:
+            AssertionError: If the input does not meet the expected format.
+        """
+        # treat the references as the questions and the predictions as answers
+        # assume a single reference
+
+        assert (
+            not self._model_using_external_api()
+            or settings.allow_passing_data_to_remote_api
+        ), f"Cannot send data to remote APIs ({self.model_name}) when unitxt.settings.allow_passing_data_to_remote_api=False. Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this."
+
+        query = task_data["question"]
+        contexts = task_data["contexts"]
+
+        per_reference_results = []
+        for reference_response in references:
+            per_reference_results.append(
+                self.evaluator.evaluate(
+                    query=query,
+                    response=prediction,
+                    contexts=contexts,
+                    reference=reference_response,
+                )
+            )
+        result = max([results.score for results in per_reference_results])
+
+        return {
+            self.main_score: result / 5,
+            # "score_name": self.main_score,
+            # "feedback": result.feedback,  # removed since this cannot be tested
+        }
+
+
 class Perplexity(BulkInstanceMetric):
     """Computes the likelihood of generating text Y after text X - P(Y|X)."""
 
@@ -1793,7 +1974,7 @@ class Perplexity(BulkInstanceMetric):
     batch_size: int = 32
     model_name: str
 
-    _requirements_list: List[str] = ["transformers"]
+    _requirements_list: List[str] = ["transformers", "torch"]
 
     def compute(
         self,
@@ -2904,3 +3085,52 @@ class FixedGroupAbsvalNormHedgesGParaphraseStringContainment(StringContainment):
             ],
         }
     }
+
+
+class BinaryMaxF1(F1Binary):
+    main_score = "max_f1_binary"
+
+    def compute(
+        self,
+        references: List[List[str]],
+        predictions: List[List[str]],
+        task_data: List[Dict],
+    ) -> dict:
+        assert all(
+            len(reference) == 1 for reference in references
+        ), "Only a single reference per prediction is allowed in F1 metric"
+        classes = set(itertools.chain(*references))
+        n_classes = len(classes)
+        assert len(classes) <= 2, "References of BinaryMaxF1 must be binary"
+        pos_classes = classes.intersection(self.pos_classes)
+        neg_classes = classes.difference(self.pos_classes)
+        n_pos_classes = len(pos_classes)
+        if n_classes == 2:
+            assert (
+                n_pos_classes == 1
+            ), "Only one positive class is allowed in BinaryMaxF1"
+        pos_class = next(iter(pos_classes)) if n_pos_classes > 0 else "1.0"
+        neg_class = next(iter(neg_classes)) if len(neg_classes) > 0 else "0.0"
+
+        float_predictions = []
+        for prediction in predictions:
+            try:
+                float_predictions.append(float(prediction))
+            except Exception:
+                float_predictions.append(0)
+
+        best_thr = -1
+        best_f1 = -1
+        for thr in set(float_predictions):
+            new_predictions = [
+                pos_class if float_prediction >= thr else neg_class
+                for float_prediction in float_predictions
+            ]
+            f1 = super().compute(references, new_predictions, task_data)[
+                self.main_score
+            ]
+            if f1 > best_f1:
+                best_f1 = f1
+                best_thr = thr
+
+        return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
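For readers skimming the diff, the new BinaryMaxF1 metric sweeps every predicted score as a candidate decision threshold and reports the best binary F1 it finds. A standalone sketch of that idea, using illustrative names and scikit-learn directly rather than unitxt's metric classes:

from typing import List, Tuple

from sklearn.metrics import f1_score


def max_f1_over_thresholds(labels: List[int], scores: List[float]) -> Tuple[float, float]:
    # Illustrative helper (not part of metrics.py): try every predicted score as a
    # decision threshold and keep the one that maximizes binary F1.
    best_f1, best_thr = -1.0, -1.0
    for thr in set(scores):
        preds = [1 if score >= thr else 0 for score in scores]
        f1 = f1_score(labels, preds, average="binary", pos_label=1, zero_division=0)
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return best_f1, best_thr


# Example: with these scores, the threshold 0.65 separates the classes perfectly.
print(max_f1_over_thresholds([1, 0, 1, 1, 0], [0.9, 0.4, 0.65, 0.7, 0.2]))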