Minseok Bae committed
Commit 404587d
1 Parent(s): 3b66490

Edited README and removed error-rate metric

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: HEM Leaderboard
+title: H2EM Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
main_backend.py CHANGED
@@ -35,11 +35,13 @@ def run_auto_eval():
         hf_repo_results=envs.RESULTS_REPO,
         local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
     )
-
+    logging.info("Checked completed evals")
     eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
                                                       hf_repo=envs.QUEUE_REPO,
                                                       local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+    logging.info("Got eval requests")
     eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+    logging.info("Sorted eval requests")

     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

@@ -57,6 +59,7 @@ def run_auto_eval():
         hf_repo=envs.QUEUE_REPO,
         local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
     )
+    logging.info("Set eval request to running, now running eval")

     run_eval_suite.run_evaluation(
         eval_request=eval_request,
@@ -66,6 +69,7 @@ def run_auto_eval():
         device=envs.DEVICE,
         no_cache=True,
     )
+    logging.info("Eval finished, now setting status to finished")


 if __name__ == "__main__":
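The `logging.info` calls added here assume the standard-library `logging` module is imported and configured in `main_backend.py`. A minimal sketch of a configuration under which these messages would appear; the format string is an arbitrary choice, not taken from the repository:

```python
import logging

# Minimal sketch only; the real module may configure logging differently.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)

logging.info("Checked completed evals")  # same call pattern as the lines added above
```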
src/backend/evaluate_model.py CHANGED
@@ -75,20 +75,23 @@ class Evaluator:

             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
-            error_rate = self.summary_generator.error_rate
+            # error_rate = self.summary_generator.error_rate

             hallucination_scores = self.eval_model.evaluate_hallucination(
                 generated_summaries_df)
-            accuracy = self.eval_model.compute_accuracy()
+            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate

             results = util.format_results(model_name=self.model, revision=self.revision,
-                                          precision=self.precision, accuracy=accuracy,
-                                          hallucination_rate=hallucination_rate, answer_rate=answer_rate,
-                                          avg_summary_len=avg_summary_len, error_rate=error_rate)
+                                          precision=self.precision,
+                                          factual_consistency_rate=factual_consistency_rate,
+                                          hallucination_rate=hallucination_rate,
+                                          answer_rate=answer_rate,
+                                          avg_summary_len=avg_summary_len)

             return results
         except FileNotFoundError:
+            # logging.error(f"File not found: {envs.SOURCE_PATH}")
             logging.error(f"File not found: {envs.SAMPLE_DATASET_PATH}")
             raise
         except Exception as e:
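The call order above matters: `compute_factual_consistency_rate()` reads the scores that `evaluate_hallucination()` populates and raises `ValueError` otherwise. A hedged sketch of that contract; the `EvaluationModel` constructor argument and model path are assumptions, and `generated_summaries_df` stands in for the dataframe produced by the summary generator:

```python
# Sketch of the ordering contract (constructor signature and model path assumed).
eval_model = EvaluationModel(model_path="vectara/hallucination_evaluation_model")

try:
    eval_model.compute_factual_consistency_rate()   # scores not populated yet
except ValueError as err:
    print(f"expected failure: {err}")

eval_model.evaluate_hallucination(generated_summaries_df)  # fills eval_model.scores
rate = eval_model.compute_factual_consistency_rate()       # now succeeds
```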
src/backend/manage_requests.py CHANGED
@@ -9,9 +9,10 @@ from huggingface_hub import HfApi, snapshot_download
 @dataclass
 class EvalRequest:
     model: str
-    private: bool
+    # private: bool
     status: str
     json_filepath: str
+    private: bool = False
     weight_type: str = "Original"
     model_type: str = "" # pretrained, finetuned, with RL
     precision: str = "" # float16, bfloat16
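Moving `private` below the required fields and giving it a `False` default makes the flag optional while keeping the dataclass valid (fields without defaults cannot follow fields that have them). A minimal sketch of the effect, assuming the import path mirrors the repository layout and using a made-up queue payload:

```python
from src.backend.manage_requests import EvalRequest  # path per this repository layout

# Hypothetical queue entry with no "private" key.
payload = {
    "model": "org/model-name",
    "status": "PENDING",
    "json_filepath": "eval-queue/org/model-name_eval_request.json",
    "precision": "float16",
}

request = EvalRequest(**payload)
print(request.private)  # -> False, from the new default
```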
src/backend/model_operations.py CHANGED
@@ -111,7 +111,7 @@ class SummaryGenerator:
                                          columns=["source", "summary", "dataset"])
         self._compute_avg_length()
         self._compute_answer_rate()
-        self._compute_error_rate(error_count)
+        # self._compute_error_rate(error_count)

         return self.summaries_df

@@ -140,13 +140,13 @@ class SummaryGenerator:

         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows

-    def _compute_error_rate(self, count):
-        """
-        Compute the error rate of summaries.
-        """
-        total_rows = len(self.summaries_df)
+    # def _compute_error_rate(self, count):
+    #     """
+    #     Compute the error rate of summaries.
+    #     """
+    #     total_rows = len(self.summaries_df)

-        self.error_rate = 0 if total_rows == 0 else count / total_rows
+    #     self.error_rate = 0 if total_rows == 0 else count / total_rows


 class EvaluationModel:
@@ -168,7 +168,7 @@ class EvaluationModel:
         """
         self.model = load_evaluation_model(model_path)
         self.scores = []
-        self.accuracy = None
+        self.factual_consistency_rate = None
         self.hallucination_rate = None

     def evaluate_hallucination(self, summaries_df):
@@ -192,15 +192,15 @@ class EvaluationModel:
             logging.error(f"Error evaluating hallucination: {e}")
             raise

-    def compute_accuracy(self, threshold=0.5):
+    def compute_factual_consistency_rate(self, threshold=0.5):
         """
-        Compute the accuracy of the evaluated summaries based on the previously calculated scores.
-        This method relies on the 'scores' attribute being populated, typically via the
-        'evaluate_hallucination' method.
+        Compute the factual consistency rate of the evaluated summaries based on
+        the previously calculated scores. This method relies on the 'scores'
+        attribute being populated, typically via the 'evaluate_hallucination' method.

         Returns:
-            float: Accuracy percentage. Also updates the 'accuracy' and 'hallucination_rate'
-            attributes of the instance.
+            float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
+            and 'hallucination_rate' attributes of the instance.

         Raises:
             ValueError: If scores have not been calculated prior to calling this method.
@@ -210,15 +210,15 @@ class EvaluationModel:
             logging.error(error_msg)
             raise ValueError(error_msg)

-        # Use threshold of 0.5 to compute accuracy
+        # Use threshold of 0.5 to compute factual_consistency_rate
        num_above_threshold = sum(score >= threshold for score in self.scores)
         num_total = len(self.scores)

         if not num_total:
-            raise ValueError("No scores available to compute accuracy.")
+            raise ValueError("No scores available to compute factual consistency rate.")

-        self.accuracy = (num_above_threshold / num_total) * 100
-        self.hallucination_rate = 100 - self.accuracy
+        self.factual_consistency_rate = (num_above_threshold / num_total) * 100
+        self.hallucination_rate = 100 - self.factual_consistency_rate

-        return self.accuracy
+        return self.factual_consistency_rate
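The renamed method keeps the same arithmetic as before: scores at or above the 0.5 threshold count as factually consistent, and the hallucination rate is the complement in percentage points. A standalone restatement of that computation with made-up scores:

```python
# Standalone restatement of compute_factual_consistency_rate's core logic;
# the scores are illustrative, not real H2EM output.
threshold = 0.5
scores = [0.92, 0.41, 0.77, 0.63, 0.18]

num_above_threshold = sum(score >= threshold for score in scores)
num_total = len(scores)

factual_consistency_rate = (num_above_threshold / num_total) * 100  # 60.0
hallucination_rate = 100 - factual_consistency_rate                 # 40.0
```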
 
src/backend/util.py CHANGED
@@ -17,9 +17,9 @@ def generate_prompt(source_passage: str) -> str:
     """


-def format_results(model_name: str, revision: str, precision: str, accuracy: float,
-                   hallucination_rate: float, answer_rate: float, avg_summary_len: float,
-                   error_rate: float) -> dict:
+def format_results(model_name: str, revision: str, precision: str,
+                   factual_consistency_rate: float, hallucination_rate: float,
+                   answer_rate: float, avg_summary_len: float) -> dict:
     """
     Formats the evaluation results into a structured dictionary.

@@ -27,11 +27,10 @@ def format_results(model_name: str, revision: str, precision: str, accuracy: flo
         model_name (str): The name of the evaluated model.
         revision (str): The revision hash of the model.
         precision (str): The precision with which the evaluation was run.
-        accuracy (float): The accuracy score from the evaluation.
-        hallucination_rate (float): The hallucination rate from the evaluation.
-        answer_rate (float): The answer rate from the evaluation.
-        avg_summary_len (float): The average summary length from the evaluation.
-        error_rate (float): The rate at which errors occurred during summary generation.
+        factual_consistency_rate (float): The factual consistency rate.
+        hallucination_rate (float): The hallucination rate.
+        answer_rate (float): The answer rate.
+        avg_summary_len (float): The average summary length.

     Returns:
         dict: A dictionary containing the structured evaluation results.
@@ -43,21 +42,18 @@ def format_results(model_name: str, revision: str, precision: str, accuracy: flo
             "model_sha": revision # Hash of the model
         },
         "results": {
-            "accuracy": {
-                "accuracy": accuracy
-            },
             "hallucination_rate": {
                 "hallucination_rate": hallucination_rate
             },
+            "factual_consistency_rate": {
+                "factual_consistency_rate": factual_consistency_rate
+            },
             "answer_rate": {
                 "answer_rate": answer_rate
             },
             "average_summary_length": {
                 "average_summary_length": avg_summary_len
             },
-            "error_rate": {
-                "error_rate": error_rate
-            }
         }
     }
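With the new signature, each metric is nested under its own key in the `results` block. An illustrative sketch of the returned shape (values invented, and only keys visible in this diff are shown), plus the lookup pattern the leaderboard's task keys presumably use:

```python
# Illustrative shape of the dictionary returned after this commit; values are made up.
formatted = {
    "config": {
        "model_sha": "abc1234",  # revision hash (example value)
    },
    "results": {
        "hallucination_rate": {"hallucination_rate": 21.3},
        "factual_consistency_rate": {"factual_consistency_rate": 78.7},
        "answer_rate": {"answer_rate": 0.98},
        "average_summary_length": {"average_summary_length": 74.2},
    },
}

print(formatted["results"]["factual_consistency_rate"]["factual_consistency_rate"])  # 78.7
```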
 
src/display/about.py CHANGED
@@ -10,26 +10,44 @@ class Task:

 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    accuracy = Task("accuracy", "accuracy", "Accuracy")
     hallucination_rate = Task("hallucination_rate",
                               "hallucination_rate", "Hallucination Rate")
+    accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
     average_summary_length = Task("average_summary_length",
                                   "average_summary_length", "Average Summary Length")
-    error_rate = Task("error_rate", "error_rate", "Error Rate")

+
+
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation (H2EM) Model leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This Leaderboard evaluates how much easy LLM hallucinates in factual summarization.
+This leaderboard evaluates how often an LLM introduces hallucinations when summarizing a document.
+
+
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = """
 ## How it works

+Using Vectara's H2EM (Hughes Hallucination Evaluation Model), we evaluate how often an LLM introduces hallucinations when summarizing a document.
+
+The model card for H2EM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
+Given a document and a summary generated by an LLM, H2EM outputs a hallucination score between 0 and 1, where 0 means hallucination and 1 indicates no hallucination, or perfect factual consistency with the document.
+
+Our evaluation dataset is composed of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
+We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
+
+## Understand each metric
+### - Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
+### - Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
+### - Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
+### - Average Summary Length: The average number of words in the generated summaries
+
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
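A tiny worked example of the metrics described above, using made-up summaries and scores. Note that in `src/backend/model_operations.py` both rates are percentages (factual consistency rate = 100 - hallucination rate) and `answer_rate` is stored as a fraction; treating empty summaries as unscored and counting words with a plain split are assumptions for illustration:

```python
# Made-up summaries and scores, for illustration only.
summaries = ["The city council approved the budget.", "", "Rain is expected on Friday."]
scores = [0.91, None, 0.34]   # assuming the empty summary receives no score

answered = [s for s in summaries if s.strip()]
answer_rate = len(answered) / len(summaries)                # ~0.67 (stored as a fraction)

avg_summary_length = sum(len(s.split()) for s in answered) / len(answered)   # 5.5 words

kept = [x for x in scores if x is not None]
factual_consistency_rate = sum(x >= 0.5 for x in kept) / len(kept) * 100     # 50.0
hallucination_rate = 100 - factual_consistency_rate                          # 50.0
```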
 
src/display/utils.py CHANGED
@@ -30,21 +30,6 @@ auto_eval_column_dict.append(["model", ColumnContent,
                              ColumnContent("Model", "markdown", True, never_hidden=True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# # Accuracy
-# auto_eval_column_dict.append(["accuracy", ColumnContent,
-#                               ColumnContent("Accuracy", "number", True)])
-# # Hallucination Rate
-# auto_eval_column_dict.append(["hallucination_rate", ColumnContent,
-#                               ColumnContent("Hallucination Rate", "number", True)])
-# # Answer Rate
-# auto_eval_column_dict.append(["answer_rate", ColumnContent,
-#                               ColumnContent("Answer Rate", "number", True)])
-# # Average Summary Length
-# auto_eval_column_dict.append(["average_summary_length", ColumnContent,
-#                               ColumnContent("Average Summary Length", "number", True)])
-# # Error Rate
-# auto_eval_column_dict.append(["error_rate", ColumnContent,
-#                               ColumnContent("Error Rate", "number", True)])

 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
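Since the metric columns are now generated entirely from the `Tasks` enum, the hand-written column definitions removed above are redundant. A quick sketch of what the loop picks up after the `about.py` change (import path assumed from the repository layout):

```python
from src.display.about import Tasks  # path per this repository layout

for task in Tasks:
    print(task.name, "->", task.value.col_name)
# hallucination_rate -> Hallucination Rate
# accuracy -> Factual Consistency Rate   (the enum member keeps its old name)
# answer_rate -> Answer Rate
# average_summary_length -> Average Summary Length
```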