Minseok Bae committed
Commit 404587d
1 Parent(s): 3b66490

Edited README and removed error-rate metric

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: HEM Leaderboard
+title: H2EM Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
main_backend.py CHANGED
@@ -35,11 +35,13 @@ def run_auto_eval():
         hf_repo_results=envs.RESULTS_REPO,
         local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
     )
-
+    logging.info("Checked completed evals")
     eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
                                                       hf_repo=envs.QUEUE_REPO,
                                                       local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+    logging.info("Got eval requests")
     eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+    logging.info("Sorted eval requests")

     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

@@ -57,6 +59,7 @@ def run_auto_eval():
         hf_repo=envs.QUEUE_REPO,
         local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
     )
+    logging.info("Set eval request to running, now running eval")

     run_eval_suite.run_evaluation(
         eval_request=eval_request,
@@ -66,6 +69,7 @@ def run_auto_eval():
         device=envs.DEVICE,
         no_cache=True,
     )
+    logging.info("Eval finished, now setting status to finished")


 if __name__ == "__main__":
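The `logging.info` calls added here assume the standard-library `logging` module is imported and configured in `main_backend.py`. A minimal sketch of a configuration under which these messages would appear; the format string is an arbitrary choice, not taken from the repository:

```python
import logging

# Minimal sketch only; the real module may configure logging differently.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)

logging.info("Checked completed evals")  # same call pattern as the lines added above
```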
src/backend/evaluate_model.py CHANGED
@@ -75,20 +75,23 @@ class Evaluator:

             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
-            error_rate = self.summary_generator.error_rate
+            # error_rate = self.summary_generator.error_rate

             hallucination_scores = self.eval_model.evaluate_hallucination(
                 generated_summaries_df)
-            accuracy = self.eval_model.compute_accuracy()
+            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate

             results = util.format_results(model_name=self.model, revision=self.revision,
-                                          precision=self.precision, accuracy=accuracy,
-                                          hallucination_rate=hallucination_rate, answer_rate=answer_rate,
-                                          avg_summary_len=avg_summary_len, error_rate=error_rate)
+                                          precision=self.precision,
+                                          factual_consistency_rate=factual_consistency_rate,
+                                          hallucination_rate=hallucination_rate,
+                                          answer_rate=answer_rate,
+                                          avg_summary_len=avg_summary_len)

             return results
         except FileNotFoundError:
+            # logging.error(f"File not found: {envs.SOURCE_PATH}")
             logging.error(f"File not found: {envs.SAMPLE_DATASET_PATH}")
             raise
         except Exception as e:
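The call order above matters: `compute_factual_consistency_rate()` reads the scores that `evaluate_hallucination()` populates and raises `ValueError` otherwise. A hedged sketch of that contract; the `EvaluationModel` constructor argument and model path are assumptions, and `generated_summaries_df` stands in for the dataframe produced by the summary generator:

```python
# Sketch of the ordering contract (constructor signature and model path assumed).
eval_model = EvaluationModel(model_path="vectara/hallucination_evaluation_model")

try:
    eval_model.compute_factual_consistency_rate()   # scores not populated yet
except ValueError as err:
    print(f"expected failure: {err}")

eval_model.evaluate_hallucination(generated_summaries_df)  # fills eval_model.scores
rate = eval_model.compute_factual_consistency_rate()       # now succeeds
```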
src/backend/manage_requests.py CHANGED
@@ -9,9 +9,10 @@ from huggingface_hub import HfApi, snapshot_download
 @dataclass
 class EvalRequest:
     model: str
-    private: bool
+    # private: bool
     status: str
     json_filepath: str
+    private: bool = False
     weight_type: str = "Original"
     model_type: str = "" # pretrained, finetuned, with RL
     precision: str = "" # float16, bfloat16
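Moving `private` below the required fields and giving it a `False` default makes the flag optional while keeping the dataclass valid (fields without defaults cannot follow fields that have them). A minimal sketch of the effect, assuming the import path mirrors the repository layout and using a made-up queue payload:

```python
from src.backend.manage_requests import EvalRequest  # path per this repository layout

# Hypothetical queue entry with no "private" key.
payload = {
    "model": "org/model-name",
    "status": "PENDING",
    "json_filepath": "eval-queue/org/model-name_eval_request.json",
    "precision": "float16",
}

request = EvalRequest(**payload)
print(request.private)  # -> False, from the new default
```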
src/backend/model_operations.py CHANGED
@@ -111,7 +111,7 @@ class SummaryGenerator:
                                          columns=["source", "summary", "dataset"])
         self._compute_avg_length()
         self._compute_answer_rate()
-        self._compute_error_rate(error_count)
+        # self._compute_error_rate(error_count)

         return self.summaries_df

@@ -140,13 +140,13 @@ class SummaryGenerator:

         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows

-    def _compute_error_rate(self, count):
-        """
-        Compute the error rate of summaries.
-        """
-        total_rows = len(self.summaries_df)
+    # def _compute_error_rate(self, count):
+    #     """
+    #     Compute the error rate of summaries.
+    #     """
+    #     total_rows = len(self.summaries_df)

-        self.error_rate = 0 if total_rows == 0 else count / total_rows
+    #     self.error_rate = 0 if total_rows == 0 else count / total_rows


 class EvaluationModel:
@@ -168,7 +168,7 @@ class EvaluationModel:
         """
         self.model = load_evaluation_model(model_path)
         self.scores = []
-        self.accuracy = None
+        self.factual_consistency_rate = None
         self.hallucination_rate = None

     def evaluate_hallucination(self, summaries_df):
@@ -192,15 +192,15 @@ class EvaluationModel:
             logging.error(f"Error evaluating hallucination: {e}")
             raise

-    def compute_accuracy(self, threshold=0.5):
+    def compute_factual_consistency_rate(self, threshold=0.5):
         """
-        Compute the accuracy of the evaluated summaries based on the previously calculated scores.
-        This method relies on the 'scores' attribute being populated, typically via the
-        'evaluate_hallucination' method.
+        Compute the factual consistency rate of the evaluated summaries based on
+        the previously calculated scores. This method relies on the 'scores'
+        attribute being populated, typically via the 'evaluate_hallucination' method.

         Returns:
-            float: Accuracy percentage. Also updates the 'accuracy' and 'hallucination_rate'
-            attributes of the instance.
+            float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
+            and 'hallucination_rate' attributes of the instance.

         Raises:
             ValueError: If scores have not been calculated prior to calling this method.
@@ -210,15 +210,15 @@ class EvaluationModel:
             logging.error(error_msg)
             raise ValueError(error_msg)

-        # Use threshold of 0.5 to compute accuracy
+        # Use threshold of 0.5 to compute factual_consistency_rate
        num_above_threshold = sum(score >= threshold for score in self.scores)
         num_total = len(self.scores)

         if not num_total:
-            raise ValueError("No scores available to compute accuracy.")
+            raise ValueError("No scores available to compute factual consistency rate.")

-        self.accuracy = (num_above_threshold / num_total) * 100
-        self.hallucination_rate = 100 - self.accuracy
+        self.factual_consistency_rate = (num_above_threshold / num_total) * 100
+        self.hallucination_rate = 100 - self.factual_consistency_rate

-        return self.accuracy
+        return self.factual_consistency_rate
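The renamed method keeps the same arithmetic as before: scores at or above the 0.5 threshold count as factually consistent, and the hallucination rate is the complement in percentage points. A standalone restatement of that computation with made-up scores:

```python
# Standalone restatement of compute_factual_consistency_rate's core logic;
# the scores are illustrative, not real H2EM output.
threshold = 0.5
scores = [0.92, 0.41, 0.77, 0.63, 0.18]

num_above_threshold = sum(score >= threshold for score in scores)
num_total = len(scores)

factual_consistency_rate = (num_above_threshold / num_total) * 100  # 60.0
hallucination_rate = 100 - factual_consistency_rate                 # 40.0
```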
 
src/backend/util.py CHANGED
@@ -17,9 +17,9 @@ def generate_prompt(source_passage: str) -> str:
     """


-def format_results(model_name: str, revision: str, precision: str, accuracy: float,
-                   hallucination_rate: float, answer_rate: float, avg_summary_len: float,
-                   error_rate: float) -> dict:
+def format_results(model_name: str, revision: str, precision: str,
+                   factual_consistency_rate: float, hallucination_rate: float,
+                   answer_rate: float, avg_summary_len: float) -> dict:
     """
     Formats the evaluation results into a structured dictionary.

@@ -27,11 +27,10 @@ def format_results(model_name: str, revision: str, precision: str, accuracy: flo
         model_name (str): The name of the evaluated model.
         revision (str): The revision hash of the model.
         precision (str): The precision with which the evaluation was run.
-        accuracy (float): The accuracy score from the evaluation.
-        hallucination_rate (float): The hallucination rate from the evaluation.
-        answer_rate (float): The answer rate from the evaluation.
-        avg_summary_len (float): The average summary length from the evaluation.
-        error_rate (float): The rate at which errors occurred during summary generation.
+        factual_consistency_rate (float): The factual consistency rate.
+        hallucination_rate (float): The hallucination rate.
+        answer_rate (float): The answer rate.
+        avg_summary_len (float): The average summary length.

     Returns:
         dict: A dictionary containing the structured evaluation results.
@@ -43,21 +42,18 @@ def format_results(model_name: str, revision: str, precision: str, accuracy: flo
             "model_sha": revision # Hash of the model
         },
         "results": {
-            "accuracy": {
-                "accuracy": accuracy
-            },
             "hallucination_rate": {
                 "hallucination_rate": hallucination_rate
             },
+            "factual_consistency_rate": {
+                "factual_consistency_rate": factual_consistency_rate
+            },
             "answer_rate": {
                 "answer_rate": answer_rate
             },
             "average_summary_length": {
                 "average_summary_length": avg_summary_len
             },
-            "error_rate": {
-                "error_rate": error_rate
-            }
         }
     }
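With the new signature, each metric is nested under its own key in the `results` block. An illustrative sketch of the returned shape (values invented, and only keys visible in this diff are shown), plus the lookup pattern the leaderboard's task keys presumably use:

```python
# Illustrative shape of the dictionary returned after this commit; values are made up.
formatted = {
    "config": {
        "model_sha": "abc1234",  # revision hash (example value)
    },
    "results": {
        "hallucination_rate": {"hallucination_rate": 21.3},
        "factual_consistency_rate": {"factual_consistency_rate": 78.7},
        "answer_rate": {"answer_rate": 0.98},
        "average_summary_length": {"average_summary_length": 74.2},
    },
}

print(formatted["results"]["factual_consistency_rate"]["factual_consistency_rate"])  # 78.7
```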
 
src/display/about.py CHANGED
@@ -10,26 +10,44 @@ class Task:

 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    accuracy = Task("accuracy", "accuracy", "Accuracy")
     hallucination_rate = Task("hallucination_rate",
                               "hallucination_rate", "Hallucination Rate")
+    accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
     average_summary_length = Task("average_summary_length",
                                   "average_summary_length", "Average Summary Length")
-    error_rate = Task("error_rate", "error_rate", "Error Rate")

+
+
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation (H2EM) Model leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This Leaderboard evaluates how much easy LLM hallucinates in factual summarization.
+This leaderboard evaluates how often an LLM introduces hallucinations when summarizing a document.
+
+
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = """
 ## How it works

+Using Vectara's H2EM (Hughes Hallucination Evaluation Model), we evaluate how often an LLM introduces hallucinations when summarizing a document.
+
+The model card for H2EM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
+Given a document and a summary generated by an LLM, H2EM outputs a hallucination score between 0 and 1, where 0 means hallucination and 1 indicates no hallucination, or perfect factual consistency with the document.
+
+Our evaluation dataset is composed of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
+We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
+
+## Understand each metric
+### - Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
+### - Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
+### - Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
+### - Average Summary Length: The average number of words in the generated summaries
+
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
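A tiny worked example of the metrics described above, using made-up summaries and scores. Note that in `src/backend/model_operations.py` both rates are percentages (factual consistency rate = 100 - hallucination rate) and `answer_rate` is stored as a fraction; treating empty summaries as unscored and counting words with a plain split are assumptions for illustration:

```python
# Made-up summaries and scores, for illustration only.
summaries = ["The city council approved the budget.", "", "Rain is expected on Friday."]
scores = [0.91, None, 0.34]   # assuming the empty summary receives no score

answered = [s for s in summaries if s.strip()]
answer_rate = len(answered) / len(summaries)                # ~0.67 (stored as a fraction)

avg_summary_length = sum(len(s.split()) for s in answered) / len(answered)   # 5.5 words

kept = [x for x in scores if x is not None]
factual_consistency_rate = sum(x >= 0.5 for x in kept) / len(kept) * 100     # 50.0
hallucination_rate = 100 - factual_consistency_rate                          # 50.0
```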
 
src/display/utils.py CHANGED
@@ -30,21 +30,6 @@ auto_eval_column_dict.append(["model", ColumnContent,
                              ColumnContent("Model", "markdown", True, never_hidden=True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# # Accuracy
-# auto_eval_column_dict.append(["accuracy", ColumnContent,
-#                               ColumnContent("Accuracy", "number", True)])
-# # Hallucination Rate
-# auto_eval_column_dict.append(["hallucination_rate", ColumnContent,
-#                               ColumnContent("Hallucination Rate", "number", True)])
-# # Answer Rate
-# auto_eval_column_dict.append(["answer_rate", ColumnContent,
-#                               ColumnContent("Answer Rate", "number", True)])
-# # Average Summary Length
-# auto_eval_column_dict.append(["average_summary_length", ColumnContent,
-#                               ColumnContent("Average Summary Length", "number", True)])
-# # Error Rate
-# auto_eval_column_dict.append(["error_rate", ColumnContent,
-#                               ColumnContent("Error Rate", "number", True)])

 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
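Since the metric columns are now generated entirely from the `Tasks` enum, the hand-written column definitions removed above are redundant. A quick sketch of what the loop picks up after the `about.py` change (import path assumed from the repository layout):

```python
from src.display.about import Tasks  # path per this repository layout

for task in Tasks:
    print(task.name, "->", task.value.col_name)
# hallucination_rate -> Hallucination Rate
# accuracy -> Factual Consistency Rate   (the enum member keeps its old name)
# answer_rate -> Answer Rate
# average_summary_length -> Average Summary Length
```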