pminervini committed on
Commit
3567246
·
2 Parent(s): c639c51 3557858

Merge branch 'main' of https://huggingface.co/spaces/pminervini/hallucinations-leaderboard into main

Browse files
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -21,9 +21,9 @@ class SelfCheckGpt(Task):
21
 
22
  def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
23
  super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
24
- self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
25
  self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
26
- self.generation_kwargs_sampling = {"temperature": 1.0, "do_sample": False}
27
 
28
  self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
29
  self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
@@ -38,7 +38,7 @@ class SelfCheckGpt(Task):
38
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
39
  self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
40
  self.SelfCheckNLI_error_cnt = 0
41
-
42
  def has_training_docs(self):
43
  return False
44
 
@@ -102,21 +102,21 @@ class SelfCheckGpt(Task):
102
  beta1=0.8, beta2=0.8) # additional params depending on scoring_method
103
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
104
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
105
-
106
  if len(selfcheckgpt_scores) == 0:
107
  self.SelfCheckNLI_error_cnt += 1
108
- print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
109
  result = {
110
- 'avg-selfcheckgpt': 1.0,
111
- 'max-selfcheckgpt': 1.0
112
  }
113
 
114
  else:
115
- threshold = 0.5
116
  # passage is hallucinated if one sentence is hallucinated. It's very strict.
117
- selfcheckgpt_scores_max = 1.0 if max(selfcheckgpt_scores) > threshold else 0.0
118
  # passage is hallucinated if average score of all sentences is hallucinated.
119
- selfcheckgpt_scores_avg = 1.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 0.0
120
  result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
121
  return result
122
 
@@ -139,4 +139,4 @@ class SelfCheckGpt(Task):
139
  A dictionary where keys are the names of submetrics and values are
140
  whether a higher value of the submetric is better
141
  """
142
- return {k: False for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}
 
21
 
22
  def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
23
  super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
24
+ self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512} # these end tokens are hard coded because of the current limitation of the llm-eval.
25
  self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
26
+ self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
27
 
28
  self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
29
  self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
 
38
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
39
  self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
40
  self.SelfCheckNLI_error_cnt = 0
41
+
42
  def has_training_docs(self):
43
  return False
44
 
 
102
  beta1=0.8, beta2=0.8) # additional params depending on scoring_method
103
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
104
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
105
+
106
  if len(selfcheckgpt_scores) == 0:
107
  self.SelfCheckNLI_error_cnt += 1
108
+ print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 0.0.")
109
  result = {
110
+ 'avg-selfcheckgpt': 0.0,
111
+ 'max-selfcheckgpt': 0.0
112
  }
113
 
114
  else:
115
+ threshold = 0.6 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
116
  # passage is hallucinated if one sentence is hallucinated. It's very strict.
117
+ selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
118
  # passage is hallucinated if average score of all sentences is hallucinated.
119
+ selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
120
  result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
121
  return result
122
 
 
139
  A dictionary where keys are the names of submetrics and values are
140
  whether a higher value of the submetric is better
141
  """
142
+ return {k: True for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}