pingnieuk committed on
Commit
d374577
1 Parent(s): e7827c3

Update selfcheckgpt

Browse files

After discussing with Potsawee, the author of SelfCheckGPT, we changed the prompt of this evaluation to make the evaluation more faithful and factual.

src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -53,11 +53,15 @@ class SelfCheckGPT(ConfigurableTask):
53
  return self.dataset["evaluation"]
54
 
55
  def doc_to_text(self, doc):
56
- doc_text = doc["wiki_bio_text"]
57
- doc_text = doc_text.split()
58
- doc_text = " ".join(doc_text[:5])
59
- # prompt = f"This is a passage from Wikipedia about {context}:\n\n"
60
- doc_text = f"Please generate a Wikipedia passage starting with: {doc_text}\n"
 
 
 
 
61
  return doc_text
62
 
63
  def doc_to_target(self, doc):
@@ -104,21 +108,22 @@ class SelfCheckGPT(ConfigurableTask):
104
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
105
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
106
 
107
- if len(selfcheckgpt_scores) == 0:
 
108
  self.SelfCheckNLI_error_cnt += 1
109
- print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 0.0.")
110
  result = {
111
  'avg-selfcheckgpt': 0.0,
112
  'max-selfcheckgpt': 0.0
113
  }
114
 
115
  else:
116
- threshold = 0.6 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
117
  # passage is hallucianted if one sentence is hallucinated. It's very strict.
118
  selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
119
  # passage is hallucianted if average score of all sentences is hallucinated.
120
  selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
121
  result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
 
122
  return result
123
 
124
  selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
 
53
  return self.dataset["evaluation"]
54
 
55
  def doc_to_text(self, doc):
56
+ if not hasattr(self, 'selfcheckgpt_nlp'):
57
+ self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
58
+
59
+ sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc['wiki_bio_text']).sents]
60
+ if len(sentences) < 2:
61
+ raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
62
+ # disscussed with Potsawee
63
+
64
+ doc_text = f"Please generate a Wikipedia passage that consists of at least two sentences, starting with the following sentence: {sentences[0]}\n"
65
  return doc_text
66
 
67
  def doc_to_target(self, doc):
 
108
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
109
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
110
 
111
+ if len(selfcheckgpt_scores) < 2:
112
+ # at least two sentences
113
  self.SelfCheckNLI_error_cnt += 1
 
114
  result = {
115
  'avg-selfcheckgpt': 0.0,
116
  'max-selfcheckgpt': 0.0
117
  }
118
 
119
  else:
120
+ threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
121
  # passage is hallucianted if one sentence is hallucinated. It's very strict.
122
  selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
123
  # passage is hallucianted if average score of all sentences is hallucinated.
124
  selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
125
  result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
126
+
127
  return result
128
 
129
  selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0