Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update selfcheckgpt
Browse files
After discussing with Potsawee, the author of SelfCheckGPT, we changed the prompt of this evaluation for a more faithful and factual evaluation.
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -53,11 +53,15 @@ class SelfCheckGPT(ConfigurableTask):
|
|
53 |
return self.dataset["evaluation"]
|
54 |
|
55 |
def doc_to_text(self, doc):
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
61 |
return doc_text
|
62 |
|
63 |
def doc_to_target(self, doc):
|
@@ -104,21 +108,22 @@ class SelfCheckGPT(ConfigurableTask):
|
|
104 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
105 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
106 |
|
107 |
-
if len(selfcheckgpt_scores)
|
|
|
108 |
self.SelfCheckNLI_error_cnt += 1
|
109 |
-
print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 0.0.")
|
110 |
result = {
|
111 |
'avg-selfcheckgpt': 0.0,
|
112 |
'max-selfcheckgpt': 0.0
|
113 |
}
|
114 |
|
115 |
else:
|
116 |
-
threshold = 0.
|
117 |
# The passage is hallucinated if any one sentence is hallucinated. It's very strict.
|
118 |
selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
|
119 |
# The passage is hallucinated if the average score over all sentences exceeds the threshold.
|
120 |
selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
|
121 |
result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
|
|
|
122 |
return result
|
123 |
|
124 |
selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
|
|
|
53 |
return self.dataset["evaluation"]
|
54 |
|
55 |
def doc_to_text(self, doc):
    """Build the generation prompt for one evaluation document.

    The text in ``doc['wiki_bio_text']`` is split into sentences with a
    spaCy pipeline, and the first sentence is used as the opening line the
    model is asked to continue.

    Args:
        doc: Mapping that provides the source passage under the
            ``'wiki_bio_text'`` key.

    Returns:
        The prompt string, terminated by a newline.

    Raises:
        ValueError: If the passage splits into fewer than two sentences.
    """
    # Load the spaCy pipeline on first use and keep it on the instance so
    # later calls reuse it instead of reloading the model.
    if not hasattr(self, 'selfcheckgpt_nlp'):
        self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")

    sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc['wiki_bio_text']).sents]
    if len(sentences) < 2:
        # The f-prefix is required so the offending sentences are actually
        # interpolated into the error message.
        raise ValueError(f"This wikipedia passage is too short for self-consistency check: {sentences}")

    # Prompt wording discussed with Potsawee (author of SelfCheckGPT).
    doc_text = f"Please generate a Wikipedia passage that consists of at least two sentences, starting with the following sentence: {sentences[0]}\n"
    return doc_text
|
66 |
|
67 |
def doc_to_target(self, doc):
|
|
|
108 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
109 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
110 |
|
111 |
+
if len(selfcheckgpt_scores) < 2:
|
112 |
+
# at least two sentences
|
113 |
self.SelfCheckNLI_error_cnt += 1
|
|
|
114 |
result = {
|
115 |
'avg-selfcheckgpt': 0.0,
|
116 |
'max-selfcheckgpt': 0.0
|
117 |
}
|
118 |
|
119 |
else:
|
120 |
+
threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
|
121 |
# The passage is hallucinated if any one sentence is hallucinated. It's very strict.
|
122 |
selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
|
123 |
# The passage is hallucinated if the average score over all sentences exceeds the threshold.
|
124 |
selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
|
125 |
result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
|
126 |
+
|
127 |
return result
|
128 |
|
129 |
selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
|