Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

App Files Files Community

pminervini committed on Jan 3, 2024

Commit

db18e72

2 Parent(s): 942ae30 17800c0

Merge branch 'main' of https://huggingface.co/spaces/pminervini/hallucinations-leaderboard into main

Browse files

Files changed (1) hide show

src/backend/tasks/selfcheckgpt/task.py +16 -1

src/backend/tasks/selfcheckgpt/task.py CHANGED Viewed

@@ -36,7 +36,8 @@ class SelfCheckGpt(Task):
             self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
             self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
     def has_training_docs(self):
         """Return False unconditionally.

         NOTE(review): judging by the method name, this tells the evaluation
         framework that the task has no training documents -- confirm against
         the ``Task`` base class, which is not visible here.
         """
         return False
@@ -105,6 +106,20 @@ class SelfCheckGpt(Task):
                 sentences = sentences,
                 sampled_passages = other_responses,
                 )
         selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
         selfcheckgpt_scores_max = max(selfcheckgpt_scores)

             self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
             self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
+        self.SelfCheckNLI_error_cnt = 0
     def has_training_docs(self):
         """Return False unconditionally.

         NOTE(review): judging by the method name, this tells the evaluation
         framework that the task has no training documents -- confirm against
         the ``Task`` base class, which is not visible here.
         """
         return False
                 sentences = sentences,
                 sampled_passages = other_responses,
                 )
+            if len(selfcheckgpt_scores) == 0:
+                self.SelfCheckNLI_error_cnt += 1
+                print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
+                result = {'avg-selfcheckgpt': 1.0, 'max-selfcheckgpt': 1.0}
+            else:
+                threshold = 0.5
+                # passage is hallucinated if one sentence is hallucinated. It's very strict.
+                selfcheckgpt_scores_max = 1.0 if max(selfcheckgpt_scores) > threshold else 0.0
+                # passage is hallucinated if the average score over all sentences exceeds the threshold.
+                selfcheckgpt_scores_avg = 1.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 0.0
+                result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
+            return result
         selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
         selfcheckgpt_scores_max = max(selfcheckgpt_scores)