pminervini committed on
Commit
c323865
1 Parent(s): 0b755b6
src/backend/tasks/selfcheckgpt/README.md ADDED
@@ -0,0 +1,31 @@
+ # SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models
+
+ To run the selfcheckgpt evaluation, install the following dependencies:
+ ```
+ pip install spacy
+ pip install selfcheckgpt
+ python -m spacy download en_core_web_sm
+ ```
+
+ selfcheckgpt supports several evaluation methods: `SelfCheckNgram`, `SelfCheckBERTScore`, `SelfCheckMQAG` and `SelfCheckNLI`.
+ The default method in lm-eval-harness is `SelfCheckNgram`. You can switch methods by setting the environment variable:
+ ```
+ export SELFCHECKGPTTYPE=SelfCheckNgram
+ ```
+ The `SelfCheckBERTScore`, `SelfCheckMQAG` and `SelfCheckNLI` methods also run Hugging Face models; you can move them to a GPU by setting the device environment variable:
+ ```
+ export SELFCHECKGPTDEVICE=cuda
+ ```
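+
+ Once the dependencies are installed, the task can be launched like any other lm-eval-harness task. The snippet below is only a sketch: the `lm_eval` entry point and its flags may differ across harness versions, and `<your-model-name>` is a placeholder for whichever Hugging Face model you want to evaluate.
+ ```
+ export SELFCHECKGPTTYPE=SelfCheckNLI
+ export SELFCHECKGPTDEVICE=cuda
+ lm_eval --model hf --model_args pretrained=<your-model-name> --tasks selfcheckgpt --device cuda:0
+ ```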
+
+ ## Citation
+
+ ```
+ @misc{manakul2023selfcheckgpt,
+       title={SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models},
+       author={Potsawee Manakul and Adian Liusie and Mark J. F. Gales},
+       year={2023},
+       eprint={2303.08896},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL}
+ }
+ ```
src/backend/tasks/selfcheckgpt/task.py ADDED
@@ -0,0 +1,145 @@
+ import os
+ from typing import Union, List
+
+ from lm_eval.api.task import Task
+ from lm_eval.api.instance import Instance
+ from lm_eval.api.registry import register_task
+ from lm_eval.api.metrics import mean
+
+ import spacy
+ from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
+
+
+ @register_task("selfcheckgpt")
+ class SelfCheckGpt(Task):
+     VERSION = 0.0
+     DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
+     DATASET_NAME = None
+     OUTPUT_TYPE = 'generate_until'
+
+     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
+         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+         # Greedy decoding for the main response, stochastic sampling for the consistency checks.
+         self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
+         self.generation_kwargs_sampling_number = 5  # number of sampled responses used for the consistency check
+         self.generation_kwargs_sampling = {"temperature": 1.0, "do_sample": True}
+
+         self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNgram')
+         self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', 'cpu')
+         self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
+
+         if self.selfcheckgpt_type == 'SelfCheckNgram':
+             self.selfcheckgpt = SelfCheckNgram(n=1)
+         elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
+             self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
+         elif self.selfcheckgpt_type == 'SelfCheckMQAG':
+             self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
+         elif self.selfcheckgpt_type == 'SelfCheckNLI':
+             self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
+
+     def has_training_docs(self):
+         return False
+
+     def has_validation_docs(self):
+         return True
+
+     def has_test_docs(self):
+         return False
+
+     def validation_docs(self):
+         return self.dataset["evaluation"]
+
+     def doc_to_text(self, doc):
+         # Prompt the model with the first five words of the reference Wikipedia passage.
+         doc_text = doc["wiki_bio_text"]
+         doc_text = doc_text.split()
+         doc_text = " ".join(doc_text[:5])
+         doc_text = f"Please generate a Wikipedia passage starting with: {doc_text}\n"
+         return doc_text
+
+     def doc_to_target(self, doc):
+         answer = doc['wiki_bio_text']
+         return answer
+
+     def construct_requests(
+         self, doc: dict, ctx: str, **kwargs
+     ) -> Union[List[Instance], Instance]:
+         # Request 0: greedy response that will be checked for hallucinations.
+         arguments = (ctx, self.generation_kwargs)
+         request_list = [
+             Instance(
+                 request_type=self.OUTPUT_TYPE,
+                 doc=doc,
+                 arguments=arguments,
+                 idx=0,
+                 **kwargs
+             ),
+         ]
+         # Requests 1..N: sampled responses used as evidence for the consistency check.
+         sampling_arguments = (ctx, self.generation_kwargs_sampling)
+         request_list.extend([
+             Instance(
+                 request_type=self.OUTPUT_TYPE,
+                 doc=doc,
+                 arguments=sampling_arguments,
+                 idx=idx,
+                 **kwargs
+             )
+             for idx in range(1, self.generation_kwargs_sampling_number + 1)
+         ])
+         return request_list
+
+     def process_results(self, doc, results):
+         # results[0] is the greedy response; results[1:] are the sampled responses.
+         response_temperature_0 = results[0]
+         other_responses = results[1:]
+         passage = self.doc_to_target(doc)
+
+         sentences = self.selfcheckgpt_nlp(response_temperature_0)
+         sentences = [sent.text.strip() for sent in sentences.sents]
+         if self.selfcheckgpt_type == 'SelfCheckNgram':
+             selfcheckgpt_scores = self.selfcheckgpt.predict(
+                 sentences=sentences,
+                 passage=response_temperature_0,
+                 sampled_passages=other_responses,
+             )
+             return {'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
+                     'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']}
+
+         elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
+             selfcheckgpt_scores = self.selfcheckgpt.predict(
+                 sentences=sentences,
+                 sampled_passages=other_responses,
+             )
+         elif self.selfcheckgpt_type == 'SelfCheckMQAG':
+             # SelfCheckMQAG scores sentences against the main passage and the sampled passages.
+             selfcheckgpt_scores = self.selfcheckgpt.predict(
+                 sentences=sentences,
+                 passage=response_temperature_0,
+                 sampled_passages=other_responses,
+                 num_questions_per_sent=5,  # number of questions to be drawn
+                 scoring_method='bayes_with_alpha',  # options = 'counting', 'bayes', 'bayes_with_alpha'
+                 beta1=0.8, beta2=0.8,  # additional params depending on scoring_method
+             )
+         elif self.selfcheckgpt_type == 'SelfCheckNLI':
+             # SelfCheckNLI only needs the sentences and the sampled passages.
+             selfcheckgpt_scores = self.selfcheckgpt.predict(
+                 sentences=sentences,
+                 sampled_passages=other_responses,
+             )
+
+         selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
+         selfcheckgpt_scores_max = max(selfcheckgpt_scores)
+
+         return {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
+
+     def aggregation(self):
+         """
+         :returns: {str: [float] -> float}
+             A dictionary where keys are the names of submetrics and values are
+             functions that aggregate a list of metrics
+         """
+         return {k: mean for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}
+
+     def higher_is_better(self):
+         """
+         :returns: {str: bool}
+             A dictionary where keys are the names of submetrics and values are
+             whether a higher value of the submetric is better
+         """
+         return {k: False for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}