pminervini committed
Commit 7b5f39c
1 Parent(s): e6de949
blog/figures/clustermap_all.pdf CHANGED
Binary files a/blog/figures/clustermap_all.pdf and b/blog/figures/clustermap_all.pdf differ
 
blog/figures/clustermap_all_coolwarm.pdf CHANGED
Binary files a/blog/figures/clustermap_all_coolwarm.pdf and b/blog/figures/clustermap_all_coolwarm.pdf differ
 
blog/figures/clustermap_all_viridis.pdf CHANGED
Binary files a/blog/figures/clustermap_all_viridis.pdf and b/blog/figures/clustermap_all_viridis.pdf differ
 
blog/figures/clustermap_detect.pdf CHANGED
Binary files a/blog/figures/clustermap_detect.pdf and b/blog/figures/clustermap_detect.pdf differ
 
blog/figures/clustermap_detect_coolwarm.pdf CHANGED
Binary files a/blog/figures/clustermap_detect_coolwarm.pdf and b/blog/figures/clustermap_detect_coolwarm.pdf differ
 
blog/figures/clustermap_detect_viridis.pdf CHANGED
Binary files a/blog/figures/clustermap_detect_viridis.pdf and b/blog/figures/clustermap_detect_viridis.pdf differ
 
blog/figures/clustermap_instr.pdf CHANGED
Binary files a/blog/figures/clustermap_instr.pdf and b/blog/figures/clustermap_instr.pdf differ
 
blog/figures/clustermap_instr_coolwarm.pdf CHANGED
Binary files a/blog/figures/clustermap_instr_coolwarm.pdf and b/blog/figures/clustermap_instr_coolwarm.pdf differ
 
blog/figures/clustermap_instr_viridis.pdf CHANGED
Binary files a/blog/figures/clustermap_instr_viridis.pdf and b/blog/figures/clustermap_instr_viridis.pdf differ
 
blog/figures/clustermap_qa.pdf CHANGED
Binary files a/blog/figures/clustermap_qa.pdf and b/blog/figures/clustermap_qa.pdf differ
 
blog/figures/clustermap_qa_coolwarm.pdf CHANGED
Binary files a/blog/figures/clustermap_qa_coolwarm.pdf and b/blog/figures/clustermap_qa_coolwarm.pdf differ
 
blog/figures/clustermap_qa_viridis.pdf CHANGED
Binary files a/blog/figures/clustermap_qa_viridis.pdf and b/blog/figures/clustermap_qa_viridis.pdf differ
 
blog/figures/clustermap_rc.pdf CHANGED
Binary files a/blog/figures/clustermap_rc.pdf and b/blog/figures/clustermap_rc.pdf differ
 
blog/figures/clustermap_rc_coolwarm.pdf CHANGED
Binary files a/blog/figures/clustermap_rc_coolwarm.pdf and b/blog/figures/clustermap_rc_coolwarm.pdf differ
 
blog/figures/clustermap_rc_viridis.pdf CHANGED
Binary files a/blog/figures/clustermap_rc_viridis.pdf and b/blog/figures/clustermap_rc_viridis.pdf differ
 
blog/figures/clustermap_summ.pdf CHANGED
Binary files a/blog/figures/clustermap_summ.pdf and b/blog/figures/clustermap_summ.pdf differ
 
blog/figures/clustermap_summ_coolwarm.pdf CHANGED
Binary files a/blog/figures/clustermap_summ_coolwarm.pdf and b/blog/figures/clustermap_summ_coolwarm.pdf differ
 
blog/figures/clustermap_summ_viridis.pdf CHANGED
Binary files a/blog/figures/clustermap_summ_viridis.pdf and b/blog/figures/clustermap_summ_viridis.pdf differ
 
cli/analysis-cli.py CHANGED
@@ -19,6 +19,14 @@ from src.envs import QUEUE_REPO, RESULTS_REPO, API
 from src.utils import my_snapshot_download
 
 
+def is_float(string):
+    try:
+        float(string)
+        return True
+    except ValueError:
+        return False
+
+
 def find_json_files(json_path):
     res = []
     for root, dirs, files in os.walk(json_path):
@@ -40,13 +48,16 @@ def sanitise_metric(name: str) -> str:
     res = res.replace("exact", "EM")
     res = res.replace("HasAns_EM", "HasAns")
     res = res.replace("NoAns_EM", "NoAns")
+    res = res.replace("em", "EM")
     return res
 
 
 def sanitise_dataset(name: str) -> str:
     res = name
-    res = res.replace("tqa8", "TriviaQA")
-    res = res.replace("nq8", "NQ")
+    res = res.replace("tqa8", "TriviaQA (8-shot)")
+    res = res.replace("nq8", "NQ (8-shot)")
+    res = res.replace("nq_open", "NQ (64-shot)")
+    res = res.replace("triviaqa", "TriviaQA (64-shot)")
     res = res.replace("truthfulqa", "TruthfulQA")
     res = res.replace("ifeval", "IFEval")
     res = res.replace("selfcheckgpt", "SelfCheckGPT")
@@ -111,12 +122,16 @@ if data_map is None:
     for dataset_name, results_dict in data["results"].items():
         for metric_name, value in results_dict.items():
 
-            if ',' in metric_name and '_stderr' not in metric_name \
-                    and 'f1' not in metric_name \
-                    and model_name_to_model_map[model_name]["likes"] > 128:
+            if model_name_to_model_map[model_name]["likes"] > 128:
 
                 to_add = True
 
+                if 'f1' in metric_name:
+                    to_add = False
+
+                if 'stderr' in metric_name:
+                    to_add = False
+
                 if 'memo-trap_v2' in dataset_name:
                     to_add = False
 
@@ -128,9 +143,6 @@ if data_map is None:
                 if 'faithdial' in dataset_name:
                     to_add = False
 
-                if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
-                    to_add = False
-
                 if 'truthfulqa_gen' in dataset_name:
                     to_add = False
 
@@ -138,13 +150,9 @@ if data_map is None:
                     if 'precision' not in metric_name:
                         to_add = False
 
-                if 'correctness,' in metric_name or 'em,' in metric_name:
-                    to_add = False
-
-                if 'rouge' in metric_name:
-                    pass
-                    # if 'rougeL' not in metric_name:
-                    #     to_add = False
+                if 'halueval' in dataset_name:
+                    if 'acc' not in metric_name:
+                        to_add = False
 
                 if 'ifeval' in dataset_name:
                     if 'prompt_level_strict_acc' not in metric_name:
@@ -161,14 +169,23 @@ if data_map is None:
                 if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
                     to_add = False
 
-                if 'rouge' in metric_name:
-                    value /= 100.0
-
-                if 'squad' in dataset_name:
-                    value /= 100.0
+                if isinstance(value, str):
+                    if is_float(value):
+                        value = float(value)
+                    else:
+                        to_add = False
 
                 if to_add:
-                    sanitised_metric_name = sanitise_metric(metric_name.split(',')[0])
+                    if 'rouge' in metric_name:
+                        value /= 100.0
+
+                    if 'squad' in dataset_name:
+                        value /= 100.0
+
+                    sanitised_metric_name = metric_name
+                    if "," in sanitised_metric_name:
+                        sanitised_metric_name = sanitised_metric_name.split(',')[0]
+                    sanitised_metric_name = sanitise_metric(sanitised_metric_name)
                     sanitised_dataset_name = sanitise_dataset(dataset_name)
 
                     model_dataset_metric_to_result_map[(model_name, sanitised_dataset_name, sanitised_metric_name)] = value
@@ -207,6 +224,8 @@ for plot_type in plot_type_lst:
                     to_add = False
                 if 'SelfCheckGPT' in dataset_metric[0] and 'MAX' not in dataset_metric[1]:
                     to_add = False
+                if '64-shot' in dataset_metric[0]:
+                    to_add = False
                 if to_add is True:
                     data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
     elif plot_type in {'summ'}:
@@ -316,6 +335,6 @@ for plot_type in plot_type_lst:
        cmap_suffix = '' if cmap is None else f'_{cmap}'
 
        # Save the clustermap to file
-       fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.pdf')
-       fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.png')
-       fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
+       fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf')
+       fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.png')
+       fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
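As a quick illustration (not part of the commit), the value handling added above can be exercised on its own: string scores are kept only when they parse as floats, and ROUGE / SQuAD scores are rescaled from the 0-100 range to 0-1. The helper name normalise_value below is hypothetical; the script inlines this logic in its main loop.

def is_float(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


def normalise_value(dataset_name, metric_name, value):
    # Hypothetical helper mirroring the new loop body above.
    if isinstance(value, str):
        if not is_float(value):
            return None  # the script sets to_add = False for such entries
        value = float(value)
    if 'rouge' in metric_name:
        value /= 100.0
    if 'squad' in dataset_name:
        value /= 100.0
    return value


print(normalise_value('xsum', 'rouge1,none', '50.0'))  # 0.5
print(normalise_value('nq8', 'em,none', 'N/A'))        # None (filtered out)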
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -21,8 +21,9 @@ class SelfCheckGpt(Task):
 
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
-        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}  # these end tokens are hard coded because of the current limitaion of the llm-eval.
-        self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
+        # these end tokens are hard coded because of the current limitaion of the llm-eval.
+        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
+        self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
 
         self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
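For context, a minimal sketch (not part of this commit; the harness wiring is assumed) of how the settings above are consumed: the SELFCHECKGPTTYPE environment variable selects the scoring variant, and generation_kwargs_sampling_number controls how many stochastic samples are drawn for the consistency check.

import os

# Same default as in SelfCheckGpt.__init__
selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')

# Mirrors the values set in __init__ above
num_samples = 5
sampling_kwargs = {
    "temperature": 0.99,
    "do_sample": True,
    "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"],
    "max_length": 512,
}

# One greedy response (generation_kwargs) plus num_samples sampled responses
# (generation_kwargs_sampling) are then compared by the chosen SelfCheckGPT scorer.
print(selfcheckgpt_type, num_samples)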