mfajcik committed
Commit e60cafc
1 Parent(s): 8a54af0

Upload compile_log_files.py

Files changed (1)
  1. compile_log_files.py +308 -0
compile_log_files.py ADDED
@@ -0,0 +1,308 @@
# Author: Martin Fajcik

import argparse
import copy
import glob
import hashlib
import os
import json
import re

import jsonlines
from tqdm import tqdm

SUPPORTED_METRICS = [
    "avg_mcauroc",  # for classification tasks
    "exact_match",  # for QA tasks
    "acc",  # for multichoice tasks
    "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
    "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for back compatibility
    "word_perplexity",  # for language modeling tasks
]
EXTRA_INFO_RELEASE_KEYS = [
    'filtered_resps',
    'doc_id',
]

with open("leaderboard/metadata.json", "r") as f:
    METADATA = json.load(f)

# TASK MAP
# from promptname to taskname
MAP = {
    'benchmark_agree': 'benczechmark_agree',
    'benchmark_belebele': 'benczechmark_belebele',
    'benchmark_czechnews': 'benczechmark_czechnews',
    'benchmark_subjectivity': 'benczechmark_subjectivity',
    'benczechmark_snli': 'benczechmark_snli',
    'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
    'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
    'propaganda_nazor': 'benczechmark_propaganda_nazor',
    'propaganda_strach': 'benczechmark_propaganda_strach',
    'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
    'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
    'propaganda_lokace': 'benczechmark_propaganda_lokace',
    'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
    'propaganda_vina': 'benczechmark_propaganda_vina',
    'propaganda_zanr': 'benczechmark_propaganda_zanr',
    'propaganda_emoce': 'benczechmark_propaganda_emoce',
    'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
    'propaganda_rusko': 'benczechmark_propaganda_rusko',
    'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
    'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
    'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
    'benczechmark_summarization': 'benczechmark_summarization',
    'gec': 'benczechmark_grammarerrorcorrection',
    'cs_nq_open': 'benczechmark_cs_naturalquestions',
    'cs_sqad_open': 'benczechmark_cs_sqad32',
    'cs_triviaqa': 'benczechmark_cs_triviaQA',
    'csfever': 'benczechmark_csfever_nli',
    'ctkfacts': 'benczechmark_ctkfacts_nli',
    'cnec_ner': 'benczechmark_cs_ner',
    'cdec_ner': 'benczechmark_cs_court_decisions_ner',
    'klokan_qa': 'benczechmark_klokan_qa',
    'umimeto_biology': 'benczechmark_umimeto_biology',
    'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
    'umimeto_czech': 'benczechmark_umimeto_czech',
    'umimeto_history': 'benczechmark_umimeto_history',
    'umimeto_informatics': 'benczechmark_umimeto_informatics',
    'umimeto_math': 'benczechmark_umimeto_math',
    'umimeto_physics': 'benczechmark_umimeto_physics',
    'cermat_czech_open': 'benczechmark_cermat_czech_open',
    'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
    'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
    'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
    'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
    'history_ir': 'benczechmark_history_ir',
    'benczechmark_histcorpus': 'benczechmark_histcorpus',
    'benczechmark_hellaswag': 'benczechmark_hellaswag',
    'benczechmark_essay': 'benczechmark_essay',
    'benczechmark_fiction': 'benczechmark_fiction',
    'benczechmark_capek': 'benczechmark_capek',
    'benczechmark_correspondence': 'benczechmark_correspondence',
    'benczechmark_havlicek': 'benczechmark_havlicek',
    'benczechmark_speeches': 'benczechmark_speeches',
    'benczechmark_spoken': 'benczechmark_spoken',
    'benczechmark_dialect': 'benczechmark_dialect'
}

NO_PROMPT_TASKS = ["benczechmark_histcorpus",
                   "benczechmark_hellaswag",
                   "benczechmark_essay",
                   "benczechmark_fiction",
                   "benczechmark_capek",
                   "benczechmark_correspondence",
                   "benczechmark_havlicek",
                   "benczechmark_speeches",
                   "benczechmark_spoken",
                   "benczechmark_dialect"]


def resolve_taskname(taskname):
    if taskname not in MAP:
        raise ValueError(f"Taskname {taskname} not found.")
    return MAP[taskname]


def rename_keys(d, resolve_taskname):
    orig_len = len(d)
    for k, v in list(d.items()):
        new_key = resolve_taskname(k)
        d[new_key] = d.pop(k)

    # make sure the dict length didn't change
    assert len(d) == orig_len


def process_harness_logs(input_folders, output_file):
    """
    - Selects best prompt for each task
    - Extracts data for that prompt, necessary for the target metrics
    """

    def expand_input_folders(input_folders):
        # Check if input_folders is a wildcard pattern
        if '*' in input_folders or '?' in input_folders:
            # Expand the wildcard into a list of matching directories
            matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)]
            return matching_directories
        else:
            # If it's not a wildcard, return the input as a single-item list if it's a valid directory
            if os.path.isdir(input_folders):
                return [input_folders]
            else:
                return []

    input_folders = expand_input_folders(input_folders)

    per_task_results = {}
    metric_per_task = {}
    predictions = {}

    all_harness_results = dict()
    for input_folder in tqdm(input_folders, desc="Loading files"):
        # read all files in input_folder
        # consider first folder within this folder
        input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
        # find file which starts with results... prefix in the input_folder
        result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
        with open(os.path.join(input_folder, result_file), "r") as f:
            harness_results = json.load(f)
        all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results
        current_multipleprompt_tasknames = []
        for name, result in harness_results['results'].items():
            if name in NO_PROMPT_TASKS:
                # no prompts for these tasks
                taskname = name
                # process metric names
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        name, _ = k.split(",")
                        del result[k]
                        result[name] = v
                per_task_results[taskname] = result

            if result['alias'].strip().startswith('- prompt-'):
                # process taskname
                taskname = name[:-1]
                if taskname.endswith("_"):
                    taskname = taskname[:-1]

                # process metric names
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        name, key = k.split(",")
                        del result[k]
                        result[name] = v

                if taskname not in per_task_results:
                    per_task_results[taskname] = [result]
                    current_multipleprompt_tasknames.append(taskname)
                else:
                    per_task_results[taskname].append(result)

        # get best result according to metric priority given in SUPPORTED_METRICS list
        for taskname, results in per_task_results.items():
            if taskname not in current_multipleprompt_tasknames:
                continue
            best_result = None
            target_metric = None
            for m in SUPPORTED_METRICS:
                if m in results[0]:
                    target_metric = m
                    break
            if target_metric is None:
                raise ValueError(f"No supported metric found in {taskname}")
            metric_per_task[taskname] = target_metric

            all_measured_results = []
            for result in results:
                all_measured_results.append(result[target_metric])
                if best_result is None:
                    best_result = result
                else:
                    if result[target_metric] > best_result[target_metric]:
                        best_result = result
            # Compute max-centered variance
            max_value = best_result[target_metric]
            squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results]
            max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
            best_result['max_centered_variance'] = max_centered_variance

            per_task_results[taskname] = best_result

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            for taskname in per_task_results.keys():
                if taskname in file:
                    print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
                    # check this file corresponds to same prompt
                    winning_prompt = per_task_results[taskname]['alias'][-1]
                    if taskname in NO_PROMPT_TASKS:
                        current_prompt = "-1"
                    else:
                        try:
                            current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
                        except AttributeError:
                            raise ValueError(f"Prompt not found in {file}")
                    if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
                        # load file contents
                        predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))
                        # only keep data necessary for metrics
                        for prediction in predictions[taskname]:
                            for key in list(prediction.keys()):
                                if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
                                    del prediction[key]

    # rename keys (tasknames) using resolve_taskname:
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # make sure keys in predictions and results are the same
    if not set(predictions.keys()) == set(per_task_results.keys()):
        print("Keys only in predictions:")
        print(set(predictions.keys()) - set(per_task_results.keys()))
        print("Keys only in results:")
        print(set(per_task_results.keys()) - set(predictions.keys()))
        raise ValueError("Keys in predictions and results are not the same")

    aggregated_predictions = dict()
    aggregated_predictions["predictions"] = predictions
    aggregated_predictions["results"] = per_task_results
    aggregated_predictions["metadata"] = {
        'git_hash': harness_results['git_hash'],
        'transformers_version': harness_results['transformers_version'],
        'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
        'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
        'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
        'eot_token_id': harness_results['eot_token_id'],
        'max_length': harness_results['max_length'],
        'task_hashes': harness_results['task_hashes'],
        'model_source': harness_results['model_source'],
        'model_name': harness_results['model_name'],
        'model_name_sanitized': harness_results['model_name_sanitized'],
        'system_instruction': harness_results['system_instruction'],
        'system_instruction_sha': harness_results['system_instruction_sha'],
        'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
        'chat_template': harness_results['chat_template'],
        'chat_template_sha': harness_results['chat_template_sha'],
        'total_evaluation_time_seconds': {k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()},
        'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']
    }

    # make sure all tasks are present
    all_tasks = set(METADATA["tasks"].keys())
    all_expected_tasks = set(per_task_results.keys())
    all_missing_tasks = all_tasks - all_expected_tasks
    all_extra_tasks = all_expected_tasks - all_tasks
    if len(all_missing_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
    if len(all_extra_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")
    with open(output_file, "w") as f:
        json.dump(aggregated_predictions, f)
    print("Success!")
    print("Output saved to", output_file)


def main():
    parser = argparse.ArgumentParser(
        description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.")
    parser.add_argument("-i", "-f", "--input_folder", "--folder",
                        help="Folder with unprocessed results from lm harness.", required=True)
    parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True)
    args = parser.parse_args()

    process_harness_logs(args.input_folder, args.output_file)


if __name__ == "__main__":
    main()
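Example usage (a minimal sketch; the log folder and output path below are hypothetical). Run the script from the repository root so that leaderboard/metadata.json resolves; each matched folder is expected to contain one subfolder holding the lm-harness results*.json and samples*.jsonl files. Quote any wildcard so it is expanded by expand_input_folders rather than by the shell:

    python compile_log_files.py -i "harness_logs/*" -o compiled_submission.json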