# Author: Martin Fajcik
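"""
Processes lm-evaluation-harness outputs into the minimal format required for a leaderboard
submission: for each task, the best-scoring prompt is selected, only the per-sample fields
needed to recompute the supported metrics are kept, and everything is bundled together with
run metadata into a single JSON file.

Illustrative invocation (script name and paths are placeholders):
    python process_harness_logs.py -i "harness_logs/my-model*" -o my_model_submission.json
"""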

import argparse
import copy
import glob
import hashlib
import os
import json
import re

import jsonlines
from tqdm import tqdm

SUPPORTED_METRICS = [
    "avg_mcauroc",  # for classification tasks
    "exact_match",  # for QA tasks
    "acc",  # for multichoice tasks
    "rouge_raw_r2_mid_f_without_bootstrap", # for summarization tasks
    "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for back compatibility
    "word_perplexity",  # for language modeling tasks
]
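# Per-sample fields kept in the released predictions in addition to the metric fields above.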
EXTRA_INFO_RELEASE_KEYS = [
    'filtered_resps',
    'doc_id',
]

with open("leaderboard/metadata.json", "r") as f:
    METADATA = json.load(f)

# TASK MAP
# maps prompt-level task names (as they appear in harness results) to canonical task names
MAP = {
    'benchmark_agree': 'benczechmark_agree',
    'benchmark_belebele': 'benczechmark_belebele',
    'benchmark_czechnews': 'benczechmark_czechnews',
    'benchmark_subjectivity': 'benczechmark_subjectivity',
    'benczechmark_snli': 'benczechmark_snli',
    'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
    'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
    'propaganda_nazor': 'benczechmark_propaganda_nazor',
    'propaganda_strach': 'benczechmark_propaganda_strach',
    'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
    'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
    'propaganda_lokace': 'benczechmark_propaganda_lokace',
    'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
    'propaganda_vina': 'benczechmark_propaganda_vina',
    'propaganda_zanr': 'benczechmark_propaganda_zanr',
    'propaganda_emoce': 'benczechmark_propaganda_emoce',
    'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
    'propaganda_rusko': 'benczechmark_propaganda_rusko',
    'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
    'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
    'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
    'benczechmark_summarization': 'benczechmark_summarization',
    'gec': 'benczechmark_grammarerrorcorrection',
    'cs_nq_open': 'benczechmark_cs_naturalquestions',
    'cs_sqad_open': 'benczechmark_cs_sqad32',
    'cs_triviaqa': 'benczechmark_cs_triviaQA',
    'csfever': 'benczechmark_csfever_nli',
    'ctkfacts': 'benczechmark_ctkfacts_nli',
    'cnec_ner': 'benczechmark_cs_ner',
    'cdec_ner': 'benczechmark_cs_court_decisions_ner',
    'klokan_qa': 'benczechmark_klokan_qa',
    'umimeto_biology': 'benczechmark_umimeto_biology',
    'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
    'umimeto_czech': 'benczechmark_umimeto_czech',
    'umimeto_history': 'benczechmark_umimeto_history',
    'umimeto_informatics': 'benczechmark_umimeto_informatics',
    'umimeto_math': 'benczechmark_umimeto_math',
    'umimeto_physics': 'benczechmark_umimeto_physics',
    'cermat_czech_open': 'benczechmark_cermat_czech_open',
    'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
    'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
    'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
    'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
    'history_ir': 'benczechmark_history_ir',
    'benczechmark_histcorpus': "benczechmark_histcorpus",
    'benczechmark_hellaswag': "benczechmark_hellaswag",
    'benczechmark_essay': 'benczechmark_essay',
    'benczechmark_fiction': 'benczechmark_fiction',
    'benczechmark_capek': 'benczechmark_capek',
    'benczechmark_correspondence': 'benczechmark_correspondence',
    'benczechmark_havlicek': 'benczechmark_havlicek',
    'benczechmark_speeches': 'benczechmark_speeches',
    'benczechmark_spoken': 'benczechmark_spoken',
    'benczechmark_dialect': 'benczechmark_dialect'
}

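# Tasks evaluated without multiple prompt variants; their results are taken as-is (no best-prompt selection).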
NO_PROMPT_TASKS = ["benczechmark_histcorpus",
                   "benczechmark_hellaswag",
                   "benczechmark_essay",
                   "benczechmark_fiction",
                   "benczechmark_capek",
                   "benczechmark_correspondence",
                   "benczechmark_havlicek",
                   "benczechmark_speeches",
                   "benczechmark_spoken",
                   "benczechmark_dialect"]


def resolve_taskname(taskname):
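    """Return the canonical task name, e.g. resolve_taskname('gec') -> 'benczechmark_grammarerrorcorrection'."""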
    if taskname not in MAP:
        raise ValueError(f"Taskname {taskname} not found.")
    return MAP[taskname]


def rename_keys(d, resolve_taskname):
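    """Rename the keys of `d` in place, mapping each key through `resolve_taskname`."""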
    orig_len = len(d)
    for k in list(d.keys()):
        d[resolve_taskname(k)] = d.pop(k)

    # make sure the number of entries didn't change (i.e., no two keys mapped to the same new name)
    assert len(d) == orig_len


def process_harness_logs(input_folders, output_file):
    """
    - Selects the best prompt for each task
    - Extracts the data for that prompt that is necessary to recompute the target lm harness metrics
    """

    def expand_input_folders(input_folders):
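        """Expand a glob pattern into the matching directories; a plain path yields a one-item list (or [] if it is not a directory)."""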
        # Check if input_folders is a wildcard pattern
        if '*' in input_folders or '?' in input_folders:
            # Expand the wildcard into a list of matching directories
            matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)]
            return matching_directories
        else:
            # If it's not a wildcard, return the input as a single-item list if it's a valid directory
            if os.path.isdir(input_folders):
                return [input_folders]
            else:
                return []

    input_folders = expand_input_folders(input_folders)

    per_task_results = {}
    metric_per_task = {}
    predictions = {}

    all_harness_results = dict()
    for input_folder in tqdm(input_folders, desc="Loading files"):
        # descend into the first (typically the only) subfolder, which holds this run's harness outputs
        input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
        # find the file whose name starts with the "results" prefix in that folder
        result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
        with open(os.path.join(input_folder, result_file), "r") as f:
            harness_results = json.load(f)
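        # key this run's full results by the alias of its first task (used below for run-level metadata)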
        all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results
        current_multipleprompt_tasknames = []
        for name, result in harness_results['results'].items():
            if name in NO_PROMPT_TASKS:
                # no prompt variants for this task
                taskname = name
                # strip the ",filter" suffix from metric keys, e.g. "word_perplexity,none" -> "word_perplexity"
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        metric_name, _ = k.split(",")
                        del result[k]
                        result[metric_name] = v
                per_task_results[taskname] = result
                continue

            if result['alias'].strip().startswith('- prompt-'):
                # recover the base task name by stripping the trailing prompt index (and its "_" separator)
                taskname = name[:-1]
                if taskname.endswith("_"):
                    taskname = taskname[:-1]

                # strip the ",filter" suffix from metric keys (e.g. "acc,none" -> "acc")
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        metric_name, _ = k.split(",")
                        del result[k]
                        result[metric_name] = v

                if taskname not in per_task_results:
                    per_task_results[taskname] = [result]
                    current_multipleprompt_tasknames.append(taskname)
                else:
                    per_task_results[taskname].append(result)

        # get best result according to metric priority given in SUPPORTED_METRICS list
        for taskname, results in per_task_results.items():
            if taskname not in current_multipleprompt_tasknames:
                continue
            best_result = None
            target_metric = None
            for m in SUPPORTED_METRICS:
                if m in results[0]:
                    target_metric = m
                    break
            if target_metric is None:
                raise ValueError(f"No supported metric found in {taskname}")
            metric_per_task[taskname] = target_metric

            all_measured_results = []
            for result in results:
                all_measured_results.append(result[target_metric])
                if best_result is None:
                    best_result = result
                else:
                    if result[target_metric] > best_result[target_metric]:
                        best_result = result
            # Compute max-centered variance: the sample variance of the per-prompt scores
            # (on a 0-100 scale) around the best prompt's score
            max_value = best_result[target_metric]
            squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results]
            max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
            best_result['max_centered_variance'] = max_centered_variance
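            # Illustrative numbers: prompt scores [0.60, 0.55, 0.58] -> diffs from the best score on the
            # 0-100 scale are [0, -5, -2], squared [0, 25, 4], max-centered variance = 29 / 2 = 14.5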

            per_task_results[taskname] = best_result

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            for taskname in per_task_results.keys():
                if taskname in file:
                    print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
                    # check that this samples file corresponds to the winning prompt
                    winning_prompt = per_task_results[taskname]['alias'][-1]  # last character of the alias is the prompt index
                    if taskname in NO_PROMPT_TASKS:
                        current_prompt = "-1"
                    else:
                        try:
                            current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
                        except AttributeError:
                            raise ValueError(f"Prompt not found in {file}")
                    if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
                        # load file contents
                        predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))
                        # only keep data necessary for metrics
                        for prediction in predictions[taskname]:
                            for key in list(prediction.keys()):
                                if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
                                    del prediction[key]

    # rename keys (task names) using resolve_taskname:
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # assert keys in predictions and results are the same
    # assert set(predictions.keys()) == set(per_task_results.keys())
    if not set(predictions.keys()) == set(per_task_results.keys()):
        # print missing keys
        print("Missing keys in predictions:")
        print(set(predictions.keys()) - set(per_task_results.keys()))
        # print extra keys
        print("Extra keys in predictions:")
        print(set(per_task_results.keys()) - set(predictions.keys()))
        raise ValueError("Keys in predictions and results are not the same")

    aggregated_predictions = dict()
    aggregated_predictions["predictions"] = predictions
    aggregated_predictions["results"] = per_task_results
    aggregated_predictions["metadata"] = {
        'git_hash': harness_results['git_hash'],
        'transformers_version': harness_results['transformers_version'],
        'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
        'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
        'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
        'eot_token_id': harness_results['eot_token_id'],
        'max_length': harness_results['max_length'],
        'task_hashes': harness_results['task_hashes'],
        'model_source': harness_results['model_source'],
        'model_name': harness_results['model_name'],
        'model_name_sanitized': harness_results['model_name_sanitized'],
        'system_instruction': harness_results['system_instruction'],
        'system_instruction_sha': harness_results['system_instruction_sha'],
        'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
        'chat_template': harness_results['chat_template'],
        'chat_template_sha': harness_results['chat_template_sha'],
        'total_evaluation_time_seconds': {k:v['total_evaluation_time_seconds'] for k,v in all_harness_results.items()},
        'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']  # a single run-level n-shot value, read from one reference task
    }

    # make sure all tasks are present
    all_expected_tasks = set(METADATA["tasks"].keys())
    all_produced_tasks = set(per_task_results.keys())
    all_missing_tasks = all_expected_tasks - all_produced_tasks
    all_extra_tasks = all_produced_tasks - all_expected_tasks
    if len(all_missing_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
    if len(all_extra_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")
    with open(output_file, "w") as f:
        json.dump(aggregated_predictions, f)
    print("Success!")
    print("Output saved to", output_file)


def main():
    parser = argparse.ArgumentParser(
        description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.")
    parser.add_argument("-i", "-f", "--input_folder", "--folder",
                        help="Folder (or glob pattern matching several folders) with unprocessed results from lm harness.", required=True)
    parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True)
    args = parser.parse_args()

    process_harness_logs(args.input_folder, args.output_file)


if __name__ == "__main__":
    main()