Upload compile_log_files.py
compile_log_files.py  ADDED  (+308 -0)
@@ -0,0 +1,308 @@
# Author: Martin Fajcik

import argparse
import copy
import glob
import hashlib
import os
import json
import re

import jsonlines
from tqdm import tqdm

SUPPORTED_METRICS = [
    "avg_mcauroc",  # for classification tasks
    "exact_match",  # for QA tasks
    "acc",  # for multichoice tasks
    "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
    "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for back compatibility
    "word_perplexity",  # for language modeling tasks
]
EXTRA_INFO_RELEASE_KEYS = [
    'filtered_resps',
    'doc_id',
]

with open("leaderboard/metadata.json", "r") as f:
    METADATA = json.load(f)

# TASK MAP
# from promptname to taskname
MAP = {
    'benchmark_agree': 'benczechmark_agree',
    'benchmark_belebele': 'benczechmark_belebele',
    'benchmark_czechnews': 'benczechmark_czechnews',
    'benchmark_subjectivity': 'benczechmark_subjectivity',
    'benczechmark_snli': 'benczechmark_snli',
    'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
    'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
    'propaganda_nazor': 'benczechmark_propaganda_nazor',
    'propaganda_strach': 'benczechmark_propaganda_strach',
    'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
    'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
    'propaganda_lokace': 'benczechmark_propaganda_lokace',
    'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
    'propaganda_vina': 'benczechmark_propaganda_vina',
    'propaganda_zanr': 'benczechmark_propaganda_zanr',
    'propaganda_emoce': 'benczechmark_propaganda_emoce',
    'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
    'propaganda_rusko': 'benczechmark_propaganda_rusko',
    'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
    'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
    'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
    'benczechmark_summarization': 'benczechmark_summarization',
    'gec': 'benczechmark_grammarerrorcorrection',
    'cs_nq_open': 'benczechmark_cs_naturalquestions',
    'cs_sqad_open': 'benczechmark_cs_sqad32',
    'cs_triviaqa': 'benczechmark_cs_triviaQA',
    'csfever': 'benczechmark_csfever_nli',
    'ctkfacts': 'benczechmark_ctkfacts_nli',
    'cnec_ner': 'benczechmark_cs_ner',
    'cdec_ner': 'benczechmark_cs_court_decisions_ner',
    'klokan_qa': 'benczechmark_klokan_qa',
    'umimeto_biology': 'benczechmark_umimeto_biology',
    'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
    'umimeto_czech': 'benczechmark_umimeto_czech',
    'umimeto_history': 'benczechmark_umimeto_history',
    'umimeto_informatics': 'benczechmark_umimeto_informatics',
    'umimeto_math': 'benczechmark_umimeto_math',
    'umimeto_physics': 'benczechmark_umimeto_physics',
    'cermat_czech_open': 'benczechmark_cermat_czech_open',
    'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
    'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
    'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
    'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
    'history_ir': 'benczechmark_history_ir',
    'benczechmark_histcorpus': "benczechmark_histcorpus",
    'benczechmark_hellaswag': "benczechmark_hellaswag",
    'benczechmark_essay': 'benczechmark_essay',
    'benczechmark_fiction': 'benczechmark_fiction',
    'benczechmark_capek': 'benczechmark_capek',
    'benczechmark_correspondence': 'benczechmark_correspondence',
    'benczechmark_havlicek': 'benczechmark_havlicek',
    'benczechmark_speeches': 'benczechmark_speeches',
    'benczechmark_spoken': 'benczechmark_spoken',
    'benczechmark_dialect': 'benczechmark_dialect'
}

NO_PROMPT_TASKS = ["benczechmark_histcorpus",
                   "benczechmark_hellaswag",
                   "benczechmark_essay",
                   "benczechmark_fiction",
                   "benczechmark_capek",
                   "benczechmark_correspondence",
                   "benczechmark_havlicek",
                   "benczechmark_speeches",
                   "benczechmark_spoken",
                   "benczechmark_dialect"]


def resolve_taskname(taskname):
    if taskname not in MAP:
        raise ValueError(f"Taskname {taskname} not found.")
    return MAP[taskname]


def rename_keys(d, resolve_taskname):
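    # renames keys in place, e.g. {'gec': ...} becomes {'benczechmark_grammarerrorcorrection': ...}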
    orig_len = len(d)
    for k, v in list(d.items()):
        new_key = resolve_taskname(k)
        d[new_key] = d.pop(k)

    # make sure the number of entries didn't change
    assert len(d) == orig_len


def process_harness_logs(input_folders, output_file):
    """
    - Selects the best prompt for each task
    - Extracts data for that prompt, necessary for the target metrics
    """
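    # Expected input layout (as assumed by the loading code below): each input folder contains a
    # single timestamped subfolder with one results*.json file and per-task samples*.jsonl files,
    # i.e. the output written by lm harness when sample logging is enabled.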

    def expand_input_folders(input_folders):
        # Check if input_folders is a wildcard pattern
        if '*' in input_folders or '?' in input_folders:
            # Expand the wildcard into a list of matching directories
            matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)]
            return matching_directories
        else:
            # If it's not a wildcard, return the input as a single-item list if it's a valid directory
            if os.path.isdir(input_folders):
                return [input_folders]
            else:
                return []

    input_folders = expand_input_folders(input_folders)

    per_task_results = {}
    metric_per_task = {}
    predictions = {}

    all_harness_results = dict()
    for input_folder in tqdm(input_folders, desc="Loading files"):
        # read all files in input_folder
        # consider first folder within this folder
        input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
        # find file which starts with results... prefix in the input_folder
        result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
        with open(os.path.join(input_folder, result_file), "r") as f:
            harness_results = json.load(f)
        all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results
        current_multipleprompt_tasknames = []
        for name, result in harness_results['results'].items():
            if name in NO_PROMPT_TASKS:
                # no prompt variants for this task
                taskname = name
                # process metric names
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        name, _ = k.split(",")
                        del result[k]
                        result[name] = v
                per_task_results[taskname] = result

            if result['alias'].strip().startswith('- prompt-'):
                # process taskname
                taskname = name[:-1]
                if taskname.endswith("_"):
                    taskname = taskname[:-1]

                # process metric names
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        name, key = k.split(",")
                        del result[k]
                        result[name] = v

                if taskname not in per_task_results:
                    per_task_results[taskname] = [result]
                    current_multipleprompt_tasknames.append(taskname)
                else:
                    per_task_results[taskname].append(result)

        # get best result according to metric priority given in SUPPORTED_METRICS list
        for taskname, results in per_task_results.items():
            if taskname not in current_multipleprompt_tasknames:
                continue
            best_result = None
            target_metric = None
            for m in SUPPORTED_METRICS:
                if m in results[0]:
                    target_metric = m
                    break
            if target_metric is None:
                raise ValueError(f"No supported metric found in {taskname}")
            metric_per_task[taskname] = target_metric

            all_measured_results = []
            for result in results:
                all_measured_results.append(result[target_metric])
                if best_result is None:
                    best_result = result
                elif result[target_metric] > best_result[target_metric]:
                    best_result = result
            # Compute max-centered variance
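            # (variance of the per-prompt scores taken around the best score instead of the mean,
            # computed in percentage points; it reflects how sensitive the score is to prompt choice)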
            max_value = best_result[target_metric]
            squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results]
            max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
            best_result['max_centered_variance'] = max_centered_variance

            per_task_results[taskname] = best_result

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            for taskname in per_task_results.keys():
                if taskname in file:
                    print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
                    # check this file corresponds to the same prompt
                    winning_prompt = per_task_results[taskname]['alias'][-1]
                    if taskname in NO_PROMPT_TASKS:
                        current_prompt = "-1"
                    else:
                        try:
                            current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
                        except AttributeError:
                            raise ValueError(f"Prompt not found in {file}")
                    if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
                        # load file contents
                        predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))
                        # only keep data necessary for metrics
                        for prediction in predictions[taskname]:
                            for key in list(prediction.keys()):
                                if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
                                    del prediction[key]

    # rename keys (tasknames) using resolve_taskname:
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # assert keys in predictions and results are the same
    # assert set(predictions.keys()) == set(per_task_results.keys())
    if not set(predictions.keys()) == set(per_task_results.keys()):
        # keys present in predictions but not in results
        print("Extra keys in predictions:")
        print(set(predictions.keys()) - set(per_task_results.keys()))
        # keys present in results but missing from predictions
        print("Missing keys in predictions:")
        print(set(per_task_results.keys()) - set(predictions.keys()))
        raise ValueError("Keys in predictions and results are not the same")

    aggregated_predictions = dict()
    aggregated_predictions["predictions"] = predictions
    aggregated_predictions["results"] = per_task_results
    aggregated_predictions["metadata"] = {
        'git_hash': harness_results['git_hash'],
        'transformers_version': harness_results['transformers_version'],
        'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
        'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
        'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
        'eot_token_id': harness_results['eot_token_id'],
        'max_length': harness_results['max_length'],
        'task_hashes': harness_results['task_hashes'],
        'model_source': harness_results['model_source'],
        'model_name': harness_results['model_name'],
        'model_name_sanitized': harness_results['model_name_sanitized'],
        'system_instruction': harness_results['system_instruction'],
        'system_instruction_sha': harness_results['system_instruction_sha'],
        'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
        'chat_template': harness_results['chat_template'],
        'chat_template_sha': harness_results['chat_template_sha'],
        'total_evaluation_time_seconds': {k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()},
        'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']
    }

    # make sure all tasks are present
    all_tasks = set(METADATA["tasks"].keys())
    all_expected_tasks = set(per_task_results.keys())
    all_missing_tasks = all_tasks - all_expected_tasks
    all_extra_tasks = all_expected_tasks - all_tasks
    if len(all_missing_tasks) > 0:
        EOLN = "\n"
        # print(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
        raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")  # TODO: uncomment
    if len(all_extra_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")
    with open(output_file, "w") as f:
        json.dump(aggregated_predictions, f)
    print("Success!")
    print("Output saved to", output_file)


def main():
    parser = argparse.ArgumentParser(
        description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.")
    parser.add_argument("-i", "-f", "--input_folder", "--folder",
                        help="Folder with unprocessed results from lm harness.", required=True)
    parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True)
    args = parser.parse_args()

    process_harness_logs(args.input_folder, args.output_file)


if __name__ == "__main__":
    main()
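
For reference, a minimal usage sketch (not part of the committed file), assuming the script is run from the Space root so that leaderboard/metadata.json resolves, and that each run folder holds raw lm harness outputs as described above; the paths below are hypothetical:

    # hypothetical paths; the wildcard expands to one run folder per evaluated task group
    from compile_log_files import process_harness_logs

    process_harness_logs(
        input_folders="harness_logs/my_model_*",    # raw lm harness output folders
        output_file="my_model_submission.json",     # aggregated file for leaderboard upload
    )

The command-line entry point is equivalent: python compile_log_files.py -i "harness_logs/my_model_*" -o my_model_submission.json.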