pminervini committed on
Commit
40ac231
1 Parent(s): 4e10b3e
fix-requests-cli.py CHANGED
@@ -41,7 +41,7 @@ for path in json_files:
         data["model_type"] = "fine-tuned"
         to_overwrite = True
 
-    is_instruction_tuned = 'instruct' in model_id
+    is_instruction_tuned = 'nstruct' in model_id
     if is_instruction_tuned:
         data["model_type"] = "instruction-tuned"
         to_overwrite = True
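Note: shortening the needle from 'instruct' to 'nstruct' makes the check catch both the capitalised and lower-case spellings that appear in model ids. A minimal standalone sketch of the behaviour (the model ids below are only illustrative):

    # Why 'nstruct' instead of 'instruct': the shorter needle matches
    # "Instruct" as well as "instruct" without lower-casing the id.
    for model_id in ["org/Mistral-7B-Instruct", "org/llama-2-instruct", "org/llama-2-base"]:
        is_instruction_tuned = 'nstruct' in model_id
        print(model_id, "->", "instruction-tuned" if is_instruction_tuned else "not instruction-tuned")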
src/display/utils.py CHANGED
@@ -27,6 +27,7 @@ class Tasks(Enum):
     triviaqa = Task("triviaqa", "em", "TriviaQA")
     truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")
     truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
+    halueval_qa = Task("halueval_qa", "em", "HaluEval QA")
     #truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")
     #truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
 
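For context, downstream code reads each entry through task.value and uses its benchmark and metric fields (see the task.benchmark / task.metric accesses in src/leaderboard/read_evals.py below). A minimal sketch, assuming it is run from the repository root:

    # List the benchmarks and metrics the leaderboard aggregates,
    # including the new halueval_qa entry.
    from src.display.utils import Tasks

    for task in Tasks:
        task = task.value
        print(task.benchmark, task.metric)  # e.g. "halueval_qa" "em"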
 
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
@@ -32,7 +31,7 @@ class EvalResult:
     still_on_hub: bool = False
 
     @staticmethod
-    def init_from_json_file(json_filepath):
+    def init_from_json_file(json_filepath, is_backend: bool = False):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -67,79 +66,13 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
-            task = task.value
-
-            def post_process_results(results: dict) -> dict:
-                # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
-                res_copy = results.copy()
-
-                for task_name in res_copy.keys():
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "exact_match" in k:
-                            results[task_name][k.replace("exact_match", "em")] = v
-
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "," in k:
-                            tokens = k.split(",")
-                            results[task_name][tokens[0]] = v
-
-                return results
-
-            accs = np.array([v.get(task.metric, None) for k, v in post_process_results(data["results"]).items() if task.benchmark in k])
-
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
-                          precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
-                          architecture=architecture)
-
-    @staticmethod
-    def init_from_json_file_backend(json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        # We manage the legacy config format
-        config = data.get("config", data.get("config_general", None))
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, error, model_config = \
-            is_model_on_hub(full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False)
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        from src.backend.envs import Tasks as BackendTasks
-        for task in BackendTasks:
+
+        task_iterator = Tasks
+        if is_backend is True:
+            from src.backend.envs import Tasks as BackendTasks
+            task_iterator = BackendTasks
+
+        for task in task_iterator:
             task = task.value
 
             def post_process_results(results: dict) -> dict:
@@ -267,10 +200,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        if is_backend:
-            eval_result = EvalResult.init_from_json_file_backend(model_result_filepath)
-        else:
-            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
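The hunks above fold the former init_from_json_file_backend duplicate into init_from_json_file behind an is_backend flag, so callers pick the task enum with a single argument. A minimal usage sketch (the JSON path is a placeholder):

    # One parser for both the leaderboard frontend and the backend:
    # only the Tasks enum iterated over differs.
    from src.leaderboard.read_evals import EvalResult

    ui_result = EvalResult.init_from_json_file("results/some-model.json")
    backend_result = EvalResult.init_from_json_file("results/some-model.json", is_backend=True)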
submit-cli.py CHANGED
@@ -2,12 +2,18 @@
 
 import json
 import os
+import time
 
 from datetime import datetime, timezone
 
 from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO
 from src.submission.check_validity import already_submitted_models, get_model_size, is_model_on_hub
 
+from huggingface_hub import snapshot_download
+from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND
+from src.backend.manage_requests import get_eval_requests
+from src.backend.manage_requests import EvalRequest
+
 
 def add_new_eval(model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str):
     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
@@ -118,32 +124,40 @@ def main():
 
     filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
 
-    for i in range(min(200, len(filtered_model_lst))):
-        model = filtered_model_lst[i]
-
-        print(f'Considering {model.id} ..')
-
-        from huggingface_hub import snapshot_download
-        from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND
-        from src.backend.manage_requests import get_eval_requests
-        from src.backend.manage_requests import EvalRequest
-
-        snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-
-        PENDING_STATUS = "PENDING"
-        RUNNING_STATUS = "RUNNING"
-        FINISHED_STATUS = "FINISHED"
-        FAILED_STATUS = "FAILED"
-
-        status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
-
-        # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
-        eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-
-        requested_model_names = {e.model for e in eval_requests}
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+    PENDING_STATUS = "PENDING"
+    RUNNING_STATUS = "RUNNING"
+    FINISHED_STATUS = "FINISHED"
+    FAILED_STATUS = "FAILED"
+
+    status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
+
+    # Get all eval requests
+    eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+
+    requested_model_names = {e.model for e in eval_requests}
+
+    for i in range(min(200, len(filtered_model_lst))):
+        model = filtered_model_lst[i]
+
+        print(f'Considering {model.id} ..')
+
+        is_finetuned = any(tag.startswith('base_model:') for tag in model.tags)
+
+        model_type = 'pretrained'
+        if is_finetuned:
+            model_type = "fine-tuned"
+
+        is_instruction_tuned = 'nstruct' in model.id
+        if is_instruction_tuned:
+            model_type = "instruction-tuned"
 
         if model.id not in requested_model_names:
-            add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type='pretrained')
+
+            if 'mage' not in model.id:
+                add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
+                time.sleep(60)
         else:
            print(f'Model {model.id} already added, not adding it to the queue again.')
 
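In main(), the queue snapshot and the get_eval_requests call now run once before the loop instead of once per candidate model, each submission is followed by time.sleep(60) to space out Hub writes, and the submitted model_type is inferred from the model's tags and id. A standalone sketch of that inference (infer_model_type is a hypothetical helper written here only to illustrate the checks in the hunk above):

    def infer_model_type(model_id: str, tags: list[str]) -> str:
        # Mirrors the diff: a base_model tag marks a fine-tune, and an id
        # containing "nstruct" marks an instruction-tuned model.
        model_type = 'pretrained'
        if any(tag.startswith('base_model:') for tag in tags):
            model_type = "fine-tuned"
        if 'nstruct' in model_id:
            model_type = "instruction-tuned"
        return model_type

    print(infer_model_type("org/some-model-Instruct", ["base_model:org/some-model"]))  # instruction-tuned
    print(infer_model_type("org/some-model", []))                                      # pretrained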