pminervini committed on
Commit 669da77
1 Parent(s): e6299b2
app.py CHANGED
@@ -36,18 +36,16 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

+
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
+    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 except Exception:
     restart_space()
+
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
+    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 except Exception:
     restart_space()

@@ -58,23 +56,12 @@ leaderboard_df = original_df.copy()

 # plot_df = create_plot_df(create_scores_df(raw_data))

-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


 # Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
-):
+def update_table(hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: str, size_query: list,
+                 show_deleted: bool, query: str):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
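app.py keeps its existing fallback of restarting the Space whenever syncing the requests or results dataset fails; the commit only collapses each snapshot_download call onto a single line and compacts the update_table signature. A minimal sketch of that download-or-restart pattern, using placeholder repo and directory names and a stubbed restart callback rather than the Space's real constants:

from huggingface_hub import snapshot_download


def download_or_restart(repo_id: str, local_dir: str, on_failure) -> None:
    # Hypothetical helper mirroring the try/except in app.py: any failure to sync
    # a dataset snapshot triggers a Space restart instead of serving stale data.
    try:
        snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset",
                          tqdm_class=None, etag_timeout=30)
    except Exception:
        on_failure()


# Example with made-up values:
# download_or_restart("some-org/eval-requests", "./eval-queue", on_failure=lambda: print("restarting..."))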
backend-cli.py CHANGED
@@ -8,15 +8,16 @@ from huggingface_hub import snapshot_download
 from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
-from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND,EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
+from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
+
+from src.backend.manage_requests import EvalRequest
+from src.leaderboard.read_evals import EvalResult

 from src.envs import QUEUE_REPO, RESULTS_REPO, API

 import logging
 import pprint

-# TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-
 logging.getLogger("openai").setLevel(logging.WARNING)

 logging.basicConfig(level=logging.ERROR)
@@ -27,18 +28,102 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"

+TASKS_HARNESS = [task.value for task in Tasks]
+
 snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)


-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
+def sanity_checks():
+    print(f'Device: {DEVICE}')

     # pull the eval dataset from the hub and parse any eval requests
     # check completed evals and set them to finished
     check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
                           failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
                           hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)
+    return
+
+
+def request_to_result_name(request: EvalRequest) -> str:
+    # Request: EvalRequest(model='meta-llama/Llama-2-13b-hf', private=False, status='FINISHED',
+    # json_filepath='./eval-queue-bk/meta-llama/Llama-2-13b-hf_eval_request_False_False_False.json',
+    # weight_type='Original', model_type='pretrained', precision='float32', base_model='', revision='main',
+    # submitted_time='2023-09-09T10:52:17Z', likes=389, params=13.016, license='?')
+    #
+    # EvalResult(eval_name='meta-llama_Llama-2-13b-hf_float32', full_model='meta-llama/Llama-2-13b-hf',
+    # org='meta-llama', model='Llama-2-13b-hf', revision='main',
+    # results={'nq_open': 33.739612188365655, 'triviaqa': 74.12505572893447},
+    # precision=<Precision.float32: ModelDetails(name='float32', symbol='')>,
+    # model_type=<ModelType.PT: ModelDetails(name='pretrained', symbol='🟢')>,
+    # weight_type=<WeightType.Original: ModelDetails(name='Original', symbol='')>,
+    # architecture='LlamaForCausalLM', license='?', likes=389, num_params=13.016, date='2023-09-09T10:52:17Z', still_on_hub=True)
+    #
+    org_and_model = request.model.split("/", 1)
+    if len(org_and_model) == 1:
+        model = org_and_model[0]
+        res = f"{model}_{request.precision}"
+    else:
+        org = org_and_model[0]
+        model = org_and_model[1]
+        res = f"{org}_{model}_{request.precision}"
+    return res
+
+
+def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
+    results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
+                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
+
+    dumped = json.dumps(results, indent=2)
+    print(dumped)
+
+    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w") as f:
+        f.write(dumped)
+
+    API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+                    repo_id=RESULTS_REPO, repo_type="dataset")
+    return results
+
+
+def process_finished_requests() -> bool:
+    sanity_checks()
+
+    current_finished_status = [FINISHED_STATUS]
+
+    # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
+    eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted first run)
+    eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
+
+    from src.leaderboard.read_evals import get_raw_eval_results
+    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
+
+    result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
+    result_name_to_result = {r.eval_name: r for r in eval_results}
+
+    for eval_request in eval_requests:
+        result_name: str = request_to_result_name(eval_request)
+
+        # Check the corresponding result
+        eval_result: EvalResult = result_name_to_result[result_name]
+
+        # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
+        for task in TASKS_HARNESS:
+            task_name = task.benchmark
+
+            if task_name not in eval_result.results:
+                results = process_evaluation(task, eval_request)
+                return True
+
+    return False
+
+
+def process_pending_requests() -> bool:
+    sanity_checks()
+
+    current_pending_status = [PENDING_STATUS]

     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
@@ -48,7 +133,7 @@ def run_auto_eval():
     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

     if len(eval_requests) == 0:
-        return
+        return False

     eval_request = eval_requests[0]
     pp.pprint(eval_request)
@@ -56,33 +141,17 @@ def run_auto_eval():
     set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)

-    # results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
-    #                          batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
-
-    TASKS_HARNESS = [task.value for task in Tasks]
-
-    print(f'Device: {DEVICE}')
-
     for task in TASKS_HARNESS:
-        results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
-                                 batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
-
-        dumped = json.dumps(results, indent=2)
-        print(dumped)
-
-        output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        with open(output_path, "w") as f:
-            f.write(dumped)
-
-        API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-                        repo_id=RESULTS_REPO, repo_type="dataset")
+        results = process_evaluation(task, eval_request)

     set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)

-    # breakpoint()
+    return True


 if __name__ == "__main__":
-    run_auto_eval()
+    res = process_pending_requests()
+
+    if res is False:
+        res = process_finished_requests()
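The backend-cli.py change splits the old run_auto_eval into process_pending_requests (evaluate a newly submitted model) and process_finished_requests (backfill tasks that an already finished model is still missing). The two sides are matched through a shared key: request_to_result_name turns an EvalRequest into the "{org}_{model}_{precision}" string that EvalResult.eval_name uses. A self-contained sketch of that keying logic, with SimpleRequest as a simplified stand-in for the repo's EvalRequest:

from dataclasses import dataclass


@dataclass
class SimpleRequest:
    # Simplified stand-in for src.backend.manage_requests.EvalRequest.
    model: str
    precision: str


def request_to_result_name(request: SimpleRequest) -> str:
    # "org/model" -> "org_model_precision"; a bare model name keeps "model_precision".
    org_and_model = request.model.split("/", 1)
    if len(org_and_model) == 1:
        return f"{org_and_model[0]}_{request.precision}"
    return f"{org_and_model[0]}_{org_and_model[1]}_{request.precision}"


assert request_to_result_name(SimpleRequest("meta-llama/Llama-2-13b-hf", "float32")) == "meta-llama_Llama-2-13b-hf_float32"
assert request_to_result_name(SimpleRequest("gpt2", "float16")) == "gpt2_float16"

Note that process_finished_requests returns True right after the first missing task it evaluates, so each run of the script does at most one unit of work: a pending request if there is one, otherwise a single missing (task, model) pair.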
src/backend/manage_requests.py CHANGED
@@ -112,3 +112,4 @@ def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_stat
         else:
             print(f"No result file found for {model} setting it to {failed_status}")
             set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
+
src/backend/run_eval_suite.py CHANGED
@@ -6,7 +6,7 @@ import logging
 logging.getLogger("openai").setLevel(logging.WARNING)


-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None):
+def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None) -> dict:
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

@@ -14,18 +14,11 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz

     print(f"Selected Tasks: {task_names}")

-    results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
-        model_args=eval_request.get_model_args(),
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        no_cache=no_cache,
-        limit=limit,
-        write_out=True,
-        output_base_path="logs"
-    )
+    results = evaluator.simple_evaluate(model="hf-causal-experimental", # "hf-causal"
+                                        model_args=eval_request.get_model_args(),
+                                        tasks=task_names, num_fewshot=num_fewshot,
+                                        batch_size=batch_size, device=device, no_cache=no_cache,
+                                        limit=limit, write_out=True, output_base_path="logs")

     results["config"]["model_dtype"] = eval_request.precision
     results["config"]["model_name"] = eval_request.model
src/leaderboard/read_evals.py CHANGED
@@ -31,8 +31,8 @@ class EvalResult:
     date: str = "" # submission date of request file
     still_on_hub: bool = False

-    @classmethod
-    def init_from_json_file(self, json_filepath):
+    @staticmethod
+    def init_from_json_file(json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -93,7 +93,7 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

-        print(json_filepath, results)
+        # print(json_filepath, results)

         # XXX
         # if 'nq_open' not in results:
@@ -103,9 +103,9 @@ class EvalResult:
         # if 'triviaqa' not in results:
         #     results['triviaqa'] = 0.0

-        return self(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
-                    precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
-                    architecture=architecture)
+        return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
+                          precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
+                          architecture=architecture)

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
@@ -210,7 +210,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
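In read_evals.py, init_from_json_file was declared a @classmethod but named its first parameter self and constructed instances via self(...); since the first argument of a classmethod is actually the class, this worked, but it read like an instance method. The commit switches to a @staticmethod that names EvalResult explicitly (at the cost of subclass-aware construction). A minimal illustration of the new style, using a simplified Result stand-in rather than the real EvalResult:

import json
from dataclasses import dataclass, field


@dataclass
class Result:
    # Simplified stand-in for EvalResult, only to illustrate the constructor pattern.
    eval_name: str
    results: dict = field(default_factory=dict)

    @staticmethod
    def init_from_json_file(json_filepath: str) -> "Result":
        # Post-change style: a staticmethod that names the class explicitly,
        # instead of a classmethod whose first parameter was called `self`.
        with open(json_filepath) as fp:
            data = json.load(fp)
        return Result(eval_name=data["eval_name"], results=data.get("results", {}))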