data_only_hallucination_leaderboard

Runtime error

App Files Files Community

pminervini committed on Feb 7

Commit

c5558c5

•

1 Parent(s): a654acb

update

Browse files

Files changed (1) hide show

backend-cli.py +77 -14

backend-cli.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import os
 import json
 import random
 from datetime import datetime
@@ -17,6 +18,10 @@ from src.leaderboard.read_evals import EvalResult
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
 from src.utils import my_snapshot_download
 import time
 import logging
@@ -124,15 +129,11 @@ def process_finished_requests(thr: int) -> bool:
     # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
     eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
-    # XXX
-    # eval_requests = [r for r in eval_requests if 'bloom-560m' in r.model]
     random.shuffle(eval_requests)
-    from src.leaderboard.read_evals import get_raw_eval_results
     eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
     result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
@@ -143,9 +144,10 @@ def process_finished_requests(thr: int) -> bool:
             result_name: str = request_to_result_name(eval_request)
             # Check the corresponding result
-            from typing import Optional
             eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
             task_lst = TASKS_HARNESS.copy()
             random.shuffle(task_lst)
@@ -169,6 +171,58 @@ def process_finished_requests(thr: int) -> bool:
     return False
 def process_pending_requests() -> bool:
     sanity_checks()
@@ -176,7 +230,7 @@ def process_pending_requests() -> bool:
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
     random.shuffle(eval_requests)
@@ -207,19 +261,28 @@ def process_pending_requests() -> bool:
 if __name__ == "__main__":
     wait = True
-    import socket
-    if socket.gethostname() in {'hamburg'} or os.path.isdir("/home/pminervi"):
         wait = False
     if wait:
         time.sleep(60 * random.randint(5, 10))
-        pass
-    # res = False
-    res = process_pending_requests()
     if res is False:
-        res = process_finished_requests(100)
     if res is False:
-        res = process_finished_requests(0)

 import os
 import json
+import socket
 import random
 from datetime import datetime
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
 from src.utils import my_snapshot_download
+from src.leaderboard.read_evals import get_raw_eval_results
+from typing import Optional
 import time
 import logging
     # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted, first run)
     eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
     random.shuffle(eval_requests)
     eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
     result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
             result_name: str = request_to_result_name(eval_request)
             # Check the corresponding result
             eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
+            breakpoint()
             task_lst = TASKS_HARNESS.copy()
             random.shuffle(task_lst)
     return False
+def maybe_refresh_results(thr: int) -> bool:
+    """Run at most one harness task for a request with at least ``thr`` likes.
+
+    Requests in PENDING, FINISHED or FAILED status are visited in random order.
+    For the first request/task pair whose result is missing -- or whose task name
+    contains 'nq', 'trivia', 'tqa' or 'self', which are always re-run -- the
+    request is set to RUNNING, ``process_evaluation`` is called for that task, and
+    the request is then set to FINISHED.
+
+    Args:
+        thr: Minimum value of ``eval_request.likes`` for a request to be considered.
+
+    Returns:
+        True if an evaluation was run, False if no request/task pair qualified.
+    """
+    sanity_checks()
+    current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
+    # Get all eval requests that are PENDING, FINISHED or FAILED; change the list above to target other statuses
+    eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted, first run)
+    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+    # NOTE(review): the shuffle below discards the priority order established above -- confirm this is intended
+    random.shuffle(eval_requests)
+    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
+    result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
+    result_name_to_result = {r.eval_name: r for r in eval_results}
+    for eval_request in eval_requests:
+        if eval_request.likes >= thr:
+            result_name: str = request_to_result_name(eval_request)
+            # Check the corresponding result (None when this request has no results yet)
+            eval_result: Optional[EvalResult] = result_name_to_result.get(result_name)
+            # No breakpoint() here: this runs unattended, and dropping into the debugger would stall or abort the job
+            task_lst = TASKS_HARNESS.copy()
+            random.shuffle(task_lst)
+            # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
+            for task in task_lst:
+                task_name = task.benchmark
+                if (eval_result is None or
+                        task_name not in eval_result.results or
+                        'nq' in task_name or 'trivia' in task_name or 'tqa' in task_name or 'self' in task_name):
+                    # Look the request up by result name; if several requests share a name, the mapping keeps the last one
+                    eval_request: EvalRequest = result_name_to_request[result_name]
+                    # Refresh the local copy of the queue before each status change
+                    my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+                    my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+                    process_evaluation(task, eval_request)
+                    my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+                    my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+                    # Only one evaluation per call; the caller decides whether to run again
+                    return True
+    return False
 def process_pending_requests() -> bool:
     sanity_checks()
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted, first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
     random.shuffle(eval_requests)
 if __name__ == "__main__":
     wait = True
+    if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
         wait = False
     if wait:
         time.sleep(60 * random.randint(5, 10))
+    res = False
+    if random.randint(0, 1) == 0:
+        res = process_pending_requests()
+        time.sleep(60)
     if res is False:
+        if random.randint(0, 1) == 0:
+            res = maybe_refresh_results(100)
+        else:
+            res = process_finished_requests(100)
+    time.sleep(60)
     if res is False:
+        if random.randint(0, 1) == 0:
+            res = maybe_refresh_results(0)
+        else:
+            res = process_finished_requests(0)