Yaofu3 commited on
Commit
d6d7ec6
1 Parent(s): 2d754ab

formatting code

Browse files
app.py CHANGED
@@ -19,7 +19,7 @@ from src.display.about import (
19
  LLM_BENCHMARKS_TEXT,
20
  LLM_BENCHMARKS_DETAILS,
21
  FAQ_TEXT,
22
- TITLE
23
  )
24
 
25
  from src.display.css_html_js import custom_css
@@ -35,7 +35,7 @@ from src.display.utils import (
35
  ModelType,
36
  fields,
37
  WeightType,
38
- Precision
39
  )
40
 
41
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
@@ -47,7 +47,9 @@ from src.utils import get_dataset_summary_table
47
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
48
  try:
49
  print(local_dir)
50
- snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout)
 
 
51
  except Exception as e:
52
  restart_space()
53
 
@@ -57,15 +59,21 @@ def restart_space():
57
 
58
 
59
  def init_space():
60
- dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
61
 
62
- if socket.gethostname() not in {'neuromancer'}:
63
  # sync model_type with open-llm-leaderboard
64
- ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
65
- ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
 
 
 
66
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
67
 
68
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
69
  return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
70
 
71
 
@@ -74,12 +82,9 @@ leaderboard_df = original_df.copy()
74
 
75
 
76
  # Searching and filtering
77
- def update_table(hidden_df: pd.DataFrame,
78
- columns: list,
79
- type_query: list,
80
- precision_query: list,
81
- size_query: list,
82
- query: str):
83
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
84
  filtered_df = filter_queries(query, filtered_df)
85
  df = select_columns(filtered_df, columns)
@@ -99,7 +104,9 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
99
  # We use COLS to maintain sorting
100
  filtered_df = df[
101
  # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
102
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
 
 
103
  ]
104
  return filtered_df
105
 
@@ -121,10 +128,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
121
  return filtered_df
122
 
123
 
124
- def filter_models(df: pd.DataFrame,
125
- type_query: list,
126
- size_query: list,
127
- precision_query: list) -> pd.DataFrame:
128
  # Show all models
129
  filtered_df = df
130
 
@@ -152,15 +156,15 @@ with demo:
152
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
153
 
154
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
155
- with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark",
156
- elem_id="llm-benchmark-tab-table",
157
- id=0):
158
  with gr.Row():
159
  with gr.Column():
160
  with gr.Row():
161
- search_bar = gr.Textbox(placeholder=" 🔍 Model search (separate multiple queries with `;`)",
162
- show_label=False,
163
- elem_id="search-bar")
 
 
164
  with gr.Row():
165
  shown_columns = gr.CheckboxGroup(
166
  choices=[
@@ -175,7 +179,8 @@ with demo:
175
  ],
176
  label="Select columns to show",
177
  elem_id="column-select",
178
- interactive=True)
 
179
 
180
  with gr.Column(min_width=320):
181
  filter_columns_type = gr.CheckboxGroup(
@@ -183,40 +188,51 @@ with demo:
183
  choices=[t.to_str() for t in ModelType],
184
  value=[t.to_str() for t in ModelType],
185
  interactive=True,
186
- elem_id="filter-columns-type")
 
187
 
188
  filter_columns_precision = gr.CheckboxGroup(
189
  label="Precision",
190
  choices=[i.value.name for i in Precision],
191
  value=[i.value.name for i in Precision],
192
  interactive=True,
193
- elem_id="filter-columns-precision")
 
194
 
195
  filter_columns_size = gr.CheckboxGroup(
196
  label="Model sizes (in billions of parameters)",
197
  choices=list(NUMERIC_INTERVALS.keys()),
198
  value=list(NUMERIC_INTERVALS.keys()),
199
  interactive=True,
200
- elem_id="filter-columns-size")
 
201
 
202
  # breakpoint()
203
 
204
  leaderboard_table = gr.components.Dataframe(
205
- value=leaderboard_df[
206
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name]
207
- ] if leaderboard_df.empty is False else leaderboard_df,
 
 
 
 
 
 
208
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
209
  datatype=TYPES,
210
  elem_id="leaderboard-table",
211
  interactive=False,
212
- visible=True) # column_widths=["2%", "20%"]
 
213
 
214
  # Dummy leaderboard for handling the case when the user uses backspace key
215
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
216
  value=original_df[COLS] if original_df.empty is False else original_df,
217
  headers=COLS,
218
  datatype=TYPES,
219
- visible=False)
 
220
 
221
  search_bar.submit(
222
  update_table,
@@ -228,7 +244,8 @@ with demo:
228
  filter_columns_size,
229
  search_bar,
230
  ],
231
- leaderboard_table)
 
232
 
233
  # Check query parameter once at startup and update search bar
234
  demo.load(load_query, inputs=[], outputs=[search_bar])
@@ -245,7 +262,8 @@ with demo:
245
  search_bar,
246
  ],
247
  leaderboard_table,
248
- queue=True)
 
249
 
250
  with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
251
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -253,11 +271,12 @@ with demo:
253
  dataset_table = gr.components.Dataframe(
254
  value=dataset_df,
255
  headers=list(dataset_df.columns),
256
- datatype=['str', 'markdown', 'str', 'str', 'str'],
257
  elem_id="dataset-table",
258
  interactive=False,
259
  visible=True,
260
- column_widths=["15%", "20%"])
 
261
 
262
  gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
263
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
@@ -271,26 +290,20 @@ with demo:
271
  with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
272
  with gr.Row():
273
  finished_eval_table = gr.components.Dataframe(
274
- value=finished_eval_queue_df,
275
- headers=EVAL_COLS,
276
- datatype=EVAL_TYPES,
277
- row_count=5)
278
 
279
  with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
280
  with gr.Row():
281
  running_eval_table = gr.components.Dataframe(
282
- value=running_eval_queue_df,
283
- headers=EVAL_COLS,
284
- datatype=EVAL_TYPES,
285
- row_count=5)
286
 
287
  with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
288
  with gr.Row():
289
  pending_eval_table = gr.components.Dataframe(
290
- value=pending_eval_queue_df,
291
- headers=EVAL_COLS,
292
- datatype=EVAL_TYPES,
293
- row_count=5)
294
 
295
  with gr.Row():
296
  gr.Markdown("# Submit your model here", elem_classes="markdown-text")
@@ -305,7 +318,8 @@ with demo:
305
  label="Model type",
306
  multiselect=False,
307
  value=None,
308
- interactive=True)
 
309
 
310
  with gr.Column():
311
  precision = gr.Dropdown(
@@ -313,14 +327,16 @@ with demo:
313
  label="Precision",
314
  multiselect=False,
315
  value="float32",
316
- interactive=True)
 
317
 
318
  weight_type = gr.Dropdown(
319
  choices=[i.value.name for i in WeightType],
320
  label="Weights type",
321
  multiselect=False,
322
  value="Original",
323
- interactive=True)
 
324
 
325
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
326
 
@@ -337,7 +353,8 @@ with demo:
337
  weight_type,
338
  model_type,
339
  ],
340
- submission_result)
 
341
 
342
  with gr.Row():
343
  with gr.Accordion("Citing this leaderboard", open=False):
@@ -346,7 +363,8 @@ with demo:
346
  label=CITATION_BUTTON_LABEL,
347
  lines=20,
348
  elem_id="citation-button",
349
- show_copy_button=True)
 
350
 
351
  scheduler = BackgroundScheduler()
352
 
@@ -356,7 +374,8 @@ scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
356
  def launch_backend():
357
  import subprocess
358
  from src.backend.envs import DEVICE
359
- if DEVICE not in {'cpu'}:
 
360
  _ = subprocess.run(["python", "backend-cli.py"])
361
 
362
 
 
19
  LLM_BENCHMARKS_TEXT,
20
  LLM_BENCHMARKS_DETAILS,
21
  FAQ_TEXT,
22
+ TITLE,
23
  )
24
 
25
  from src.display.css_html_js import custom_css
 
35
  ModelType,
36
  fields,
37
  WeightType,
38
+ Precision,
39
  )
40
 
41
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 
47
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
48
  try:
49
  print(local_dir)
50
+ snapshot_download(
51
+ repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout
52
+ )
53
  except Exception as e:
54
  restart_space()
55
 
 
59
 
60
 
61
  def init_space():
62
+ dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
63
 
64
+ if socket.gethostname() not in {"neuromancer"}:
65
  # sync model_type with open-llm-leaderboard
66
+ ui_snapshot_download(
67
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
68
+ )
69
+ ui_snapshot_download(
70
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
71
+ )
72
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
73
 
74
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
75
+ EVAL_REQUESTS_PATH, EVAL_COLS
76
+ )
77
  return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
78
 
79
 
 
82
 
83
 
84
  # Searching and filtering
85
+ def update_table(
86
+ hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str
87
+ ):
 
 
 
88
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
89
  filtered_df = filter_queries(query, filtered_df)
90
  df = select_columns(filtered_df, columns)
 
104
  # We use COLS to maintain sorting
105
  filtered_df = df[
106
  # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
107
+ always_here_cols
108
+ + [c for c in COLS if c in df.columns and c in columns]
109
+ + dummy_col
110
  ]
111
  return filtered_df
112
 
 
128
  return filtered_df
129
 
130
 
131
+ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list) -> pd.DataFrame:
 
 
 
132
  # Show all models
133
  filtered_df = df
134
 
 
156
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
157
 
158
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
159
+ with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark", elem_id="llm-benchmark-tab-table", id=0):
 
 
160
  with gr.Row():
161
  with gr.Column():
162
  with gr.Row():
163
+ search_bar = gr.Textbox(
164
+ placeholder=" 🔍 Model search (separate multiple queries with `;`)",
165
+ show_label=False,
166
+ elem_id="search-bar",
167
+ )
168
  with gr.Row():
169
  shown_columns = gr.CheckboxGroup(
170
  choices=[
 
179
  ],
180
  label="Select columns to show",
181
  elem_id="column-select",
182
+ interactive=True,
183
+ )
184
 
185
  with gr.Column(min_width=320):
186
  filter_columns_type = gr.CheckboxGroup(
 
188
  choices=[t.to_str() for t in ModelType],
189
  value=[t.to_str() for t in ModelType],
190
  interactive=True,
191
+ elem_id="filter-columns-type",
192
+ )
193
 
194
  filter_columns_precision = gr.CheckboxGroup(
195
  label="Precision",
196
  choices=[i.value.name for i in Precision],
197
  value=[i.value.name for i in Precision],
198
  interactive=True,
199
+ elem_id="filter-columns-precision",
200
+ )
201
 
202
  filter_columns_size = gr.CheckboxGroup(
203
  label="Model sizes (in billions of parameters)",
204
  choices=list(NUMERIC_INTERVALS.keys()),
205
  value=list(NUMERIC_INTERVALS.keys()),
206
  interactive=True,
207
+ elem_id="filter-columns-size",
208
+ )
209
 
210
  # breakpoint()
211
 
212
  leaderboard_table = gr.components.Dataframe(
213
+ value=(
214
+ leaderboard_df[
215
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
216
+ + shown_columns.value
217
+ + [AutoEvalColumn.dummy.name]
218
+ ]
219
+ if leaderboard_df.empty is False
220
+ else leaderboard_df
221
+ ),
222
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
223
  datatype=TYPES,
224
  elem_id="leaderboard-table",
225
  interactive=False,
226
+ visible=True,
227
+ ) # column_widths=["2%", "20%"]
228
 
229
  # Dummy leaderboard for handling the case when the user uses backspace key
230
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
231
  value=original_df[COLS] if original_df.empty is False else original_df,
232
  headers=COLS,
233
  datatype=TYPES,
234
+ visible=False,
235
+ )
236
 
237
  search_bar.submit(
238
  update_table,
 
244
  filter_columns_size,
245
  search_bar,
246
  ],
247
+ leaderboard_table,
248
+ )
249
 
250
  # Check query parameter once at startup and update search bar
251
  demo.load(load_query, inputs=[], outputs=[search_bar])
 
262
  search_bar,
263
  ],
264
  leaderboard_table,
265
+ queue=True,
266
+ )
267
 
268
  with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
269
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
271
  dataset_table = gr.components.Dataframe(
272
  value=dataset_df,
273
  headers=list(dataset_df.columns),
274
+ datatype=["str", "markdown", "str", "str", "str"],
275
  elem_id="dataset-table",
276
  interactive=False,
277
  visible=True,
278
+ column_widths=["15%", "20%"],
279
+ )
280
 
281
  gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
282
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
290
  with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
291
  with gr.Row():
292
  finished_eval_table = gr.components.Dataframe(
293
+ value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
294
+ )
 
 
295
 
296
  with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
297
  with gr.Row():
298
  running_eval_table = gr.components.Dataframe(
299
+ value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
300
+ )
 
 
301
 
302
  with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
303
  with gr.Row():
304
  pending_eval_table = gr.components.Dataframe(
305
+ value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
306
+ )
 
 
307
 
308
  with gr.Row():
309
  gr.Markdown("# Submit your model here", elem_classes="markdown-text")
 
318
  label="Model type",
319
  multiselect=False,
320
  value=None,
321
+ interactive=True,
322
+ )
323
 
324
  with gr.Column():
325
  precision = gr.Dropdown(
 
327
  label="Precision",
328
  multiselect=False,
329
  value="float32",
330
+ interactive=True,
331
+ )
332
 
333
  weight_type = gr.Dropdown(
334
  choices=[i.value.name for i in WeightType],
335
  label="Weights type",
336
  multiselect=False,
337
  value="Original",
338
+ interactive=True,
339
+ )
340
 
341
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
342
 
 
353
  weight_type,
354
  model_type,
355
  ],
356
+ submission_result,
357
+ )
358
 
359
  with gr.Row():
360
  with gr.Accordion("Citing this leaderboard", open=False):
 
363
  label=CITATION_BUTTON_LABEL,
364
  lines=20,
365
  elem_id="citation-button",
366
+ show_copy_button=True,
367
+ )
368
 
369
  scheduler = BackgroundScheduler()
370
 
 
374
  def launch_backend():
375
  import subprocess
376
  from src.backend.envs import DEVICE
377
+
378
+ if DEVICE not in {"cpu"}:
379
  _ = subprocess.run(["python", "backend-cli.py"])
380
 
381
 
backend-cli.py CHANGED
@@ -32,7 +32,9 @@ import pprint
32
  def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
33
  for i in range(10):
34
  try:
35
- set_eval_request(api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir)
 
 
36
  return
37
  except Exception as e:
38
  print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
@@ -53,19 +55,32 @@ FAILED_STATUS = "FAILED"
53
  TASKS_HARNESS = [task.value for task in Tasks]
54
 
55
 
56
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
57
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
58
 
59
 
60
  def sanity_checks():
61
- print(f'Device: {DEVICE}')
62
 
63
  # pull the eval dataset from the hub and parse any eval requests
64
  # check completed evals and set them to finished
65
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
66
- check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
67
- failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
68
- hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
69
  return
70
 
71
 
@@ -97,29 +112,51 @@ def request_to_result_name(request: EvalRequest) -> str:
97
  def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
98
  batch_size = 2
99
  try:
100
- results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
101
- batch_size=batch_size, device=DEVICE, use_cache=None, limit=LIMIT)
 
 
 
 
 
 
 
102
  except RuntimeError as e:
103
  if "No executable batch size found" in str(e):
104
  batch_size = 1
105
- results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
106
- batch_size=batch_size, device=DEVICE, use_cache=None, limit=LIMIT)
 
 
 
 
 
 
 
107
  else:
108
  raise
109
 
110
- print('RESULTS', results)
111
 
112
- dumped = json.dumps(results, indent=2, default=lambda o: '<not serializable>')
113
  print(dumped)
114
 
115
- output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
 
 
116
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
117
  with open(output_path, "w") as f:
118
  f.write(dumped)
119
 
120
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
121
- API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
122
- repo_id=RESULTS_REPO, repo_type="dataset")
 
 
 
 
 
 
123
  return results
124
 
125
 
@@ -129,7 +166,9 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
129
  current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
130
 
131
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
132
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
133
  # Sort the evals by priority (first submitted, first run)
134
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
135
 
@@ -145,7 +184,9 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
145
  result_name: str = request_to_result_name(eval_request)
146
 
147
  # Check the corresponding result
148
- eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
 
 
149
 
150
  # breakpoint()
151
 
@@ -163,13 +204,37 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
163
  if (eval_result is None or task_name not in eval_result.results) and do_run_task:
164
  eval_request: EvalRequest = result_name_to_request[result_name]
165
 
166
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
167
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  results = process_evaluation(task, eval_request)
170
 
171
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
172
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  return True
175
 
@@ -182,7 +247,9 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
182
  current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
183
 
184
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
185
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
186
  # Sort the evals by priority (first submitted, first run)
187
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
188
 
@@ -198,7 +265,9 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
198
  result_name: str = request_to_result_name(eval_request)
199
 
200
  # Check the corresponding result
201
- eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
 
 
202
 
203
  task_lst = TASKS_HARNESS.copy()
204
  random.shuffle(task_lst)
@@ -211,18 +280,46 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
211
  if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
212
  do_run_task = True
213
 
214
- task_lst = ['nq', 'trivia', 'tqa', 'self']
215
- if (eval_result is None or do_run_task or task_name not in eval_result.results or
216
- any(ss in task_name for ss in task_lst)):
 
 
 
 
217
  eval_request: EvalRequest = result_name_to_request[result_name]
218
 
219
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
220
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  results = process_evaluation(task, eval_request)
223
 
224
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
225
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  return True
228
 
@@ -235,7 +332,9 @@ def process_pending_requests() -> bool:
235
  current_pending_status = [PENDING_STATUS]
236
 
237
  # Get all eval request that are PENDING, if you want to run other evals, change this parameter
238
- eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
239
  # Sort the evals by priority (first submitted, first run)
240
  eval_requests = sort_models_by_priority(api=API, models=eval_requests)
241
 
@@ -249,8 +348,16 @@ def process_pending_requests() -> bool:
249
  eval_request = eval_requests[0]
250
  pp.pprint(eval_request)
251
 
252
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
253
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
254
 
255
  task_lst = TASKS_HARNESS.copy()
256
  random.shuffle(task_lst)
@@ -258,34 +365,44 @@ def process_pending_requests() -> bool:
258
  for task in task_lst:
259
  results = process_evaluation(task, eval_request)
260
 
261
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
262
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
263
 
264
  return True
265
 
266
 
267
  def get_args():
268
- parser = argparse.ArgumentParser(description='Run the backend')
269
- parser.add_argument('--debug', action='store_true', help='Run in debug mode')
270
  return parser.parse_args()
271
 
272
 
273
  if __name__ == "__main__":
274
  args = get_args()
275
  local_debug = args.debug
276
- #debug specific task by ping
277
  if local_debug:
278
- debug_model_names = ['mistralai/Mixtral-8x7B-Instruct-v0.1']
279
  # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
280
  # debug_task_name = 'ifeval'
281
- debug_task_name = 'mmlu'
282
  task_lst = TASKS_HARNESS.copy()
283
  for task in task_lst:
284
  for debug_model_name in debug_model_names:
285
  task_name = task.benchmark
286
  if task_name != debug_task_name:
287
  continue
288
- eval_request = EvalRequest(model=debug_model_name, private=False, status='', json_filepath='', precision='float16')
 
 
289
  results = process_evaluation(task, eval_request)
290
 
291
  while True:
 
32
  def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
33
  for i in range(10):
34
  try:
35
+ set_eval_request(
36
+ api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir
37
+ )
38
  return
39
  except Exception as e:
40
  print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
 
55
  TASKS_HARNESS = [task.value for task in Tasks]
56
 
57
 
58
+ my_snapshot_download(
59
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
60
+ )
61
+ my_snapshot_download(
62
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
63
+ )
64
 
65
 
66
  def sanity_checks():
67
+ print(f"Device: {DEVICE}")
68
 
69
  # pull the eval dataset from the hub and parse any eval requests
70
  # check completed evals and set them to finished
71
+ my_snapshot_download(
72
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
73
+ )
74
+ check_completed_evals(
75
+ api=API,
76
+ checked_status=RUNNING_STATUS,
77
+ completed_status=FINISHED_STATUS,
78
+ failed_status=FAILED_STATUS,
79
+ hf_repo=QUEUE_REPO,
80
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
81
+ hf_repo_results=RESULTS_REPO,
82
+ local_dir_results=EVAL_RESULTS_PATH_BACKEND,
83
+ )
84
  return
85
 
86
 
 
112
  def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
113
  batch_size = 2
114
  try:
115
+ results = run_evaluation(
116
+ eval_request=eval_request,
117
+ task_names=[task.benchmark],
118
+ num_fewshot=task.num_fewshot,
119
+ batch_size=batch_size,
120
+ device=DEVICE,
121
+ use_cache=None,
122
+ limit=LIMIT,
123
+ )
124
  except RuntimeError as e:
125
  if "No executable batch size found" in str(e):
126
  batch_size = 1
127
+ results = run_evaluation(
128
+ eval_request=eval_request,
129
+ task_names=[task.benchmark],
130
+ num_fewshot=task.num_fewshot,
131
+ batch_size=batch_size,
132
+ device=DEVICE,
133
+ use_cache=None,
134
+ limit=LIMIT,
135
+ )
136
  else:
137
  raise
138
 
139
+ print("RESULTS", results)
140
 
141
+ dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
142
  print(dumped)
143
 
144
+ output_path = os.path.join(
145
+ EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
146
+ )
147
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
148
  with open(output_path, "w") as f:
149
  f.write(dumped)
150
 
151
+ my_snapshot_download(
152
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
153
+ )
154
+ API.upload_file(
155
+ path_or_fileobj=output_path,
156
+ path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
157
+ repo_id=RESULTS_REPO,
158
+ repo_type="dataset",
159
+ )
160
  return results
161
 
162
 
 
166
  current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
167
 
168
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
169
+ eval_requests: list[EvalRequest] = get_eval_requests(
170
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
171
+ )
172
  # Sort the evals by priority (first submitted, first run)
173
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
174
 
 
184
  result_name: str = request_to_result_name(eval_request)
185
 
186
  # Check the corresponding result
187
+ eval_result: Optional[EvalResult] = (
188
+ result_name_to_result[result_name] if result_name in result_name_to_result else None
189
+ )
190
 
191
  # breakpoint()
192
 
 
204
  if (eval_result is None or task_name not in eval_result.results) and do_run_task:
205
  eval_request: EvalRequest = result_name_to_request[result_name]
206
 
207
+ my_snapshot_download(
208
+ repo_id=QUEUE_REPO,
209
+ revision="main",
210
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
211
+ repo_type="dataset",
212
+ max_workers=60,
213
+ )
214
+ my_set_eval_request(
215
+ api=API,
216
+ eval_request=eval_request,
217
+ set_to_status=RUNNING_STATUS,
218
+ hf_repo=QUEUE_REPO,
219
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
220
+ )
221
 
222
  results = process_evaluation(task, eval_request)
223
 
224
+ my_snapshot_download(
225
+ repo_id=QUEUE_REPO,
226
+ revision="main",
227
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
228
+ repo_type="dataset",
229
+ max_workers=60,
230
+ )
231
+ my_set_eval_request(
232
+ api=API,
233
+ eval_request=eval_request,
234
+ set_to_status=FINISHED_STATUS,
235
+ hf_repo=QUEUE_REPO,
236
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
237
+ )
238
 
239
  return True
240
 
 
247
  current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
248
 
249
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
250
+ eval_requests: list[EvalRequest] = get_eval_requests(
251
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
252
+ )
253
  # Sort the evals by priority (first submitted, first run)
254
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
255
 
 
265
  result_name: str = request_to_result_name(eval_request)
266
 
267
  # Check the corresponding result
268
+ eval_result: Optional[EvalResult] = (
269
+ result_name_to_result[result_name] if result_name in result_name_to_result else None
270
+ )
271
 
272
  task_lst = TASKS_HARNESS.copy()
273
  random.shuffle(task_lst)
 
280
  if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
281
  do_run_task = True
282
 
283
+ task_lst = ["nq", "trivia", "tqa", "self"]
284
+ if (
285
+ eval_result is None
286
+ or do_run_task
287
+ or task_name not in eval_result.results
288
+ or any(ss in task_name for ss in task_lst)
289
+ ):
290
  eval_request: EvalRequest = result_name_to_request[result_name]
291
 
292
+ my_snapshot_download(
293
+ repo_id=QUEUE_REPO,
294
+ revision="main",
295
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
296
+ repo_type="dataset",
297
+ max_workers=60,
298
+ )
299
+ my_set_eval_request(
300
+ api=API,
301
+ eval_request=eval_request,
302
+ set_to_status=RUNNING_STATUS,
303
+ hf_repo=QUEUE_REPO,
304
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
305
+ )
306
 
307
  results = process_evaluation(task, eval_request)
308
 
309
+ my_snapshot_download(
310
+ repo_id=QUEUE_REPO,
311
+ revision="main",
312
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
313
+ repo_type="dataset",
314
+ max_workers=60,
315
+ )
316
+ my_set_eval_request(
317
+ api=API,
318
+ eval_request=eval_request,
319
+ set_to_status=FINISHED_STATUS,
320
+ hf_repo=QUEUE_REPO,
321
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
322
+ )
323
 
324
  return True
325
 
 
332
  current_pending_status = [PENDING_STATUS]
333
 
334
  # Get all eval request that are PENDING, if you want to run other evals, change this parameter
335
+ eval_requests = get_eval_requests(
336
+ job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
337
+ )
338
  # Sort the evals by priority (first submitted, first run)
339
  eval_requests = sort_models_by_priority(api=API, models=eval_requests)
340
 
 
348
  eval_request = eval_requests[0]
349
  pp.pprint(eval_request)
350
 
351
+ my_snapshot_download(
352
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
353
+ )
354
+ my_set_eval_request(
355
+ api=API,
356
+ eval_request=eval_request,
357
+ set_to_status=RUNNING_STATUS,
358
+ hf_repo=QUEUE_REPO,
359
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
360
+ )
361
 
362
  task_lst = TASKS_HARNESS.copy()
363
  random.shuffle(task_lst)
 
365
  for task in task_lst:
366
  results = process_evaluation(task, eval_request)
367
 
368
+ my_snapshot_download(
369
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
370
+ )
371
+ my_set_eval_request(
372
+ api=API,
373
+ eval_request=eval_request,
374
+ set_to_status=FINISHED_STATUS,
375
+ hf_repo=QUEUE_REPO,
376
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
377
+ )
378
 
379
  return True
380
 
381
 
382
  def get_args():
383
+ parser = argparse.ArgumentParser(description="Run the backend")
384
+ parser.add_argument("--debug", action="store_true", help="Run in debug mode")
385
  return parser.parse_args()
386
 
387
 
388
  if __name__ == "__main__":
389
  args = get_args()
390
  local_debug = args.debug
391
+ # debug specific task by ping
392
  if local_debug:
393
+ debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
394
  # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
395
  # debug_task_name = 'ifeval'
396
+ debug_task_name = "mmlu"
397
  task_lst = TASKS_HARNESS.copy()
398
  for task in task_lst:
399
  for debug_model_name in debug_model_names:
400
  task_name = task.benchmark
401
  if task_name != debug_task_name:
402
  continue
403
+ eval_request = EvalRequest(
404
+ model=debug_model_name, private=False, status="", json_filepath="", precision="float16"
405
+ )
406
  results = process_evaluation(task, eval_request)
407
 
408
  while True:
cli/analysis-cli.py CHANGED
@@ -77,19 +77,19 @@ def sanitise_dataset(name: str) -> str:
77
  return res
78
 
79
 
80
- cache_file = 'data_map_cache.pkl'
81
 
82
 
83
  def load_data_map_from_cache(cache_file):
84
  if os.path.exists(cache_file):
85
- with open(cache_file, 'rb') as f:
86
  return pickle.load(f)
87
  else:
88
  return None
89
 
90
 
91
  def save_data_map_to_cache(data_map, cache_file):
92
- with open(cache_file, 'wb') as f:
93
  pickle.dump(data_map, f)
94
 
95
 
@@ -98,8 +98,12 @@ data_map = load_data_map_from_cache(cache_file)
98
 
99
 
100
  if data_map is None:
101
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
102
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
103
 
104
  result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
105
  request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
@@ -107,7 +111,7 @@ if data_map is None:
107
  model_name_to_model_map = {}
108
 
109
  for path in request_path_lst:
110
- with open(path, 'r') as f:
111
  data = json.load(f)
112
  model_name_to_model_map[data["model"]] = data
113
 
@@ -117,7 +121,7 @@ if data_map is None:
117
  data_map = {}
118
 
119
  for path in result_path_lst:
120
- with open(path, 'r') as f:
121
  data = json.load(f)
122
  model_name = data["config"]["model_name"]
123
  for dataset_name, results_dict in data["results"].items():
@@ -127,42 +131,42 @@ if data_map is None:
127
 
128
  to_add = True
129
 
130
- if 'f1' in metric_name:
131
  to_add = False
132
 
133
- if 'stderr' in metric_name:
134
  to_add = False
135
 
136
- if 'memo-trap_v2' in dataset_name:
137
  to_add = False
138
 
139
- if 'faithdial' in dataset_name:
140
  to_add = False
141
 
142
- if 'truthfulqa_gen' in dataset_name:
143
  to_add = False
144
 
145
- if 'bertscore' in metric_name:
146
- if 'precision' not in metric_name:
147
  to_add = False
148
 
149
- if 'halueval' in dataset_name:
150
- if 'acc' not in metric_name:
151
  to_add = False
152
 
153
- if 'ifeval' in dataset_name:
154
- if 'prompt_level_strict_acc' not in metric_name:
155
  to_add = False
156
 
157
- if 'squad' in dataset_name:
158
  # to_add = False
159
- if 'best_exact' in metric_name:
160
  to_add = False
161
 
162
- if 'fever' in dataset_name:
163
  to_add = False
164
 
165
- if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' not in dataset_name:
166
  to_add = False
167
 
168
  if isinstance(value, str):
@@ -172,25 +176,36 @@ if data_map is None:
172
  to_add = False
173
 
174
  if to_add:
175
- if 'rouge' in metric_name:
176
  value /= 100.0
177
 
178
- if 'squad' in dataset_name:
179
  value /= 100.0
180
 
181
  sanitised_metric_name = metric_name
182
  if "," in sanitised_metric_name:
183
- sanitised_metric_name = sanitised_metric_name.split(',')[0]
184
  sanitised_metric_name = sanitise_metric(sanitised_metric_name)
185
  sanitised_dataset_name = sanitise_dataset(dataset_name)
186
 
187
- model_dataset_metric_to_result_map[(model_name, sanitised_dataset_name, sanitised_metric_name)] = value
 
 
188
 
189
  if model_name not in data_map:
190
  data_map[model_name] = {}
191
  data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
192
 
193
- print('model_name', model_name, 'dataset_name', sanitised_dataset_name, 'metric_name', sanitised_metric_name, 'value', value)
 
 
 
 
 
 
 
 
 
194
 
195
  save_data_map_to_cache(data_map, cache_file)
196
 
@@ -202,7 +217,7 @@ for model_name in model_name_lst:
202
  if len(data_map[model_name]) < nb_max_metrics - 5:
203
  del data_map[model_name]
204
 
205
- plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']
206
 
207
  for plot_type in plot_type_lst:
208
 
@@ -212,39 +227,39 @@ for plot_type in plot_type_lst:
212
  if dataset_metric not in data_map_v2:
213
  data_map_v2[dataset_metric] = {}
214
 
215
- if plot_type in {'all'}:
216
  to_add = True
217
- if 'ROUGE' in dataset_metric[1] and 'ROUGE-L' not in dataset_metric[1]:
218
  to_add = False
219
- if 'SQuAD' in dataset_metric[0] and 'EM' not in dataset_metric[1]:
220
  to_add = False
221
- if 'SelfCheckGPT' in dataset_metric[0] and 'MAX' not in dataset_metric[1]:
222
  to_add = False
223
- if '64-shot' in dataset_metric[0]:
224
  to_add = False
225
  if to_add is True:
226
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
227
- elif plot_type in {'summ'}:
228
- if 'CNN' in dataset_metric[0] or 'XSum' in dataset_metric[0]:
229
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
230
- elif plot_type in {'qa'}:
231
- if 'TriviaQA' in dataset_metric[0] or 'NQ' in dataset_metric[0] or 'TruthfulQA' in dataset_metric[0]:
232
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
233
- elif plot_type in {'instr'}:
234
- if 'MemoTrap' in dataset_metric[0] or 'IFEval' in dataset_metric[0]:
235
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
236
- elif plot_type in {'detect'}:
237
- if 'HaluEval' in dataset_metric[0] or 'SelfCheck' in dataset_metric[0]:
238
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
239
- elif plot_type in {'rc'}:
240
- if 'RACE' in dataset_metric[0] or 'SQuAD' in dataset_metric[0]:
241
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
242
  else:
243
  assert False, f"Unknown plot type: {plot_type}"
244
 
245
  # df = pd.DataFrame.from_dict(data_map, orient='index') # Invert the y-axis (rows)
246
- df = pd.DataFrame.from_dict(data_map_v2, orient='index') # Invert the y-axis (rows)
247
- df.index = [', '.join(map(str, idx)) for idx in df.index]
248
 
249
  o_df = df.copy(deep=True)
250
 
@@ -263,7 +278,7 @@ for plot_type in plot_type_lst:
263
 
264
  # Calculate dimensions based on the DataFrame size
265
  cell_height = 1.0 # Height of each cell in inches
266
- cell_width = 1.0 # Width of each cell in inches
267
 
268
  n_rows = len(df.index) # Datasets and Metrics
269
  n_cols = len(df.columns) # Models
@@ -277,60 +292,62 @@ for plot_type in plot_type_lst:
277
 
278
  sns.set_context("notebook", font_scale=1.3)
279
 
280
- dendrogram_ratio = (.1, .1)
281
 
282
- if plot_type in {'detect'}:
283
  fig_width = cell_width * n_cols - 2
284
  fig_height = cell_height * n_rows + 5.2
285
- dendrogram_ratio = (.1, .2)
286
 
287
- if plot_type in {'instr'}:
288
  fig_width = cell_width * n_cols - 2
289
  fig_height = cell_height * n_rows + 5.2
290
- dendrogram_ratio = (.1, .4)
291
 
292
- if plot_type in {'qa'}:
293
  fig_width = cell_width * n_cols - 2
294
  fig_height = cell_height * n_rows + 4
295
- dendrogram_ratio = (.1, .2)
296
 
297
- if plot_type in {'summ'}:
298
  fig_width = cell_width * n_cols - 2
299
  fig_height = cell_height * n_rows + 2.0
300
- dendrogram_ratio = (.1, .1)
301
  row_cluster = False
302
 
303
- if plot_type in {'rc'}:
304
  fig_width = cell_width * n_cols - 2
305
  fig_height = cell_height * n_rows + 5.2
306
- dendrogram_ratio = (.1, .4)
307
 
308
- print('figsize', (fig_width, fig_height))
309
 
310
- o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
311
 
312
- print(f'Generating the clustermaps for {plot_type}')
313
 
314
- for cmap in [None, 'coolwarm', 'viridis']:
315
- fig = sns.clustermap(df,
316
- method='ward',
317
- metric='euclidean',
318
- cmap=cmap,
319
- figsize=(fig_width, fig_height), # figsize=(24, 16),
320
- annot=True,
321
- mask=o_df.isnull(),
322
- dendrogram_ratio=dendrogram_ratio,
323
- fmt='.2f',
324
- col_cluster=col_cluster,
325
- row_cluster=row_cluster)
 
 
326
 
327
  # Adjust the size of the cells (less wide)
328
  plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
329
  plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
330
 
331
- cmap_suffix = '' if cmap is None else f'_{cmap}'
332
 
333
  # Save the clustermap to file
334
- fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf')
335
- fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.png')
336
- fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
 
77
  return res
78
 
79
 
80
+ cache_file = "data_map_cache.pkl"
81
 
82
 
83
  def load_data_map_from_cache(cache_file):
84
  if os.path.exists(cache_file):
85
+ with open(cache_file, "rb") as f:
86
  return pickle.load(f)
87
  else:
88
  return None
89
 
90
 
91
  def save_data_map_to_cache(data_map, cache_file):
92
+ with open(cache_file, "wb") as f:
93
  pickle.dump(data_map, f)
94
 
95
 
 
98
 
99
 
100
  if data_map is None:
101
+ my_snapshot_download(
102
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
103
+ )
104
+ my_snapshot_download(
105
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
106
+ )
107
 
108
  result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
109
  request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
 
111
  model_name_to_model_map = {}
112
 
113
  for path in request_path_lst:
114
+ with open(path, "r") as f:
115
  data = json.load(f)
116
  model_name_to_model_map[data["model"]] = data
117
 
 
121
  data_map = {}
122
 
123
  for path in result_path_lst:
124
+ with open(path, "r") as f:
125
  data = json.load(f)
126
  model_name = data["config"]["model_name"]
127
  for dataset_name, results_dict in data["results"].items():
 
131
 
132
  to_add = True
133
 
134
+ if "f1" in metric_name:
135
  to_add = False
136
 
137
+ if "stderr" in metric_name:
138
  to_add = False
139
 
140
+ if "memo-trap_v2" in dataset_name:
141
  to_add = False
142
 
143
+ if "faithdial" in dataset_name:
144
  to_add = False
145
 
146
+ if "truthfulqa_gen" in dataset_name:
147
  to_add = False
148
 
149
+ if "bertscore" in metric_name:
150
+ if "precision" not in metric_name:
151
  to_add = False
152
 
153
+ if "halueval" in dataset_name:
154
+ if "acc" not in metric_name:
155
  to_add = False
156
 
157
+ if "ifeval" in dataset_name:
158
+ if "prompt_level_strict_acc" not in metric_name:
159
  to_add = False
160
 
161
+ if "squad" in dataset_name:
162
  # to_add = False
163
+ if "best_exact" in metric_name:
164
  to_add = False
165
 
166
+ if "fever" in dataset_name:
167
  to_add = False
168
 
169
+ if ("xsum" in dataset_name or "cnn" in dataset_name) and "v2" not in dataset_name:
170
  to_add = False
171
 
172
  if isinstance(value, str):
 
176
  to_add = False
177
 
178
  if to_add:
179
+ if "rouge" in metric_name:
180
  value /= 100.0
181
 
182
+ if "squad" in dataset_name:
183
  value /= 100.0
184
 
185
  sanitised_metric_name = metric_name
186
  if "," in sanitised_metric_name:
187
+ sanitised_metric_name = sanitised_metric_name.split(",")[0]
188
  sanitised_metric_name = sanitise_metric(sanitised_metric_name)
189
  sanitised_dataset_name = sanitise_dataset(dataset_name)
190
 
191
+ model_dataset_metric_to_result_map[
192
+ (model_name, sanitised_dataset_name, sanitised_metric_name)
193
+ ] = value
194
 
195
  if model_name not in data_map:
196
  data_map[model_name] = {}
197
  data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
198
 
199
+ print(
200
+ "model_name",
201
+ model_name,
202
+ "dataset_name",
203
+ sanitised_dataset_name,
204
+ "metric_name",
205
+ sanitised_metric_name,
206
+ "value",
207
+ value,
208
+ )
209
 
210
  save_data_map_to_cache(data_map, cache_file)
211
 
 
217
  if len(data_map[model_name]) < nb_max_metrics - 5:
218
  del data_map[model_name]
219
 
220
+ plot_type_lst = ["all", "summ", "qa", "instr", "detect", "rc"]
221
 
222
  for plot_type in plot_type_lst:
223
 
 
227
  if dataset_metric not in data_map_v2:
228
  data_map_v2[dataset_metric] = {}
229
 
230
+ if plot_type in {"all"}:
231
  to_add = True
232
+ if "ROUGE" in dataset_metric[1] and "ROUGE-L" not in dataset_metric[1]:
233
  to_add = False
234
+ if "SQuAD" in dataset_metric[0] and "EM" not in dataset_metric[1]:
235
  to_add = False
236
+ if "SelfCheckGPT" in dataset_metric[0] and "MAX" not in dataset_metric[1]:
237
  to_add = False
238
+ if "64-shot" in dataset_metric[0]:
239
  to_add = False
240
  if to_add is True:
241
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
242
+ elif plot_type in {"summ"}:
243
+ if "CNN" in dataset_metric[0] or "XSum" in dataset_metric[0]:
244
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
245
+ elif plot_type in {"qa"}:
246
+ if "TriviaQA" in dataset_metric[0] or "NQ" in dataset_metric[0] or "TruthfulQA" in dataset_metric[0]:
247
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
248
+ elif plot_type in {"instr"}:
249
+ if "MemoTrap" in dataset_metric[0] or "IFEval" in dataset_metric[0]:
250
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
251
+ elif plot_type in {"detect"}:
252
+ if "HaluEval" in dataset_metric[0] or "SelfCheck" in dataset_metric[0]:
253
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
254
+ elif plot_type in {"rc"}:
255
+ if "RACE" in dataset_metric[0] or "SQuAD" in dataset_metric[0]:
256
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
257
  else:
258
  assert False, f"Unknown plot type: {plot_type}"
259
 
260
  # df = pd.DataFrame.from_dict(data_map, orient='index') # Invert the y-axis (rows)
261
+ df = pd.DataFrame.from_dict(data_map_v2, orient="index") # Invert the y-axis (rows)
262
+ df.index = [", ".join(map(str, idx)) for idx in df.index]
263
 
264
  o_df = df.copy(deep=True)
265
 
 
278
 
279
  # Calculate dimensions based on the DataFrame size
280
  cell_height = 1.0 # Height of each cell in inches
281
+ cell_width = 1.0 # Width of each cell in inches
282
 
283
  n_rows = len(df.index) # Datasets and Metrics
284
  n_cols = len(df.columns) # Models
 
292
 
293
  sns.set_context("notebook", font_scale=1.3)
294
 
295
+ dendrogram_ratio = (0.1, 0.1)
296
 
297
+ if plot_type in {"detect"}:
298
  fig_width = cell_width * n_cols - 2
299
  fig_height = cell_height * n_rows + 5.2
300
+ dendrogram_ratio = (0.1, 0.2)
301
 
302
+ if plot_type in {"instr"}:
303
  fig_width = cell_width * n_cols - 2
304
  fig_height = cell_height * n_rows + 5.2
305
+ dendrogram_ratio = (0.1, 0.4)
306
 
307
+ if plot_type in {"qa"}:
308
  fig_width = cell_width * n_cols - 2
309
  fig_height = cell_height * n_rows + 4
310
+ dendrogram_ratio = (0.1, 0.2)
311
 
312
+ if plot_type in {"summ"}:
313
  fig_width = cell_width * n_cols - 2
314
  fig_height = cell_height * n_rows + 2.0
315
+ dendrogram_ratio = (0.1, 0.1)
316
  row_cluster = False
317
 
318
+ if plot_type in {"rc"}:
319
  fig_width = cell_width * n_cols - 2
320
  fig_height = cell_height * n_rows + 5.2
321
+ dendrogram_ratio = (0.1, 0.4)
322
 
323
+ print("figsize", (fig_width, fig_height))
324
 
325
+ o_df.to_json(f"plots/clustermap_{plot_type}.json", orient="split")
326
 
327
+ print(f"Generating the clustermaps for {plot_type}")
328
 
329
+ for cmap in [None, "coolwarm", "viridis"]:
330
+ fig = sns.clustermap(
331
+ df,
332
+ method="ward",
333
+ metric="euclidean",
334
+ cmap=cmap,
335
+ figsize=(fig_width, fig_height), # figsize=(24, 16),
336
+ annot=True,
337
+ mask=o_df.isnull(),
338
+ dendrogram_ratio=dendrogram_ratio,
339
+ fmt=".2f",
340
+ col_cluster=col_cluster,
341
+ row_cluster=row_cluster,
342
+ )
343
 
344
  # Adjust the size of the cells (less wide)
345
  plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
346
  plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
347
 
348
+ cmap_suffix = "" if cmap is None else f"_{cmap}"
349
 
350
  # Save the clustermap to file
351
+ fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf")
352
+ fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}.png")
353
+ fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png", transparent=True, facecolor="none")
cli/averitec-upload-cli.py CHANGED
@@ -2,11 +2,13 @@
2
 
3
  from datasets import load_dataset
4
 
5
- path = 'pminervini/averitec'
6
 
7
- ds = load_dataset("json",
8
- data_files={
9
- 'train': '/Users/pasquale/workspace/AVeriTeC/data/train.json',
10
- 'dev': '/Users/pasquale/workspace/AVeriTeC/data/dev.json'
11
- })
 
 
12
  ds.push_to_hub(path)
 
2
 
3
  from datasets import load_dataset
4
 
5
+ path = "pminervini/averitec"
6
 
7
+ ds = load_dataset(
8
+ "json",
9
+ data_files={
10
+ "train": "/Users/pasquale/workspace/AVeriTeC/data/train.json",
11
+ "dev": "/Users/pasquale/workspace/AVeriTeC/data/dev.json",
12
+ },
13
+ )
14
  ds.push_to_hub(path)
cli/beta-cli.py CHANGED
@@ -14,8 +14,12 @@ from src.leaderboard.read_evals import get_raw_eval_results
14
  from src.backend.manage_requests import EvalRequest
15
  from src.leaderboard.read_evals import EvalResult
16
 
17
- snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
18
- snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
 
 
 
19
 
20
  PENDING_STATUS = "PENDING"
21
  RUNNING_STATUS = "RUNNING"
@@ -40,7 +44,9 @@ def request_to_result_name(request: EvalRequest) -> str:
40
 
41
 
42
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
43
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
44
  # Sort the evals by priority (first submitted first run)
45
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
46
 
@@ -49,8 +55,8 @@ eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_RE
49
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
50
  result_name_to_result = {r.eval_name: r for r in eval_results}
51
 
52
- print('Requests', sorted(result_name_to_request.keys()))
53
- print('Results', sorted(result_name_to_result.keys()))
54
 
55
  for eval_request in eval_requests:
56
  result_name: str = request_to_result_name(eval_request)
@@ -63,7 +69,7 @@ for eval_request in eval_requests:
63
  task_name = task.benchmark
64
 
65
  if task_name not in eval_result.results:
66
- print('RUN THIS ONE!', result_name, task_name)
67
 
68
  raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
69
  all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
 
14
  from src.backend.manage_requests import EvalRequest
15
  from src.leaderboard.read_evals import EvalResult
16
 
17
+ snapshot_download(
18
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
19
+ )
20
+ snapshot_download(
21
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
22
+ )
23
 
24
  PENDING_STATUS = "PENDING"
25
  RUNNING_STATUS = "RUNNING"
 
44
 
45
 
46
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
47
+ eval_requests: list[EvalRequest] = get_eval_requests(
48
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
49
+ )
50
  # Sort the evals by priority (first submitted first run)
51
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
52
 
 
55
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
56
  result_name_to_result = {r.eval_name: r for r in eval_results}
57
 
58
+ print("Requests", sorted(result_name_to_request.keys()))
59
+ print("Results", sorted(result_name_to_result.keys()))
60
 
61
  for eval_request in eval_requests:
62
  result_name: str = request_to_result_name(eval_request)
 
69
  task_name = task.benchmark
70
 
71
  if task_name not in eval_result.results:
72
+ print("RUN THIS ONE!", result_name, task_name)
73
 
74
  raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
75
  all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
cli/completed-cli.py CHANGED
@@ -26,8 +26,12 @@ FAILED_STATUS = "FAILED"
26
 
27
  TASKS_HARNESS = [task.value for task in Tasks]
28
 
29
- snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
30
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
31
 
32
 
33
  def request_to_result_name(request: EvalRequest) -> str:
@@ -48,9 +52,10 @@ def process_finished_requests() -> bool:
48
  if False:
49
  import os
50
  import dateutil
 
51
  model_result_filepaths = []
52
- results_path = f'{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B'
53
- requests_path = f'{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json'
54
 
55
  for root, _, files in os.walk(results_path):
56
  # We should only have json files in model results
@@ -72,7 +77,7 @@ def process_finished_requests() -> bool:
72
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
73
  eval_result.update_with_request_file(requests_path)
74
 
75
- print('XXX', eval_result)
76
 
77
  # Store results of same eval together
78
  eval_name = eval_result.eval_name
@@ -86,7 +91,9 @@ def process_finished_requests() -> bool:
86
  return True
87
 
88
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
89
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
90
  # Sort the evals by priority (first submitted first run)
91
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
92
 
@@ -94,9 +101,11 @@ def process_finished_requests() -> bool:
94
  # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
95
 
96
  import random
 
97
  random.shuffle(eval_requests)
98
 
99
  from src.leaderboard.read_evals import get_raw_eval_results
 
100
  eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
101
 
102
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
@@ -107,7 +116,10 @@ def process_finished_requests() -> bool:
107
 
108
  # Check the corresponding result
109
  from typing import Optional
110
- eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
111
 
112
  # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
113
  for task in TASKS_HARNESS:
@@ -117,7 +129,7 @@ def process_finished_requests() -> bool:
117
  eval_request: EvalRequest = result_name_to_request[result_name]
118
 
119
  # print(eval_result)
120
- print(result_name, 'is incomplete -- missing task:', task_name, eval_result, eval_request.likes)
121
 
122
 
123
  if __name__ == "__main__":
 
26
 
27
  TASKS_HARNESS = [task.value for task in Tasks]
28
 
29
+ snapshot_download(
30
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
31
+ )
32
+ snapshot_download(
33
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
34
+ )
35
 
36
 
37
  def request_to_result_name(request: EvalRequest) -> str:
 
52
  if False:
53
  import os
54
  import dateutil
55
+
56
  model_result_filepaths = []
57
+ results_path = f"{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B"
58
+ requests_path = f"{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json"
59
 
60
  for root, _, files in os.walk(results_path):
61
  # We should only have json files in model results
 
77
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
78
  eval_result.update_with_request_file(requests_path)
79
 
80
+ print("XXX", eval_result)
81
 
82
  # Store results of same eval together
83
  eval_name = eval_result.eval_name
 
91
  return True
92
 
93
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
94
+ eval_requests: list[EvalRequest] = get_eval_requests(
95
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
96
+ )
97
  # Sort the evals by priority (first submitted first run)
98
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
99
 
 
101
  # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
102
 
103
  import random
104
+
105
  random.shuffle(eval_requests)
106
 
107
  from src.leaderboard.read_evals import get_raw_eval_results
108
+
109
  eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
110
 
111
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
 
116
 
117
  # Check the corresponding result
118
  from typing import Optional
119
+
120
+ eval_result: Optional[EvalResult] = (
121
+ result_name_to_result[result_name] if result_name in result_name_to_result else None
122
+ )
123
 
124
  # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
125
  for task in TASKS_HARNESS:
 
129
  eval_request: EvalRequest = result_name_to_request[result_name]
130
 
131
  # print(eval_result)
132
+ print(result_name, "is incomplete -- missing task:", task_name, eval_result, eval_request.likes)
133
 
134
 
135
  if __name__ == "__main__":
cli/eval-cli.py CHANGED
@@ -35,12 +35,11 @@ def main():
35
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
36
 
37
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
38
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=status,
39
- hf_repo=QUEUE_REPO,
40
- local_dir=EVAL_REQUESTS_PATH_BACKEND,
41
- do_download=False)
42
  # eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
43
- eval_request = [r for r in eval_requests if 'meta-llama/Llama-2-7b-hf' in r.model][0]
44
 
45
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
46
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
@@ -56,6 +55,7 @@ def main():
56
 
57
  eval_logger = utils.eval_logger
58
  import logging
 
59
  eval_logger.setLevel(getattr(logging, "DEBUG"))
60
 
61
  TASKS_HARNESS = [my_task]
@@ -75,9 +75,19 @@ def main():
75
  import torch
76
 
77
  # breakpoint()
78
- results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
79
- batch_size=1, device="mps", use_cache=None, limit=2, write_out=True, task_manager=task_manager)
80
- print('AAA', results["results"])
81
 
82
  breakpoint()
83
 
 
35
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
36
 
37
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
38
+ eval_requests: list[EvalRequest] = get_eval_requests(
39
+ job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND, do_download=False
40
+ )
 
41
  # eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
42
+ eval_request = [r for r in eval_requests if "meta-llama/Llama-2-7b-hf" in r.model][0]
43
 
44
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
45
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
 
55
 
56
  eval_logger = utils.eval_logger
57
  import logging
58
+
59
  eval_logger.setLevel(getattr(logging, "DEBUG"))
60
 
61
  TASKS_HARNESS = [my_task]
 
75
  import torch
76
 
77
  # breakpoint()
78
+ results = evaluator.simple_evaluate(
79
+ model="hf",
80
+ model_args=eval_request.get_model_args(),
81
+ tasks=[task.benchmark],
82
+ num_fewshot=task.num_fewshot,
83
+ batch_size=1,
84
+ device="mps",
85
+ use_cache=None,
86
+ limit=2,
87
+ write_out=True,
88
+ task_manager=task_manager,
89
+ )
90
+ print("AAA", results["results"])
91
 
92
  breakpoint()
93
 
cli/fever-upload-cli.py CHANGED
@@ -18,12 +18,9 @@ def convert(list_of_dicts):
18
 
19
 
20
  v10 = load_dataset("fever", "v1.0")
21
- name_lst = ['train', 'labelled_dev']
22
 
23
- old_to_new_label_map = {
24
- 'SUPPORTS': 'supported',
25
- 'REFUTES': 'refuted'
26
- }
27
 
28
  data_map = {}
29
 
@@ -31,28 +28,28 @@ for name in name_lst:
31
  instance_lst = []
32
 
33
  for entry in tqdm(v10[name]):
34
- id_ = entry['id']
35
- label = entry['label']
36
- claim = entry['claim']
37
 
38
- evidence_id = entry['evidence_id']
39
- evidence_wiki_url = entry['evidence_wiki_url']
40
 
41
  if evidence_id != -1:
42
- assert label in {'SUPPORTS', 'REFUTES'}
43
 
44
- instance = {'id': id_, 'label': old_to_new_label_map[label], 'claim': claim}
45
  instance_lst.append(instance)
46
 
47
- key = 'dev' if name in {'labelled_dev'} else name
48
 
49
- instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d['claim'])
50
 
51
  label_to_instance_lst = {}
52
  for e in instance_lst:
53
- if e['label'] not in label_to_instance_lst:
54
- label_to_instance_lst[e['label']] = []
55
- label_to_instance_lst[e['label']].append(e)
56
 
57
  min_len = min(len(v) for k, v in label_to_instance_lst.items())
58
 
@@ -63,7 +60,7 @@ for name in name_lst:
63
  random.Random(42).shuffle(new_instance_lst)
64
  data_map[key] = new_instance_lst
65
 
66
- ds_path = 'pminervini/hl-fever'
67
 
68
  task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
69
  ds_dict = DatasetDict(task_to_ds_map)
 
18
 
19
 
20
  v10 = load_dataset("fever", "v1.0")
21
+ name_lst = ["train", "labelled_dev"]
22
 
23
+ old_to_new_label_map = {"SUPPORTS": "supported", "REFUTES": "refuted"}
24
 
25
  data_map = {}
26
 
 
28
  instance_lst = []
29
 
30
  for entry in tqdm(v10[name]):
31
+ id_ = entry["id"]
32
+ label = entry["label"]
33
+ claim = entry["claim"]
34
 
35
+ evidence_id = entry["evidence_id"]
36
+ evidence_wiki_url = entry["evidence_wiki_url"]
37
 
38
  if evidence_id != -1:
39
+ assert label in {"SUPPORTS", "REFUTES"}
40
 
41
+ instance = {"id": id_, "label": old_to_new_label_map[label], "claim": claim}
42
  instance_lst.append(instance)
43
 
44
+ key = "dev" if name in {"labelled_dev"} else name
45
 
46
+ instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d["claim"])
47
 
48
  label_to_instance_lst = {}
49
  for e in instance_lst:
50
+ if e["label"] not in label_to_instance_lst:
51
+ label_to_instance_lst[e["label"]] = []
52
+ label_to_instance_lst[e["label"]].append(e)
53
 
54
  min_len = min(len(v) for k, v in label_to_instance_lst.items())
55
 
 
60
  random.Random(42).shuffle(new_instance_lst)
61
  data_map[key] = new_instance_lst
62
 
63
+ ds_path = "pminervini/hl-fever"
64
 
65
  task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
66
  ds_dict = DatasetDict(task_to_ds_map)
cli/fix-requests-cli.py CHANGED
@@ -10,12 +10,12 @@ from huggingface_hub import HfApi
10
  def find_json_files(directory):
11
  matches = []
12
  for root, dirnames, filenames in os.walk(directory):
13
- for filename in fnmatch.filter(filenames, '*.json'):
14
  matches.append(os.path.join(root, filename))
15
  return matches
16
 
17
 
18
- directory_path = '/Users/pasquale/workspace/eval/requests'
19
  json_files = find_json_files(directory_path)
20
 
21
  api = HfApi()
@@ -26,29 +26,29 @@ model_lst = [m for m in model_lst]
26
  id_to_model = {m.id: m for m in model_lst}
27
 
28
  for path in json_files:
29
- with open(path, 'r') as fr:
30
  data = json.load(fr)
31
 
32
- model_id = data['model']
33
  if model_id in id_to_model:
34
  model = id_to_model[model_id]
35
 
36
  to_overwrite = False
37
 
38
- is_finetuned = any(tag.startswith('base_model:') for tag in id_to_model[data['model']].tags)
39
 
40
  if is_finetuned:
41
  data["model_type"] = "fine-tuned"
42
  to_overwrite = True
43
 
44
- is_instruction_tuned = ('nstruct' in model_id) or ('chat' in model_id)
45
  if is_instruction_tuned:
46
  data["model_type"] = "instruction-tuned"
47
  to_overwrite = True
48
 
49
  if to_overwrite is True:
50
- with open(path, 'w') as fw:
51
  json.dump(data, fw)
52
 
53
  else:
54
- print(f'Model {model_id} not found')
 
10
  def find_json_files(directory):
11
  matches = []
12
  for root, dirnames, filenames in os.walk(directory):
13
+ for filename in fnmatch.filter(filenames, "*.json"):
14
  matches.append(os.path.join(root, filename))
15
  return matches
16
 
17
 
18
+ directory_path = "/Users/pasquale/workspace/eval/requests"
19
  json_files = find_json_files(directory_path)
20
 
21
  api = HfApi()
 
26
  id_to_model = {m.id: m for m in model_lst}
27
 
28
  for path in json_files:
29
+ with open(path, "r") as fr:
30
  data = json.load(fr)
31
 
32
+ model_id = data["model"]
33
  if model_id in id_to_model:
34
  model = id_to_model[model_id]
35
 
36
  to_overwrite = False
37
 
38
+ is_finetuned = any(tag.startswith("base_model:") for tag in id_to_model[data["model"]].tags)
39
 
40
  if is_finetuned:
41
  data["model_type"] = "fine-tuned"
42
  to_overwrite = True
43
 
44
+ is_instruction_tuned = ("nstruct" in model_id) or ("chat" in model_id)
45
  if is_instruction_tuned:
46
  data["model_type"] = "instruction-tuned"
47
  to_overwrite = True
48
 
49
  if to_overwrite is True:
50
+ with open(path, "w") as fw:
51
  json.dump(data, fw)
52
 
53
  else:
54
+ print(f"Model {model_id} not found")
cli/halueval-upload-cli.py CHANGED
@@ -6,20 +6,20 @@ import requests
6
  from datasets import load_dataset, Dataset, DatasetDict
7
 
8
 
9
- path = 'pminervini/HaluEval'
10
 
11
  API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
12
  response = requests.get(API_URL)
13
  res_json = response.json()
14
 
15
- gold_splits = {'dialogue', 'qa', 'summarization', 'general'}
16
 
17
- available_splits = {split['config'] for split in res_json['splits']} if 'splits' in res_json else set()
18
 
19
  name_to_ds = dict()
20
 
21
  for name in gold_splits:
22
- ds = load_dataset("json", data_files={'data': f"data/{name}_data.json"})
23
  name_to_ds[name] = ds
24
  # if name not in available_splits:
25
  ds.push_to_hub(path, config_name=name)
@@ -35,38 +35,38 @@ def list_to_dict(lst: list) -> dict:
35
  return res
36
 
37
 
38
- for name in (gold_splits - {'general'}):
39
  random.seed(42)
40
  ds = name_to_ds[name]
41
  new_entry_lst = []
42
-
43
- for entry in ds['data']:
44
  is_hallucinated = random.random() > 0.5
45
  new_entry = None
46
- if name in {'qa'}:
47
  new_entry = {
48
- 'knowledge': entry['knowledge'],
49
- 'question': entry['question'],
50
- 'answer': entry[f'{"hallucinated" if is_hallucinated else "right"}_answer'],
51
- 'hallucination': 'yes' if is_hallucinated else 'no'
52
  }
53
- if name in {'dialogue'}:
54
  new_entry = {
55
- 'knowledge': entry['knowledge'],
56
- 'dialogue_history': entry['dialogue_history'],
57
- 'response': entry[f'{"hallucinated" if is_hallucinated else "right"}_response'],
58
- 'hallucination': 'yes' if is_hallucinated else 'no'
59
  }
60
- if name in {'summarization'}:
61
  new_entry = {
62
- 'document': entry['document'],
63
- 'summary': entry[f'{"hallucinated" if is_hallucinated else "right"}_summary'],
64
- 'hallucination': 'yes' if is_hallucinated else 'no'
65
  }
66
  assert new_entry is not None
67
  new_entry_lst += [new_entry]
68
  new_ds_map = list_to_dict(new_entry_lst)
69
  new_ds = Dataset.from_dict(new_ds_map)
70
- new_dsd = DatasetDict({'data': new_ds})
71
 
72
- new_dsd.push_to_hub(path, config_name=f'{name}_samples')
 
6
  from datasets import load_dataset, Dataset, DatasetDict
7
 
8
 
9
+ path = "pminervini/HaluEval"
10
 
11
  API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
12
  response = requests.get(API_URL)
13
  res_json = response.json()
14
 
15
+ gold_splits = {"dialogue", "qa", "summarization", "general"}
16
 
17
+ available_splits = {split["config"] for split in res_json["splits"]} if "splits" in res_json else set()
18
 
19
  name_to_ds = dict()
20
 
21
  for name in gold_splits:
22
+ ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"})
23
  name_to_ds[name] = ds
24
  # if name not in available_splits:
25
  ds.push_to_hub(path, config_name=name)
 
35
  return res
36
 
37
 
38
+ for name in gold_splits - {"general"}:
39
  random.seed(42)
40
  ds = name_to_ds[name]
41
  new_entry_lst = []
42
+
43
+ for entry in ds["data"]:
44
  is_hallucinated = random.random() > 0.5
45
  new_entry = None
46
+ if name in {"qa"}:
47
  new_entry = {
48
+ "knowledge": entry["knowledge"],
49
+ "question": entry["question"],
50
+ "answer": entry[f'{"hallucinated" if is_hallucinated else "right"}_answer'],
51
+ "hallucination": "yes" if is_hallucinated else "no",
52
  }
53
+ if name in {"dialogue"}:
54
  new_entry = {
55
+ "knowledge": entry["knowledge"],
56
+ "dialogue_history": entry["dialogue_history"],
57
+ "response": entry[f'{"hallucinated" if is_hallucinated else "right"}_response'],
58
+ "hallucination": "yes" if is_hallucinated else "no",
59
  }
60
+ if name in {"summarization"}:
61
  new_entry = {
62
+ "document": entry["document"],
63
+ "summary": entry[f'{"hallucinated" if is_hallucinated else "right"}_summary'],
64
+ "hallucination": "yes" if is_hallucinated else "no",
65
  }
66
  assert new_entry is not None
67
  new_entry_lst += [new_entry]
68
  new_ds_map = list_to_dict(new_entry_lst)
69
  new_ds = Dataset.from_dict(new_ds_map)
70
+ new_dsd = DatasetDict({"data": new_ds})
71
 
72
+ new_dsd.push_to_hub(path, config_name=f"{name}_samples")
cli/isp-upload-cli.py CHANGED
@@ -5,16 +5,16 @@ import os
5
 
6
  from datasets import load_dataset
7
 
8
- folder_path = 'isp-data-json/' # Replace with your folder path
9
 
10
  # Search for all .json files in the folder
11
- json_files = glob.glob(os.path.join(folder_path, '*.jsonl'))
12
 
13
- path = 'pminervini/inverse-scaling'
14
 
15
  for json_path in json_files:
16
  base_name = os.path.basename(json_path)
17
  name = base_name.split("_")[0]
18
 
19
- ds = load_dataset("json", data_files={'data': json_path})
20
  ds.push_to_hub(path, config_name=name)
 
5
 
6
  from datasets import load_dataset
7
 
8
+ folder_path = "isp-data-json/" # Replace with your folder path
9
 
10
  # Search for all .json files in the folder
11
+ json_files = glob.glob(os.path.join(folder_path, "*.jsonl"))
12
 
13
+ path = "pminervini/inverse-scaling"
14
 
15
  for json_path in json_files:
16
  base_name = os.path.basename(json_path)
17
  name = base_name.split("_")[0]
18
 
19
+ ds = load_dataset("json", data_files={"data": json_path})
20
  ds.push_to_hub(path, config_name=name)
cli/nqswap-upload-cli.py CHANGED
@@ -2,11 +2,7 @@
2
 
3
  from datasets import load_dataset
4
 
5
- path = 'pminervini/NQ-Swap'
6
 
7
- ds = load_dataset("json",
8
- data_files={
9
- 'original': 'nqswap/original.jsonl',
10
- 'substituted': 'nqswap/substituted.jsonl'
11
- })
12
  ds.push_to_hub(path)
 
2
 
3
  from datasets import load_dataset
4
 
5
+ path = "pminervini/NQ-Swap"
6
 
7
+ ds = load_dataset("json", data_files={"original": "nqswap/original.jsonl", "substituted": "nqswap/substituted.jsonl"})
8
  ds.push_to_hub(path)
cli/shroom-upload-cli.py CHANGED
@@ -4,9 +4,9 @@ import json
4
  from datasets import Dataset, DatasetDict
5
 
6
  file_path = "shroom-data/val.model-agnostic.json"
7
- ds_path = 'pminervini/shroom'
8
 
9
- with open(file_path, 'r') as file:
10
  data = json.load(file)
11
 
12
 
@@ -15,7 +15,7 @@ def convert(list_of_dicts):
15
  for d in list_of_dicts:
16
  for key, value in d.items():
17
  dict_of_lists.setdefault(key, []).append(value)
18
- return dict_of_lists
19
 
20
 
21
  task_to_data_map = {}
 
4
  from datasets import Dataset, DatasetDict
5
 
6
  file_path = "shroom-data/val.model-agnostic.json"
7
+ ds_path = "pminervini/shroom"
8
 
9
+ with open(file_path, "r") as file:
10
  data = json.load(file)
11
 
12
 
 
15
  for d in list_of_dicts:
16
  for key, value in d.items():
17
  dict_of_lists.setdefault(key, []).append(value)
18
+ return dict_of_lists
19
 
20
 
21
  task_to_data_map = {}
cli/submit-cli.py CHANGED
@@ -15,7 +15,9 @@ from src.backend.manage_requests import get_eval_requests
15
  from src.backend.manage_requests import EvalRequest
16
 
17
 
18
- def add_new_eval(model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str):
19
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
20
 
21
  user_name = ""
@@ -37,7 +39,9 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
37
 
38
  # Is the model on the hub?
39
  if weight_type in ["Delta", "Adapter"]:
40
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
41
  if not base_model_on_hub:
42
  print(f'Base model "{base_model}" {error}')
43
  return
@@ -57,7 +61,7 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
57
 
58
  model_size = get_model_size(model_info=model_info, precision=precision)
59
 
60
- license = 'none'
61
  try:
62
  license = model_info.cardData["license"]
63
  except Exception:
@@ -101,13 +105,20 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
101
  f.write(json.dumps(eval_entry))
102
 
103
  print("Uploading eval file")
104
- API.upload_file(path_or_fileobj=out_path, path_in_repo=out_path.split("eval-queue/")[1],
105
- repo_id=QUEUE_REPO, repo_type="dataset", commit_message=f"Add {model} to eval queue")
 
 
107
  # Remove the local file
108
  os.remove(out_path)
109
 
110
- print("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
111
  return
112
 
113
 
@@ -122,12 +133,14 @@ def main():
122
  def custom_filter(m) -> bool:
123
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
124
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
125
- res = 'mistralai/' in m.id
126
  return res
127
 
128
  filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
129
 
130
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
131
 
132
  PENDING_STATUS = "PENDING"
133
  RUNNING_STATUS = "RUNNING"
@@ -137,7 +150,9 @@ def main():
137
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
138
 
139
  # Get all eval requests
140
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
141
 
142
  requested_model_names = {e.model for e in eval_requests}
143
 
@@ -146,25 +161,33 @@ def main():
146
  for i in range(min(200, len(filtered_model_lst))):
147
  model = filtered_model_lst[i]
148
 
149
- print(f'Considering {model.id} ..')
150
 
151
- is_finetuned = any(tag.startswith('base_model:') for tag in model.tags)
152
 
153
- model_type = 'pretrained'
154
  if is_finetuned:
155
  model_type = "fine-tuned"
156
 
157
- is_instruction_tuned = 'nstruct' in model.id
158
  if is_instruction_tuned:
159
  model_type = "instruction-tuned"
160
 
161
  if model.id not in requested_model_names:
162
 
163
- if 'mage' not in model.id:
164
- add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
165
  time.sleep(10)
166
  else:
167
- print(f'Model {model.id} already added, not adding it to the queue again.')
168
 
169
 
170
  if __name__ == "__main__":
 
15
  from src.backend.manage_requests import EvalRequest
16
 
17
 
18
+ def add_new_eval(
19
+ model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str
20
+ ):
21
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
22
 
23
  user_name = ""
 
39
 
40
  # Is the model on the hub?
41
  if weight_type in ["Delta", "Adapter"]:
42
+ base_model_on_hub, error, _ = is_model_on_hub(
43
+ model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
44
+ )
45
  if not base_model_on_hub:
46
  print(f'Base model "{base_model}" {error}')
47
  return
 
61
 
62
  model_size = get_model_size(model_info=model_info, precision=precision)
63
 
64
+ license = "none"
65
  try:
66
  license = model_info.cardData["license"]
67
  except Exception:
 
105
  f.write(json.dumps(eval_entry))
106
 
107
  print("Uploading eval file")
108
+ API.upload_file(
109
+ path_or_fileobj=out_path,
110
+ path_in_repo=out_path.split("eval-queue/")[1],
111
+ repo_id=QUEUE_REPO,
112
+ repo_type="dataset",
113
+ commit_message=f"Add {model} to eval queue",
114
+ )
115
 
116
  # Remove the local file
117
  os.remove(out_path)
118
 
119
+ print(
120
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
121
+ )
122
  return
123
 
124
 
 
133
  def custom_filter(m) -> bool:
134
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
135
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
136
+ res = "mistralai/" in m.id
137
  return res
138
 
139
  filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
140
 
141
+ snapshot_download(
142
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
143
+ )
144
 
145
  PENDING_STATUS = "PENDING"
146
  RUNNING_STATUS = "RUNNING"
 
150
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
151
 
152
  # Get all eval requests
153
+ eval_requests: list[EvalRequest] = get_eval_requests(
154
+ job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
155
+ )
156
 
157
  requested_model_names = {e.model for e in eval_requests}
158
 
 
161
  for i in range(min(200, len(filtered_model_lst))):
162
  model = filtered_model_lst[i]
163
 
164
+ print(f"Considering {model.id} ..")
165
 
166
+ is_finetuned = any(tag.startswith("base_model:") for tag in model.tags)
167
 
168
+ model_type = "pretrained"
169
  if is_finetuned:
170
  model_type = "fine-tuned"
171
 
172
+ is_instruction_tuned = "nstruct" in model.id
173
  if is_instruction_tuned:
174
  model_type = "instruction-tuned"
175
 
176
  if model.id not in requested_model_names:
177
 
178
+ if "mage" not in model.id:
179
+ add_new_eval(
180
+ model=model.id,
181
+ base_model="",
182
+ revision="main",
183
+ precision="float32",
184
+ private=False,
185
+ weight_type="Original",
186
+ model_type=model_type,
187
+ )
188
  time.sleep(10)
189
  else:
190
+ print(f"Model {model.id} already added, not adding it to the queue again.")
191
 
192
 
193
  if __name__ == "__main__":
cli/sync-open-llm-cli.py CHANGED
@@ -10,6 +10,7 @@ from src.envs import QUEUE_REPO, API
10
  from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
11
  from src.utils import my_snapshot_download
12
 
 
13
  def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
14
  for i in range(10):
15
  try:
@@ -29,8 +30,12 @@ def set_eval_request(api: HfApi, json_filepath: str, hf_repo: str, local_dir: st
29
  with open(json_filepath, "w") as f:
30
  f.write(json.dumps(data))
31
 
32
- api.upload_file(path_or_fileobj=json_filepath, path_in_repo=json_filepath.replace(local_dir, ""),
33
- repo_id=hf_repo, repo_type="dataset")
34
 
35
 
36
  def get_request_file_for_model(data, requests_path):
@@ -54,6 +59,7 @@ def get_request_file_for_model(data, requests_path):
54
  request_file = tmp_request_file
55
  return request_file
56
 
 
57
  def update_model_type(data, requests_path):
58
  open_llm_request_file = get_request_file_for_model(data, requests_path)
59
 
@@ -71,21 +77,33 @@ def read_and_write_json_files(directory, requests_path_open_llm):
71
  for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
72
  for file in files:
73
  # Check if the file is a JSON file
74
- if file.endswith('.json'):
75
  file_path = os.path.join(subdir, file)
76
  # Open and read the JSON file
77
- with open(file_path, 'r') as json_file:
78
  data = json.load(json_file)
79
  sucess, data = update_model_type(data, requests_path_open_llm)
80
  if sucess:
81
- with open(file_path, 'w') as json_file:
82
  json.dump(data, json_file)
83
- my_set_eval_request(api=API, json_filepath=file_path, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC)
84
-
85
-
86
 
87
 
88
  if __name__ == "__main__":
89
- my_snapshot_download(repo_id=QUEUE_REPO_OPEN_LLM, revision="main", local_dir=EVAL_REQUESTS_PATH_OPEN_LLM, repo_type="dataset", max_workers=60)
90
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC, repo_type="dataset", max_workers=60)
91
- read_and_write_json_files(EVAL_REQUESTS_PATH_BACKEND_SYNC, EVAL_REQUESTS_PATH_OPEN_LLM)
 
10
  from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
11
  from src.utils import my_snapshot_download
12
 
13
+
14
  def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
15
  for i in range(10):
16
  try:
 
30
  with open(json_filepath, "w") as f:
31
  f.write(json.dumps(data))
32
 
33
+ api.upload_file(
34
+ path_or_fileobj=json_filepath,
35
+ path_in_repo=json_filepath.replace(local_dir, ""),
36
+ repo_id=hf_repo,
37
+ repo_type="dataset",
38
+ )
39
 
40
 
41
  def get_request_file_for_model(data, requests_path):
 
59
  request_file = tmp_request_file
60
  return request_file
61
 
62
+
63
  def update_model_type(data, requests_path):
64
  open_llm_request_file = get_request_file_for_model(data, requests_path)
65
 
 
77
  for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
78
  for file in files:
79
  # Check if the file is a JSON file
80
+ if file.endswith(".json"):
81
  file_path = os.path.join(subdir, file)
82
  # Open and read the JSON file
83
+ with open(file_path, "r") as json_file:
84
  data = json.load(json_file)
85
  sucess, data = update_model_type(data, requests_path_open_llm)
86
  if sucess:
87
+ with open(file_path, "w") as json_file:
88
  json.dump(data, json_file)
89
+ my_set_eval_request(
90
+ api=API, json_filepath=file_path, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC
91
+ )
92
 
93
 
94
  if __name__ == "__main__":
95
+ my_snapshot_download(
96
+ repo_id=QUEUE_REPO_OPEN_LLM,
97
+ revision="main",
98
+ local_dir=EVAL_REQUESTS_PATH_OPEN_LLM,
99
+ repo_type="dataset",
100
+ max_workers=60,
101
+ )
102
+ my_snapshot_download(
103
+ repo_id=QUEUE_REPO,
104
+ revision="main",
105
+ local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC,
106
+ repo_type="dataset",
107
+ max_workers=60,
108
+ )
109
+ read_and_write_json_files(EVAL_REQUESTS_PATH_BACKEND_SYNC, EVAL_REQUESTS_PATH_OPEN_LLM)
cli/truefalse-upload-cli.py CHANGED
@@ -5,11 +5,11 @@ import os
5
 
6
  from datasets import load_dataset
7
 
8
- path = 'pminervini/true-false'
9
- folder_path = 'true-false-data/' # Replace with your folder path
10
 
11
  # Search for all .json files in the folder
12
- csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
13
 
14
  ds = load_dataset("csv", data_files={os.path.basename(path).split("_")[0]: path for path in csv_files})
15
  ds.push_to_hub(path)
 
5
 
6
  from datasets import load_dataset
7
 
8
+ path = "pminervini/true-false"
9
+ folder_path = "true-false-data/" # Replace with your folder path
10
 
11
  # Search for all .json files in the folder
12
+ csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
13
 
14
  ds = load_dataset("csv", data_files={os.path.basename(path).split("_")[0]: path for path in csv_files})
15
  ds.push_to_hub(path)
src/backend/envs.py CHANGED
@@ -63,6 +63,6 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
63
  EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
64
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
65
 
66
- DEVICE = "cuda" if torch.cuda.is_available() else 'cpu'
67
 
68
  LIMIT = None # Testing; needs to be None
 
63
  EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
64
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
65
 
66
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
67
 
68
  LIMIT = None # Testing; needs to be None
src/backend/huggingface_generate_until.py CHANGED
@@ -5,7 +5,8 @@ import transformers
5
  from lm_eval.models.huggingface import HFLM
6
  from lm_eval.api.registry import register_model
7
 
8
- @register_model('hf-chat')
 
9
  class HFLMwithChatTemplate(HFLM):
10
  def __init__(self, use_chat_template=True, **kwargs):
11
  super().__init__(**kwargs)
@@ -49,9 +50,7 @@ class HFLMwithChatTemplate(HFLM):
49
  )
50
  if left_truncate_len:
51
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
52
- encoding["attention_mask"] = encoding["attention_mask"][
53
- :, -left_truncate_len:
54
- ]
55
  self.tokenizer.padding_side = old_padding_side
56
 
57
- return encoding["input_ids"], encoding["attention_mask"]
 
5
  from lm_eval.models.huggingface import HFLM
6
  from lm_eval.api.registry import register_model
7
 
8
+
9
+ @register_model("hf-chat")
10
  class HFLMwithChatTemplate(HFLM):
11
  def __init__(self, use_chat_template=True, **kwargs):
12
  super().__init__(**kwargs)
 
50
  )
51
  if left_truncate_len:
52
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
53
+ encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
54
  self.tokenizer.padding_side = old_padding_side
55
 
56
+ return encoding["input_ids"], encoding["attention_mask"]
src/backend/manage_requests.py CHANGED
@@ -17,24 +17,27 @@ class EvalRequest:
17
  weight_type: str = "Original"
18
  model_type: str = "" # pretrained, finetuned, with RL
19
  precision: str = "" # float16, bfloat16
20
- base_model: Optional[str] = None # for adapter models
21
- revision: str = "main" # commit
22
- submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
23
  model_type: Optional[str] = None
24
  likes: Optional[int] = 0
25
  params: Optional[int] = None
26
  license: Optional[str] = ""
 
27
  def get_model_args(self) -> str:
28
- model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
29
 
30
  if self.precision in ["float16", "float32", "bfloat16"]:
31
  model_args += f",dtype={self.precision}"
32
- # Quantized models need some added config, the install of bits and bytes, etc
33
- #elif self.precision == "8bit":
34
- # model_args += ",load_in_8bit=True"
35
- #elif self.precision == "4bit":
36
- # model_args += ",load_in_4bit=True"
37
- #elif self.precision == "GPTQ":
38
  # A GPTQ model does not need dtype to be specified,
39
  # it will be inferred from the config
40
  pass
@@ -55,8 +58,12 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
55
  with open(json_filepath, "w") as f:
56
  f.write(json.dumps(data))
57
 
58
- api.upload_file(path_or_fileobj=json_filepath, path_in_repo=json_filepath.replace(local_dir, ""),
59
- repo_id=hf_repo, repo_type="dataset")
60
 
61
 
62
  def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_download: bool = True) -> list[EvalRequest]:
@@ -68,7 +75,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
68
  `list[EvalRequest]`: a list of model info dicts.
69
  """
70
  if do_download:
71
- my_snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
72
 
73
  json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
74
 
@@ -81,8 +90,8 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
81
  # breakpoint()
82
  data["json_filepath"] = json_filepath
83
 
84
- if 'job_id' in data:
85
- del data['job_id']
86
 
87
  eval_request = EvalRequest(**data)
88
  eval_requests.append(eval_request)
@@ -90,10 +99,20 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
90
  return eval_requests
91
 
92
 
93
- def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_status: str, completed_status: str,
94
- failed_status: str, hf_repo_results: str, local_dir_results: str):
95
  """Checks if the currently running evals are completed, if yes, update their status on the hub."""
96
- my_snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)
97
 
98
  running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)
99
 
@@ -109,5 +128,3 @@ def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_stat
109
  if output_file_exists:
110
  print(f"EXISTS output file exists for {model} setting it to {completed_status}")
111
  set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
112
-
113
-
 
17
  weight_type: str = "Original"
18
  model_type: str = "" # pretrained, finetuned, with RL
19
  precision: str = "" # float16, bfloat16
20
+ base_model: Optional[str] = None # for adapter models
21
+ revision: str = "main" # commit
22
+ submitted_time: Optional[str] = (
23
+ "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
24
+ )
25
  model_type: Optional[str] = None
26
  likes: Optional[int] = 0
27
  params: Optional[int] = None
28
  license: Optional[str] = ""
29
+
30
  def get_model_args(self) -> str:
31
+ model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
32
 
33
  if self.precision in ["float16", "float32", "bfloat16"]:
34
  model_args += f",dtype={self.precision}"
35
+ # Quantized models need some added config, the install of bits and bytes, etc
36
+ # elif self.precision == "8bit":
37
+ # model_args += ",load_in_8bit=True"
38
+ # elif self.precision == "4bit":
39
+ # model_args += ",load_in_4bit=True"
40
+ # elif self.precision == "GPTQ":
41
  # A GPTQ model does not need dtype to be specified,
42
  # it will be inferred from the config
43
  pass
 
58
  with open(json_filepath, "w") as f:
59
  f.write(json.dumps(data))
60
 
61
+ api.upload_file(
62
+ path_or_fileobj=json_filepath,
63
+ path_in_repo=json_filepath.replace(local_dir, ""),
64
+ repo_id=hf_repo,
65
+ repo_type="dataset",
66
+ )
67
 
68
 
69
  def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_download: bool = True) -> list[EvalRequest]:
 
75
  `list[EvalRequest]`: a list of model info dicts.
76
  """
77
  if do_download:
78
+ my_snapshot_download(
79
+ repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60
80
+ )
81
 
82
  json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
83
 
 
90
  # breakpoint()
91
  data["json_filepath"] = json_filepath
92
 
93
+ if "job_id" in data:
94
+ del data["job_id"]
95
 
96
  eval_request = EvalRequest(**data)
97
  eval_requests.append(eval_request)
 
99
  return eval_requests
100
 
101
 
102
+ def check_completed_evals(
103
+ api: HfApi,
104
+ hf_repo: str,
105
+ local_dir: str,
106
+ checked_status: str,
107
+ completed_status: str,
108
+ failed_status: str,
109
+ hf_repo_results: str,
110
+ local_dir_results: str,
111
+ ):
112
  """Checks if the currently running evals are completed, if yes, update their status on the hub."""
113
+ my_snapshot_download(
114
+ repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60
115
+ )
116
 
117
  running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)
118
 
 
128
  if output_file_exists:
129
  print(f"EXISTS output file exists for {model} setting it to {completed_status}")
130
  set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
src/backend/moe_infinity.py CHANGED
@@ -8,17 +8,18 @@ from typing import List, Tuple, Optional, Union
8
  from lm_eval.models.huggingface import HFLM
9
  from lm_eval.api.registry import register_model
10
 
11
- @register_model('moe-infinity')
 
12
  class MoEHFLM(HFLM):
13
  def __init__(
14
  self,
15
  pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
16
  moe_config: dict = None,
17
- offload_path = os.path.expanduser('~'),
18
- device_memory_ratio = 0.75,
19
  use_chat_template=True,
20
  *args,
21
- **kwargs
22
  ):
23
  # Initialize parent class without calling _create_model in the parent's __init__
24
  self.checkpoint = pretrained
@@ -28,7 +29,9 @@ class MoEHFLM(HFLM):
28
  self.use_chat_template = use_chat_template
29
  if "device" in kwargs:
30
  kwargs.pop("device")
31
- super().__init__(*args, **kwargs, pretrained=pretrained, device_map="cuda:0") # Assuming HFLM accepts a 'pretrained' arg and handles it
32
  # self._create_model()
33
 
34
  def _create_model(self, *args, **kwargs):
@@ -43,7 +46,9 @@ class MoEHFLM(HFLM):
43
  # Update default config with any user-provided config
44
  final_moe_config = {**default_moe_config, **self.moe_config}
45
  # self._model = MoE(self.checkpoint, final_moe_config)
46
- self._model = AutoModelForCausalLM.from_pretrained(self.checkpoint, torch_dtype=torch.float16, device_map="auto")
47
 
48
  @property
49
  def max_length(self):
@@ -94,9 +99,7 @@ class MoEHFLM(HFLM):
94
  )
95
  if left_truncate_len:
96
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
97
- encoding["attention_mask"] = encoding["attention_mask"][
98
- :, -left_truncate_len:
99
- ]
100
  self.tokenizer.padding_side = old_padding_side
101
 
102
  return encoding["input_ids"], encoding["attention_mask"]
 
8
  from lm_eval.models.huggingface import HFLM
9
  from lm_eval.api.registry import register_model
10
 
11
+
12
+ @register_model("moe-infinity")
13
  class MoEHFLM(HFLM):
14
  def __init__(
15
  self,
16
  pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
17
  moe_config: dict = None,
18
+ offload_path=os.path.expanduser("~"),
19
+ device_memory_ratio=0.75,
20
  use_chat_template=True,
21
  *args,
22
+ **kwargs,
23
  ):
24
  # Initialize parent class without calling _create_model in the parent's __init__
25
  self.checkpoint = pretrained
 
29
  self.use_chat_template = use_chat_template
30
  if "device" in kwargs:
31
  kwargs.pop("device")
32
+ super().__init__(
33
+ *args, **kwargs, pretrained=pretrained, device_map="cuda:0"
34
+ ) # Assuming HFLM accepts a 'pretrained' arg and handles it
35
  # self._create_model()
36
 
37
  def _create_model(self, *args, **kwargs):
 
46
  # Update default config with any user-provided config
47
  final_moe_config = {**default_moe_config, **self.moe_config}
48
  # self._model = MoE(self.checkpoint, final_moe_config)
49
+ self._model = AutoModelForCausalLM.from_pretrained(
50
+ self.checkpoint, torch_dtype=torch.float16, device_map="auto"
51
+ )
52
 
53
  @property
54
  def max_length(self):
 
99
  )
100
  if left_truncate_len:
101
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
102
+ encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
103
  self.tokenizer.padding_side = old_padding_side
104
 
105
  return encoding["input_ids"], encoding["attention_mask"]
src/backend/run_eval_suite.py CHANGED
@@ -14,7 +14,17 @@ from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
14
  from src.backend.huggingface_generate_until import HFLMwithChatTemplate
15
  from src.backend.moe_infinity import MoEHFLM
16
 
17
- def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
18
  if limit:
19
  print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
20
 
@@ -33,30 +43,34 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
33
 
34
  print(f"Selected Tasks: {task_names}")
35
  print(f"Eval Request: {eval_request.get_model_args()}")
36
- print(f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}")
37
  # hf-chat is implemented to use apply_chat_template
38
- results = evaluator.simple_evaluate(model="moe-infinity", # "hf-causal-experimental", # "hf-causal", hf-chat
39
- model_args=eval_request.get_model_args(),
40
- tasks=task_names,
41
- num_fewshot=num_fewshot,
42
- batch_size=batch_size,
43
- max_batch_size=8,
44
- device=device,
45
- use_cache=use_cache,
46
- limit=limit,
47
- write_out=True,
48
- task_manager=task_manager)
49
 
50
  results["config"]["model_dtype"] = eval_request.precision
51
  results["config"]["model_name"] = eval_request.model
52
  results["config"]["model_sha"] = eval_request.revision
53
 
54
  if max_nb_samples is not None:
55
- if 'samples' in results:
56
- samples = results['samples']
57
  for task_name in samples.keys():
58
  if len(samples[task_name]) > max_nb_samples:
59
- results['samples'][task_name] = results['samples'][task_name][:max_nb_samples]
60
 
61
  # print(evaluator.make_table(results))
62
 
 
14
  from src.backend.huggingface_generate_until import HFLMwithChatTemplate
15
  from src.backend.moe_infinity import MoEHFLM
16
 
17
+
18
+ def run_evaluation(
19
+ eval_request: EvalRequest,
20
+ task_names,
21
+ num_fewshot,
22
+ batch_size,
23
+ device,
24
+ use_cache=None,
25
+ limit=None,
26
+ max_nb_samples=100,
27
+ ) -> dict:
28
  if limit:
29
  print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
30
 
 
43
 
44
  print(f"Selected Tasks: {task_names}")
45
  print(f"Eval Request: {eval_request.get_model_args()}")
46
+ print(
47
+ f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}"
48
+ )
49
  # hf-chat is implemented to use apply_chat_template
50
+ results = evaluator.simple_evaluate(
51
+ model="moe-infinity", # "hf-causal-experimental", # "hf-causal", hf-chat
52
+ model_args=eval_request.get_model_args(),
53
+ tasks=task_names,
54
+ num_fewshot=num_fewshot,
55
+ batch_size=batch_size,
56
+ max_batch_size=8,
57
+ device=device,
58
+ use_cache=use_cache,
59
+ limit=limit,
60
+ write_out=True,
61
+ task_manager=task_manager,
62
+ )
63
 
64
  results["config"]["model_dtype"] = eval_request.precision
65
  results["config"]["model_name"] = eval_request.model
66
  results["config"]["model_sha"] = eval_request.revision
67
 
68
  if max_nb_samples is not None:
69
+ if "samples" in results:
70
+ samples = results["samples"]
71
  for task_name in samples.keys():
72
  if len(samples[task_name]) > max_nb_samples:
73
+ results["samples"][task_name] = results["samples"][task_name][:max_nb_samples]
74
 
75
  # print(evaluator.make_table(results))
76
 
src/backend/tasks/cnndm/task.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -66,7 +67,7 @@ class CNNDM(ConfigurableTask):
66
  DATASET_NAME = "3.0.0"
67
 
68
  def __init__(self):
69
- super().__init__(config={'metadata': {'version': self.VERSION}})
70
  self.factkb_tokenizer = None
71
  self.factkb_model = None
72
  self.bert_score = None
@@ -74,12 +75,18 @@ class CNNDM(ConfigurableTask):
74
  def maybe_init_factkb(self):
75
  if self.factkb_tokenizer is None or self.factkb_model is None:
76
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
77
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
78
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
79
 
80
  def maybe_init_bertscore(self):
81
  if self.bert_score is None:
82
  from evaluate import load
 
83
  self.bert_score = load("bertscore")
84
 
85
  def has_training_docs(self):
@@ -125,15 +132,7 @@ class CNNDM(ConfigurableTask):
125
  part of the document for `doc`.
126
  """
127
 
128
- return [
129
- Instance(
130
- request_type="generate_until",
131
- doc=doc,
132
- arguments=(ctx, {"until": ["\n"]}),
133
- idx=0,
134
- **kwargs
135
- )
136
- ]
137
 
138
  def process_results(self, doc, results):
139
  completion = results[0]
@@ -157,12 +156,16 @@ class CNNDM(ConfigurableTask):
157
 
158
  self.maybe_init_factkb()
159
  input_factkb = [[completion, document]]
160
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
161
  factkb_logits = self.factkb_model(**factkb_tokens).logits
162
  factkb_res = torch.softmax(factkb_logits, dim=1)
163
 
164
  self.maybe_init_bertscore()
165
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
166
 
167
  res = {
168
  "rouge1": rouge1_scores[0],
@@ -171,7 +174,7 @@ class CNNDM(ConfigurableTask):
171
  "factKB": float(factkb_res[0][1]),
172
  "bertscore_precision": float(bert_score_res["precision"][0]),
173
  "bertscore_recall": float(bert_score_res["recall"][0]),
174
- "bertscore_f1": float(bert_score_res["f1"][0])
175
  }
176
 
177
  return res
@@ -182,7 +185,18 @@ class CNNDM(ConfigurableTask):
182
  A dictionary where keys are the names of submetrics and values are
183
  functions that aggregate a list of metrics
184
  """
185
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
186
 
187
  def higher_is_better(self):
188
  """
@@ -190,5 +204,15 @@ class CNNDM(ConfigurableTask):
190
  A dictionary where keys are the names of submetrics and values are
191
  whether a higher value of the submetric is better
192
  """
193
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
194
-
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
67
  DATASET_NAME = "3.0.0"
68
 
69
  def __init__(self):
70
+ super().__init__(config={"metadata": {"version": self.VERSION}})
71
  self.factkb_tokenizer = None
72
  self.factkb_model = None
73
  self.bert_score = None
 
75
  def maybe_init_factkb(self):
76
  if self.factkb_tokenizer is None or self.factkb_model is None:
77
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
78
+
79
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
80
+ "roberta-base", padding="max_length", truncation=True
81
+ )
82
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
83
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
84
+ )
85
 
86
  def maybe_init_bertscore(self):
87
  if self.bert_score is None:
88
  from evaluate import load
89
+
90
  self.bert_score = load("bertscore")
91
 
92
  def has_training_docs(self):
 
132
  part of the document for `doc`.
133
  """
134
 
135
+ return [Instance(request_type="generate_until", doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, **kwargs)]
136
 
137
  def process_results(self, doc, results):
138
  completion = results[0]
 
156
 
157
  self.maybe_init_factkb()
158
  input_factkb = [[completion, document]]
159
+ factkb_tokens = self.factkb_tokenizer(
160
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
161
+ ).to(self.factkb_model.device)
162
  factkb_logits = self.factkb_model(**factkb_tokens).logits
163
  factkb_res = torch.softmax(factkb_logits, dim=1)
164
 
165
  self.maybe_init_bertscore()
166
+ bert_score_res = self.bert_score.compute(
167
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
168
+ )
169
 
170
  res = {
171
  "rouge1": rouge1_scores[0],
 
174
  "factKB": float(factkb_res[0][1]),
175
  "bertscore_precision": float(bert_score_res["precision"][0]),
176
  "bertscore_recall": float(bert_score_res["recall"][0]),
177
+ "bertscore_f1": float(bert_score_res["f1"][0]),
178
  }
179
 
180
  return res
 
185
  A dictionary where keys are the names of submetrics and values are
186
  functions that aggregate a list of metrics
187
  """
188
+ return {
189
+ k: mean
190
+ for k in [
191
+ "rouge1",
192
+ "rouge2",
193
+ "rougeL",
194
+ "factKB",
195
+ "bertscore_precision",
196
+ "bertscore_recall",
197
+ "bertscore_f1",
198
+ ]
199
+ }
200
 
201
  def higher_is_better(self):
202
  """
 
204
  A dictionary where keys are the names of submetrics and values are
205
  whether a higher value of the submetric is better
206
  """
207
+ return {
208
+ k: True
209
+ for k in [
210
+ "rouge1",
211
+ "rouge2",
212
+ "rougeL",
213
+ "factKB",
214
+ "bertscore_precision",
215
+ "bertscore_recall",
216
+ "bertscore_f1",
217
+ ]
218
+ }
src/backend/tasks/cnndm/task_v2.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -66,8 +67,12 @@ class CNNDMv2(ConfigurableTask):
66
  DATASET_NAME = "3.0.0"
67
 
68
  def __init__(self):
69
- super().__init__(config={'metadata': {'version': self.VERSION},
70
- 'generation_kwargs': {'do_sample': False, 'temperature': 0.0, 'until': ['\n', '\n\n']}})
71
  self.factkb_tokenizer = None
72
  self.factkb_model = None
73
  self.bert_score = None
@@ -75,12 +80,18 @@ class CNNDMv2(ConfigurableTask):
75
  def maybe_init_factkb(self):
76
  if self.factkb_tokenizer is None or self.factkb_model is None:
77
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
78
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
79
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
80
 
81
  def maybe_init_bertscore(self):
82
  if self.bert_score is None:
83
  from evaluate import load
 
84
  self.bert_score = load("bertscore")
85
 
86
  def has_training_docs(self):
@@ -134,15 +145,7 @@ class CNNDMv2(ConfigurableTask):
134
  part of the document for `doc`.
135
  """
136
 
137
- return [
138
- Instance(
139
- request_type="generate_until",
140
- doc=doc,
141
- arguments=(ctx, {"until": ["\n"]}),
142
- idx=0,
143
- **kwargs
144
- )
145
- ]
146
 
147
  def process_results(self, doc, results):
148
  completion = results[0]
@@ -166,12 +169,16 @@ class CNNDMv2(ConfigurableTask):
166
 
167
  self.maybe_init_factkb()
168
  input_factkb = [[completion, document]]
169
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
170
  factkb_logits = self.factkb_model(**factkb_tokens).logits
171
  factkb_res = torch.softmax(factkb_logits, dim=1)
172
 
173
  self.maybe_init_bertscore()
174
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
175
 
176
  res = {
177
  "rouge1": rouge1_scores[0],
@@ -180,7 +187,7 @@ class CNNDMv2(ConfigurableTask):
180
  "factKB": float(factkb_res[0][1]),
181
  "bertscore_precision": float(bert_score_res["precision"][0]),
182
  "bertscore_recall": float(bert_score_res["recall"][0]),
183
- "bertscore_f1": float(bert_score_res["f1"][0])
184
  }
185
 
186
  return res
@@ -191,7 +198,18 @@ class CNNDMv2(ConfigurableTask):
191
  A dictionary where keys are the names of submetrics and values are
192
  functions that aggregate a list of metrics
193
  """
194
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
195
 
196
  def higher_is_better(self):
197
  """
@@ -199,5 +217,15 @@ class CNNDMv2(ConfigurableTask):
199
  A dictionary where keys are the names of submetrics and values are
200
  whether a higher value of the submetric is better
201
  """
202
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
203
-
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
67
  DATASET_NAME = "3.0.0"
68
 
69
  def __init__(self):
70
+ super().__init__(
71
+ config={
72
+ "metadata": {"version": self.VERSION},
73
+ "generation_kwargs": {"do_sample": False, "temperature": 0.0, "until": ["\n", "\n\n"]},
74
+ }
75
+ )
76
  self.factkb_tokenizer = None
77
  self.factkb_model = None
78
  self.bert_score = None
 
80
  def maybe_init_factkb(self):
81
  if self.factkb_tokenizer is None or self.factkb_model is None:
82
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
83
+
84
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
85
+ "roberta-base", padding="max_length", truncation=True
86
+ )
87
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
88
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
89
+ )
90
 
91
  def maybe_init_bertscore(self):
92
  if self.bert_score is None:
93
  from evaluate import load
94
+
95
  self.bert_score = load("bertscore")
96
 
97
  def has_training_docs(self):
 
145
  part of the document for `doc`.
146
  """
147
 
148
 
149
 
150
  def process_results(self, doc, results):
151
  completion = results[0]
 
169
 
170
  self.maybe_init_factkb()
171
  input_factkb = [[completion, document]]
172
+ factkb_tokens = self.factkb_tokenizer(
173
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
174
+ ).to(self.factkb_model.device)
175
  factkb_logits = self.factkb_model(**factkb_tokens).logits
176
  factkb_res = torch.softmax(factkb_logits, dim=1)
177
 
178
  self.maybe_init_bertscore()
179
+ bert_score_res = self.bert_score.compute(
180
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
181
+ )
182
 
183
  res = {
184
  "rouge1": rouge1_scores[0],
 
187
  "factKB": float(factkb_res[0][1]),
188
  "bertscore_precision": float(bert_score_res["precision"][0]),
189
  "bertscore_recall": float(bert_score_res["recall"][0]),
190
+ "bertscore_f1": float(bert_score_res["f1"][0]),
191
  }
192
 
193
  return res
 
198
  A dictionary where keys are the names of submetrics and values are
199
  functions that aggregate a list of metrics
200
  """
201
+ return {
202
+ k: mean
203
+ for k in [
204
+ "rouge1",
205
+ "rouge2",
206
+ "rougeL",
207
+ "factKB",
208
+ "bertscore_precision",
209
+ "bertscore_recall",
210
+ "bertscore_f1",
211
+ ]
212
+ }
213
 
214
  def higher_is_better(self):
215
  """
 
217
  A dictionary where keys are the names of submetrics and values are
218
  whether a higher value of the submetric is better
219
  """
220
+ return {
221
+ k: True
222
+ for k in [
223
+ "rouge1",
224
+ "rouge2",
225
+ "rougeL",
226
+ "factKB",
227
+ "bertscore_precision",
228
+ "bertscore_recall",
229
+ "bertscore_f1",
230
+ ]
231
+ }
src/backend/tasks/faithdial/utils.py CHANGED
@@ -1,15 +1,16 @@
1
  from typing import List, Union
 
2
  ValueType = Union[str, List[str]]
3
 
4
 
5
  def doc_to_text(doc: dict[str, ValueType]) -> str:
6
- history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc['history'])])
7
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
8
  return doc_text
9
 
10
 
11
  def doc_to_text_v2(doc: dict[str, ValueType]) -> str:
12
- history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc['history'])])
13
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["original_response"]}\n#Hallucinated#:'
14
  return doc_text
15
 
 
1
  from typing import List, Union
2
+
3
  ValueType = Union[str, List[str]]
4
 
5
 
6
  def doc_to_text(doc: dict[str, ValueType]) -> str:
7
+ history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc["history"])])
8
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
9
  return doc_text
10
 
11
 
12
  def doc_to_text_v2(doc: dict[str, ValueType]) -> str:
13
+ history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc["history"])])
14
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["original_response"]}\n#Hallucinated#:'
15
  return doc_text
16
 
src/backend/tasks/halueval/utils.py CHANGED
@@ -83,13 +83,31 @@ You should try your best to determine if the summary contains non-factual or hal
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
- doc_text = QA_INSTURCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
87
  return doc_text
88
 
89
 
90
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
91
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
92
- doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
93
  return doc_text
94
 
95
 
@@ -103,7 +121,7 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
103
 
104
 
105
  def doc_to_target(doc: dict[str, str]) -> str:
106
- return doc['hallucination']
107
 
108
 
109
  def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
 
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
+ doc_text = (
87
+ QA_INSTURCTIONS
88
+ + "\n\n#Knowledge#: "
89
+ + doc["knowledge"]
90
+ + "\n#Question#: "
91
+ + doc["question"]
92
+ + "\n#Answer#: "
93
+ + doc["answer"]
94
+ + "\n#Your Judgement#:"
95
+ )
96
  return doc_text
97
 
98
 
99
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
100
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
101
+ doc_text = (
102
+ DIALOGUE_INSTRUCTIONS
103
+ + "\n\n#Knowledge#: "
104
+ + doc["knowledge"]
105
+ + "\n#Dialogue History#: "
106
+ + doc["dialogue_history"]
107
+ + "\n#Response#: "
108
+ + doc["response"]
109
+ + "\n#Your Judgement#:"
110
+ )
111
  return doc_text
112
 
113
 
 
121
 
122
 
123
  def doc_to_target(doc: dict[str, str]) -> str:
124
+ return doc["hallucination"]
125
 
126
 
127
  def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -3,6 +3,7 @@ from typing import Union, List
3
 
4
  from lm_eval.api.task import ConfigurableTask
5
  from lm_eval.api.instance import Instance
 
6
  # from lm_eval.api.registry import register_task
7
  from lm_eval.api.metrics import mean
8
 
@@ -17,26 +18,31 @@ class SelfCheckGPT(ConfigurableTask):
17
  VERSION = 0.0
18
  DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
19
  DATASET_NAME = None
20
- OUTPUT_TYPE = 'generate_until'
21
 
22
  def __init__(self):
23
- super().__init__(config={'metadata': {'version': self.VERSION}})
24
 # these end tokens are hard coded because of the current limitation of the llm-eval.
25
 self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
26
 self.generation_kwargs_sampling_number = 5 # the number of samples drawn for the self-consistency check
27
- self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
28
-
29
- self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
30
- self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
31
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
32
 
33
- if self.selfcheckgpt_type == 'SelfCheckNgram':
34
  self.selfcheckgpt = SelfCheckNgram(n=1)
35
- elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
36
  self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
37
- elif self.selfcheckgpt_type == 'SelfCheckMQAG':
38
  self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
39
- elif self.selfcheckgpt_type == 'SelfCheckNLI':
40
  self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
41
  self.SelfCheckNLI_error_cnt = 0
42
 
@@ -53,10 +59,10 @@ class SelfCheckGPT(ConfigurableTask):
53
  return self.dataset["evaluation"]
54
 
55
  def doc_to_text(self, doc):
56
- if not hasattr(self, 'selfcheckgpt_nlp'):
57
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
58
 
59
- sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc['wiki_bio_text']).sents]
60
  if len(sentences) < 2:
61
  raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
62
  # disscussed with Potsawee
@@ -65,18 +71,19 @@ class SelfCheckGPT(ConfigurableTask):
65
  return doc_text
66
 
67
  def doc_to_target(self, doc):
68
- answer = doc['wiki_bio_text']
69
  return answer
70
 
71
  def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
72
  arguments = (ctx, self.generation_kwargs)
73
  request_list = [
74
- Instance(request_type='generate_until', doc=doc, arguments=arguments, idx=0, **kwargs),
75
  ]
76
  sampling_arguments = (ctx, self.generation_kwargs_sampling)
77
- request_list.extend([
78
- Instance(request_type='generate_until', doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
79
- for idx in range(1, self.generation_kwargs_sampling_number+1)
 
80
  ]
81
  )
82
  return request_list
@@ -88,48 +95,53 @@ class SelfCheckGPT(ConfigurableTask):
88
 
89
  sentences = self.selfcheckgpt_nlp(response_temperature_0)
90
  sentences = [sent.text.strip() for sent in sentences.sents]
91
- if self.selfcheckgpt_type == 'SelfCheckNgram':
92
- selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses)
93
  return {
94
- 'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
95
- 'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']
96
  }
97
 
98
- elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
99
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
100
- elif self.selfcheckgpt_type == 'SelfCheckMQAG':
101
  selfcheckgpt_scores = self.selfcheckgpt.predict(
102
  sentences=sentences,
103
  passage=response_temperature_0,
104
  sampled_passages=other_responses,
105
- num_questions_per_sent=5, # number of questions to be drawn
106
- scoring_method='bayes_with_alpha', # options = 'counting', 'bayes', 'bayes_with_alpha'
107
- beta1=0.8, beta2=0.8) # additional params depending on scoring_method
108
- elif self.selfcheckgpt_type == 'SelfCheckNLI':
109
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
110
 
111
  if len(selfcheckgpt_scores) < 2:
112
  # at least two sentences
113
  self.SelfCheckNLI_error_cnt += 1
114
- result = {
115
- 'avg-selfcheckgpt': 0.0,
116
- 'max-selfcheckgpt': 0.0
117
- }
118
 
119
  else:
120
- threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
121
 # the passage counts as hallucinated if any single sentence is hallucinated. It's very strict.
122
 selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
123
 # the passage counts as hallucinated if the average score over all sentences exceeds the threshold.
124
- selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
125
- result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
126
 
127
  return result
128
 
129
- selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
130
  selfcheckgpt_scores_max = max(selfcheckgpt_scores)
131
 
132
- return {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
133
 
134
  def aggregation(self):
135
  """
 
3
 
4
  from lm_eval.api.task import ConfigurableTask
5
  from lm_eval.api.instance import Instance
6
+
7
  # from lm_eval.api.registry import register_task
8
  from lm_eval.api.metrics import mean
9
 
 
18
  VERSION = 0.0
19
  DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
20
  DATASET_NAME = None
21
+ OUTPUT_TYPE = "generate_until"
22
 
23
  def __init__(self):
24
+ super().__init__(config={"metadata": {"version": self.VERSION}})
25
 # these end tokens are hard coded because of the current limitation of the llm-eval.
26
 self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
27
 self.generation_kwargs_sampling_number = 5 # the number of samples drawn for the self-consistency check
28
+ self.generation_kwargs_sampling = {
29
+ "temperature": 0.99,
30
+ "do_sample": True,
31
+ "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"],
32
+ "max_length": 512,
33
+ }
34
+
35
+ self.selfcheckgpt_type = os.environ.get("SELFCHECKGPTTYPE", "SelfCheckNLI")
36
+ self.selfcheckgpt_device = os.environ.get("SELFCHECKGPTDEVICE", DEVICE)
37
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
38
 
39
+ if self.selfcheckgpt_type == "SelfCheckNgram":
40
  self.selfcheckgpt = SelfCheckNgram(n=1)
41
+ elif self.selfcheckgpt_type == "SelfCheckBERTScore":
42
  self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
43
+ elif self.selfcheckgpt_type == "SelfCheckMQAG":
44
  self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
45
+ elif self.selfcheckgpt_type == "SelfCheckNLI":
46
  self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
47
  self.SelfCheckNLI_error_cnt = 0
48
 
 
59
  return self.dataset["evaluation"]
60
 
61
  def doc_to_text(self, doc):
62
+ if not hasattr(self, "selfcheckgpt_nlp"):
63
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
64
 
65
+ sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc["wiki_bio_text"]).sents]
66
  if len(sentences) < 2:
67
  raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
68
  # disscussed with Potsawee
 
71
  return doc_text
72
 
73
  def doc_to_target(self, doc):
74
+ answer = doc["wiki_bio_text"]
75
  return answer
76
 
77
  def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
78
  arguments = (ctx, self.generation_kwargs)
79
  request_list = [
80
+ Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
81
  ]
82
  sampling_arguments = (ctx, self.generation_kwargs_sampling)
83
+ request_list.extend(
84
+ [
85
+ Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
86
+ for idx in range(1, self.generation_kwargs_sampling_number + 1)
87
  ]
88
  )
89
  return request_list
 
95
 
96
  sentences = self.selfcheckgpt_nlp(response_temperature_0)
97
  sentences = [sent.text.strip() for sent in sentences.sents]
98
+ if self.selfcheckgpt_type == "SelfCheckNgram":
99
+ selfcheckgpt_scores = self.selfcheckgpt.predict(
100
+ sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses
101
+ )
102
  return {
103
+ "avg-selfcheckgpt": selfcheckgpt_scores["doc_level"]["avg_neg_logprob"],
104
+ "max-selfcheckgpt": selfcheckgpt_scores["doc_level"]["avg_max_neg_logprob"],
105
  }
106
 
107
+ elif self.selfcheckgpt_type == "SelfCheckBERTScore":
108
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
109
+ elif self.selfcheckgpt_type == "SelfCheckMQAG":
110
  selfcheckgpt_scores = self.selfcheckgpt.predict(
111
  sentences=sentences,
112
  passage=response_temperature_0,
113
  sampled_passages=other_responses,
114
+ num_questions_per_sent=5, # number of questions to be drawn
115
+ scoring_method="bayes_with_alpha", # options = 'counting', 'bayes', 'bayes_with_alpha'
116
+ beta1=0.8,
117
+ beta2=0.8,
118
+ ) # additional params depending on scoring_method
119
+ elif self.selfcheckgpt_type == "SelfCheckNLI":
120
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
121
 
122
  if len(selfcheckgpt_scores) < 2:
123
  # at least two sentences
124
  self.SelfCheckNLI_error_cnt += 1
125
+ result = {"avg-selfcheckgpt": 0.0, "max-selfcheckgpt": 0.0}
 
 
 
126
 
127
  else:
128
+ threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
129
 # the passage counts as hallucinated if any single sentence is hallucinated. It's very strict.
130
 selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
131
 # the passage counts as hallucinated if the average score over all sentences exceeds the threshold.
132
+ selfcheckgpt_scores_avg = (
133
+ 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
134
+ )
135
+ result = {"avg-selfcheckgpt": selfcheckgpt_scores_avg, "max-selfcheckgpt": selfcheckgpt_scores_max}
136
 
137
  return result
138
 
139
+ selfcheckgpt_scores_avg = (
140
+ sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
141
+ )
142
  selfcheckgpt_scores_max = max(selfcheckgpt_scores)
143
 
144
+ return {"avg-selfcheckgpt": selfcheckgpt_scores_avg, "max-selfcheckgpt": selfcheckgpt_scores_max}
145
 
146
  def aggregation(self):
147
  """
src/backend/tasks/xsum/task.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -18,8 +19,16 @@ def bleu(refs, preds):
18
  :param preds:
19
  A `list` of predicted `str`s.
20
  """
21
- score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False,
22
- lowercase=False, tokenize="intl", use_effective_order=False).score
23
  return score
24
 
25
 
@@ -58,7 +67,7 @@ class XSum(ConfigurableTask):
58
  DATASET_NAME = None
59
 
60
  def __init__(self):
61
- super().__init__(config={'metadata': {'version': self.VERSION}})
62
  self.factkb_tokenizer = None
63
  self.factkb_model = None
64
  self.bert_score = None
@@ -66,12 +75,18 @@ class XSum(ConfigurableTask):
66
  def maybe_init_factkb(self):
67
  if self.factkb_tokenizer is None or self.factkb_model is None:
68
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
69
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
70
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
71
 
72
  def maybe_init_bertscore(self):
73
  if self.bert_score is None:
74
  from evaluate import load
 
75
  self.bert_score = load("bertscore")
76
 
77
  def has_training_docs(self):
@@ -124,7 +139,7 @@ class XSum(ConfigurableTask):
124
  # arguments=(ctx, {"until": ["\n", "."]}),
125
  arguments=(ctx, {"until": ["\n"]}),
126
  idx=0,
127
- **kwargs
128
  )
129
  ]
130
 
@@ -150,12 +165,16 @@ class XSum(ConfigurableTask):
150
 
151
  self.maybe_init_factkb()
152
  input_factkb = [[completion, document]]
153
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
154
  factkb_logits = self.factkb_model(**factkb_tokens).logits
155
  factkb_res = torch.softmax(factkb_logits, dim=1)
156
 
157
  self.maybe_init_bertscore()
158
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
159
 
160
  res = {
161
  "rouge1": rouge1_scores[0],
@@ -177,7 +196,18 @@ class XSum(ConfigurableTask):
177
  A dictionary where keys are the names of submetrics and values are
178
  functions that aggregate a list of metrics
179
  """
180
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
181
 
182
  def higher_is_better(self):
183
  """
@@ -185,4 +215,15 @@ class XSum(ConfigurableTask):
185
  A dictionary where keys are the names of submetrics and values are
186
  whether a higher value of the submetric is better
187
  """
188
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
19
  :param preds:
20
  A `list` of predicted `str`s.
21
  """
22
+ score = sacrebleu.corpus_bleu(
23
+ preds,
24
+ refs,
25
+ smooth_method="exp",
26
+ smooth_value=0.0,
27
+ force=False,
28
+ lowercase=False,
29
+ tokenize="intl",
30
+ use_effective_order=False,
31
+ ).score
32
  return score
33
 
34
 
 
67
  DATASET_NAME = None
68
 
69
  def __init__(self):
70
+ super().__init__(config={"metadata": {"version": self.VERSION}})
71
  self.factkb_tokenizer = None
72
  self.factkb_model = None
73
  self.bert_score = None
 
75
  def maybe_init_factkb(self):
76
  if self.factkb_tokenizer is None or self.factkb_model is None:
77
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
78
+
79
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
80
+ "roberta-base", padding="max_length", truncation=True
81
+ )
82
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
83
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
84
+ )
85
 
86
  def maybe_init_bertscore(self):
87
  if self.bert_score is None:
88
  from evaluate import load
89
+
90
  self.bert_score = load("bertscore")
91
 
92
  def has_training_docs(self):
 
139
  # arguments=(ctx, {"until": ["\n", "."]}),
140
  arguments=(ctx, {"until": ["\n"]}),
141
  idx=0,
142
+ **kwargs,
143
  )
144
  ]
145
 
 
165
 
166
  self.maybe_init_factkb()
167
  input_factkb = [[completion, document]]
168
+ factkb_tokens = self.factkb_tokenizer(
169
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
170
+ ).to(self.factkb_model.device)
171
  factkb_logits = self.factkb_model(**factkb_tokens).logits
172
  factkb_res = torch.softmax(factkb_logits, dim=1)
173
 
174
  self.maybe_init_bertscore()
175
+ bert_score_res = self.bert_score.compute(
176
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
177
+ )
178
 
179
  res = {
180
  "rouge1": rouge1_scores[0],
 
196
  A dictionary where keys are the names of submetrics and values are
197
  functions that aggregate a list of metrics
198
  """
199
+ return {
200
+ k: mean
201
+ for k in [
202
+ "rouge1",
203
+ "rouge2",
204
+ "rougeL",
205
+ "factKB",
206
+ "bertscore_precision",
207
+ "bertscore_recall",
208
+ "bertscore_f1",
209
+ ]
210
+ }
211
 
212
  def higher_is_better(self):
213
  """
 
215
  A dictionary where keys are the names of submetrics and values are
216
  whether a higher value of the submetric is better
217
  """
218
+ return {
219
+ k: True
220
+ for k in [
221
+ "rouge1",
222
+ "rouge2",
223
+ "rougeL",
224
+ "factKB",
225
+ "bertscore_precision",
226
+ "bertscore_recall",
227
+ "bertscore_f1",
228
+ ]
229
+ }
src/backend/tasks/xsum/task_v2.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -18,8 +19,16 @@ def bleu(refs, preds):
18
  :param preds:
19
  A `list` of predicted `str`s.
20
  """
21
- score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False,
22
- lowercase=False, tokenize="intl", use_effective_order=False).score
23
  return score
24
 
25
 
@@ -59,8 +68,12 @@ class XSumv2(ConfigurableTask):
59
 
60
  def __init__(self):
61
  # breakpoint()
62
- super().__init__(config={'metadata': {'version': self.VERSION},
63
- 'generation_kwargs': {'do_sample': False, 'temperature': 0.0, 'until': ['\n', '\n\n']}})
64
  self.factkb_tokenizer = None
65
  self.factkb_model = None
66
  self.bert_score = None
@@ -68,12 +81,18 @@ class XSumv2(ConfigurableTask):
68
  def maybe_init_factkb(self):
69
  if self.factkb_tokenizer is None or self.factkb_model is None:
70
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
71
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
72
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
73
 
74
  def maybe_init_bertscore(self):
75
  if self.bert_score is None:
76
  from evaluate import load
 
77
  self.bert_score = load("bertscore")
78
 
79
  def has_training_docs(self):
@@ -129,7 +148,7 @@ class XSumv2(ConfigurableTask):
129
  # arguments=(ctx, {"until": ["\n", "."]}),
130
  arguments=(ctx, {"until": ["\n"]}),
131
  idx=0,
132
- **kwargs
133
  )
134
  ]
135
 
@@ -155,12 +174,16 @@ class XSumv2(ConfigurableTask):
155
 
156
  self.maybe_init_factkb()
157
  input_factkb = [[completion, document]]
158
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
159
  factkb_logits = self.factkb_model(**factkb_tokens).logits
160
  factkb_res = torch.softmax(factkb_logits, dim=1)
161
 
162
  self.maybe_init_bertscore()
163
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
164
 
165
  res = {
166
  "rouge1": rouge1_scores[0],
@@ -182,7 +205,18 @@ class XSumv2(ConfigurableTask):
182
  A dictionary where keys are the names of submetrics and values are
183
  functions that aggregate a list of metrics
184
  """
185
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
186
 
187
  def higher_is_better(self):
188
  """
@@ -190,4 +224,15 @@ class XSumv2(ConfigurableTask):
190
  A dictionary where keys are the names of submetrics and values are
191
  whether a higher value of the submetric is better
192
  """
193
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
19
  :param preds:
20
  A `list` of predicted `str`s.
21
  """
22
+ score = sacrebleu.corpus_bleu(
23
+ preds,
24
+ refs,
25
+ smooth_method="exp",
26
+ smooth_value=0.0,
27
+ force=False,
28
+ lowercase=False,
29
+ tokenize="intl",
30
+ use_effective_order=False,
31
+ ).score
32
  return score
33
 
34
 
 
68
 
69
  def __init__(self):
70
  # breakpoint()
71
+ super().__init__(
72
+ config={
73
+ "metadata": {"version": self.VERSION},
74
+ "generation_kwargs": {"do_sample": False, "temperature": 0.0, "until": ["\n", "\n\n"]},
75
+ }
76
+ )
77
  self.factkb_tokenizer = None
78
  self.factkb_model = None
79
  self.bert_score = None
 
81
  def maybe_init_factkb(self):
82
  if self.factkb_tokenizer is None or self.factkb_model is None:
83
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
84
+
85
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
86
+ "roberta-base", padding="max_length", truncation=True
87
+ )
88
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
89
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
90
+ )
91
 
92
  def maybe_init_bertscore(self):
93
  if self.bert_score is None:
94
  from evaluate import load
95
+
96
  self.bert_score = load("bertscore")
97
 
98
  def has_training_docs(self):
 
148
  # arguments=(ctx, {"until": ["\n", "."]}),
149
  arguments=(ctx, {"until": ["\n"]}),
150
  idx=0,
151
+ **kwargs,
152
  )
153
  ]
154
 
 
174
 
175
  self.maybe_init_factkb()
176
  input_factkb = [[completion, document]]
177
+ factkb_tokens = self.factkb_tokenizer(
178
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
179
+ ).to(self.factkb_model.device)
180
  factkb_logits = self.factkb_model(**factkb_tokens).logits
181
  factkb_res = torch.softmax(factkb_logits, dim=1)
182
 
183
  self.maybe_init_bertscore()
184
+ bert_score_res = self.bert_score.compute(
185
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
186
+ )
187
 
188
  res = {
189
  "rouge1": rouge1_scores[0],
 
205
  A dictionary where keys are the names of submetrics and values are
206
  functions that aggregate a list of metrics
207
  """
208
+ return {
209
+ k: mean
210
+ for k in [
211
+ "rouge1",
212
+ "rouge2",
213
+ "rougeL",
214
+ "factKB",
215
+ "bertscore_precision",
216
+ "bertscore_recall",
217
+ "bertscore_f1",
218
+ ]
219
+ }
220
 
221
  def higher_is_better(self):
222
  """
 
224
  A dictionary where keys are the names of submetrics and values are
225
  whether a higher value of the submetric is better
226
  """
227
+ return {
228
+ k: True
229
+ for k in [
230
+ "rouge1",
231
+ "rouge2",
232
+ "rougeL",
233
+ "factKB",
234
+ "bertscore_precision",
235
+ "bertscore_recall",
236
+ "bertscore_f1",
237
+ ]
238
+ }
src/browse.py CHANGED
@@ -32,6 +32,7 @@ import socket
32
  import subprocess
33
  import sys
34
  import webbrowser
 
35
  if sys.version_info >= (3, 2):
36
  from html import escape
37
  else:
@@ -42,7 +43,7 @@ except ImportError:
42
  from urllib2 import unquote
43
  from collections import namedtuple
44
 
45
- Node = namedtuple('Node', ['inputs', 'rule', 'target', 'outputs'])
46
 
47
  # Ideally we'd allow you to navigate to a build edge or a build node,
48
  # with appropriate views for each. But there's no way to *name* a build
@@ -57,16 +58,19 @@ Node = namedtuple('Node', ['inputs', 'rule', 'target', 'outputs'])
57
  # This means there's no single view that shows you all inputs and outputs
58
  # of an edge. But I think it's less confusing than alternatives.
59
 
 
60
  def match_strip(line, prefix):
61
  if not line.startswith(prefix):
62
  return (False, line)
63
- return (True, line[len(prefix):])
 
65
  def html_escape(text):
66
  return escape(text, quote=True)
67
 
 
68
  def parse(text):
69
- lines = iter(text.split('\n'))
70
 
71
  target = None
72
  rule = None
@@ -77,33 +81,35 @@ def parse(text):
77
  target = next(lines)[:-1] # strip trailing colon
78
 
79
  line = next(lines)
80
- (match, rule) = match_strip(line, ' input: ')
81
  if match:
82
- (match, line) = match_strip(next(lines), ' ')
83
  while match:
84
  type = None
85
- (match, line) = match_strip(line, '| ')
86
  if match:
87
- type = 'implicit'
88
- (match, line) = match_strip(line, '|| ')
89
  if match:
90
- type = 'order-only'
91
  inputs.append((line, type))
92
- (match, line) = match_strip(next(lines), ' ')
93
 
94
- match, _ = match_strip(line, ' outputs:')
95
  if match:
96
- (match, line) = match_strip(next(lines), ' ')
97
  while match:
98
  outputs.append(line)
99
- (match, line) = match_strip(next(lines), ' ')
100
  except StopIteration:
101
  pass
102
 
103
  return Node(inputs, rule, target, outputs)
104
 
 
105
  def create_page(body):
106
- return '''<!DOCTYPE html>
  <style>
108
  body {
109
  font-family: sans;
@@ -128,52 +134,55 @@ tt {
128
  -webkit-columns: auto 2;
129
  }
130
  </style>
131
- ''' + body
132
 
133
  def generate_html(node):
134
- document = ['<h1><tt>%s</tt></h1>' % html_escape(node.target)]
135
 
136
  if node.inputs:
137
- document.append('<h2>target is built using rule <tt>%s</tt> of</h2>' %
138
- html_escape(node.rule))
139
  if len(node.inputs) > 0:
140
- document.append('<div class=filelist>')
141
  for input, type in sorted(node.inputs):
142
- extra = ''
143
  if type:
144
- extra = ' (%s)' % html_escape(type)
145
- document.append('<tt><a href="?%s">%s</a>%s</tt><br>' %
146
- (html_escape(input), html_escape(input), extra))
147
- document.append('</div>')
 
148
 
149
  if node.outputs:
150
- document.append('<h2>dependent edges build:</h2>')
151
- document.append('<div class=filelist>')
152
  for output in sorted(node.outputs):
153
- document.append('<tt><a href="?%s">%s</a></tt><br>' %
154
- (html_escape(output), html_escape(output)))
155
- document.append('</div>')
 
157
- return '\n'.join(document)
158
 
159
  def ninja_dump(target):
160
- cmd = [args.ninja_command, '-f', args.f, '-t', 'query', target]
161
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
162
- universal_newlines=True)
163
  return proc.communicate() + (proc.returncode,)
164
 
 
165
  class RequestHandler(httpserver.BaseHTTPRequestHandler):
166
  def do_GET(self):
167
- assert self.path[0] == '/'
168
  target = unquote(self.path[1:])
169
 
170
- if target == '':
171
  self.send_response(302)
172
- self.send_header('Location', '?' + args.initial_target)
173
  self.end_headers()
174
  return
175
 
176
- if not target.startswith('?'):
177
  self.send_response(404)
178
  self.end_headers()
179
  return
@@ -184,48 +193,45 @@ class RequestHandler(httpserver.BaseHTTPRequestHandler):
184
  page_body = generate_html(parse(ninja_output.strip()))
185
  else:
186
  # Relay ninja's error message.
187
- page_body = '<h1><tt>%s</tt></h1>' % html_escape(ninja_error)
188
 
189
  self.send_response(200)
190
  self.end_headers()
191
- self.wfile.write(create_page(page_body).encode('utf-8'))
192
 
193
  def log_message(self, format, *args):
194
  pass # Swallow console spam.
195
 
196
- parser = argparse.ArgumentParser(prog='ninja -t browse')
197
- parser.add_argument('--port', '-p', default=8000, type=int,
198
- help='Port number to use (default %(default)d)')
199
- parser.add_argument('--hostname', '-a', default='localhost', type=str,
200
- help='Hostname to bind to (default %(default)s)')
201
- parser.add_argument('--no-browser', action='store_true',
202
- help='Do not open a webbrowser on startup.')
203
-
204
- parser.add_argument('--ninja-command', default='ninja',
205
- help='Path to ninja binary (default %(default)s)')
206
- parser.add_argument('-f', default='build.ninja',
207
- help='Path to build.ninja file (default %(default)s)')
208
- parser.add_argument('initial_target', default='all', nargs='?',
209
- help='Initial target to show (default %(default)s)')
210
 
211
  class HTTPServer(socketserver.ThreadingMixIn, httpserver.HTTPServer):
212
  # terminate server immediately when Python exits.
213
  daemon_threads = True
214
 
 
215
  args = parser.parse_args()
216
  port = args.port
217
  hostname = args.hostname
218
- httpd = HTTPServer((hostname,port), RequestHandler)
219
  try:
220
  if hostname == "":
221
  hostname = socket.gethostname()
222
- print('Web server running on %s:%d, ctl-C to abort...' % (hostname,port) )
223
- print('Web server pid %d' % os.getpid(), file=sys.stderr )
224
  if not args.no_browser:
225
- webbrowser.open_new('http://%s:%s' % (hostname, port) )
226
  httpd.serve_forever()
227
  except KeyboardInterrupt:
228
  print()
229
  pass # Swallow console spam.
230
-
231
-
 
32
  import subprocess
33
  import sys
34
  import webbrowser
35
+
36
  if sys.version_info >= (3, 2):
37
  from html import escape
38
  else:
 
43
  from urllib2 import unquote
44
  from collections import namedtuple
45
 
46
+ Node = namedtuple("Node", ["inputs", "rule", "target", "outputs"])
47
 
48
  # Ideally we'd allow you to navigate to a build edge or a build node,
49
  # with appropriate views for each. But there's no way to *name* a build
 
58
  # This means there's no single view that shows you all inputs and outputs
59
  # of an edge. But I think it's less confusing than alternatives.
60
 
61
+
62
  def match_strip(line, prefix):
63
  if not line.startswith(prefix):
64
  return (False, line)
65
+ return (True, line[len(prefix) :])
66
+
67
 
68
  def html_escape(text):
69
  return escape(text, quote=True)
70
 
71
+
72
  def parse(text):
73
+ lines = iter(text.split("\n"))
74
 
75
  target = None
76
  rule = None
 
81
  target = next(lines)[:-1] # strip trailing colon
82
 
83
  line = next(lines)
84
+ (match, rule) = match_strip(line, " input: ")
85
  if match:
86
+ (match, line) = match_strip(next(lines), " ")
87
  while match:
88
  type = None
89
+ (match, line) = match_strip(line, "| ")
90
  if match:
91
+ type = "implicit"
92
+ (match, line) = match_strip(line, "|| ")
93
  if match:
94
+ type = "order-only"
95
  inputs.append((line, type))
96
+ (match, line) = match_strip(next(lines), " ")
97
 
98
+ match, _ = match_strip(line, " outputs:")
99
  if match:
100
+ (match, line) = match_strip(next(lines), " ")
101
  while match:
102
  outputs.append(line)
103
+ (match, line) = match_strip(next(lines), " ")
104
  except StopIteration:
105
  pass
106
 
107
  return Node(inputs, rule, target, outputs)
108
 
109
+
110
  def create_page(body):
111
+ return (
112
+ """<!DOCTYPE html>
113
  <style>
114
  body {
115
  font-family: sans;
 
134
  -webkit-columns: auto 2;
135
  }
136
  </style>
137
+ """
138
+ + body
139
+ )
140
+
141
 
142
  def generate_html(node):
143
+ document = ["<h1><tt>%s</tt></h1>" % html_escape(node.target)]
144
 
145
  if node.inputs:
146
+ document.append("<h2>target is built using rule <tt>%s</tt> of</h2>" % html_escape(node.rule))
 
147
  if len(node.inputs) > 0:
148
+ document.append("<div class=filelist>")
149
  for input, type in sorted(node.inputs):
150
+ extra = ""
151
  if type:
152
+ extra = " (%s)" % html_escape(type)
153
+ document.append(
154
+ '<tt><a href="?%s">%s</a>%s</tt><br>' % (html_escape(input), html_escape(input), extra)
155
+ )
156
+ document.append("</div>")
157
 
158
  if node.outputs:
159
+ document.append("<h2>dependent edges build:</h2>")
160
+ document.append("<div class=filelist>")
161
  for output in sorted(node.outputs):
162
+ document.append('<tt><a href="?%s">%s</a></tt><br>' % (html_escape(output), html_escape(output)))
163
+ document.append("</div>")
164
+
165
+ return "\n".join(document)
166
 
 
167
 
168
  def ninja_dump(target):
169
+ cmd = [args.ninja_command, "-f", args.f, "-t", "query", target]
170
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
 
171
  return proc.communicate() + (proc.returncode,)
172
 
173
+
174
  class RequestHandler(httpserver.BaseHTTPRequestHandler):
175
  def do_GET(self):
176
+ assert self.path[0] == "/"
177
  target = unquote(self.path[1:])
178
 
179
+ if target == "":
180
  self.send_response(302)
181
+ self.send_header("Location", "?" + args.initial_target)
182
  self.end_headers()
183
  return
184
 
185
+ if not target.startswith("?"):
186
  self.send_response(404)
187
  self.end_headers()
188
  return
 
193
  page_body = generate_html(parse(ninja_output.strip()))
194
  else:
195
  # Relay ninja's error message.
196
+ page_body = "<h1><tt>%s</tt></h1>" % html_escape(ninja_error)
197
 
198
  self.send_response(200)
199
  self.end_headers()
200
+ self.wfile.write(create_page(page_body).encode("utf-8"))
201
 
202
  def log_message(self, format, *args):
203
  pass # Swallow console spam.
204
 
205
+
206
+ parser = argparse.ArgumentParser(prog="ninja -t browse")
207
+ parser.add_argument("--port", "-p", default=8000, type=int, help="Port number to use (default %(default)d)")
208
+ parser.add_argument(
209
+ "--hostname", "-a", default="localhost", type=str, help="Hostname to bind to (default %(default)s)"
210
+ )
211
+ parser.add_argument("--no-browser", action="store_true", help="Do not open a webbrowser on startup.")
212
+
213
+ parser.add_argument("--ninja-command", default="ninja", help="Path to ninja binary (default %(default)s)")
214
+ parser.add_argument("-f", default="build.ninja", help="Path to build.ninja file (default %(default)s)")
215
+ parser.add_argument("initial_target", default="all", nargs="?", help="Initial target to show (default %(default)s)")
216
+
 
 
217
 
218
  class HTTPServer(socketserver.ThreadingMixIn, httpserver.HTTPServer):
219
  # terminate server immediately when Python exits.
220
  daemon_threads = True
221
 
222
+
223
  args = parser.parse_args()
224
  port = args.port
225
  hostname = args.hostname
226
+ httpd = HTTPServer((hostname, port), RequestHandler)
227
  try:
228
  if hostname == "":
229
  hostname = socket.gethostname()
230
+ print("Web server running on %s:%d, ctl-C to abort..." % (hostname, port))
231
+ print("Web server pid %d" % os.getpid(), file=sys.stderr)
232
  if not args.no_browser:
233
+ webbrowser.open_new("http://%s:%s" % (hostname, port))
234
  httpd.serve_forever()
235
  except KeyboardInterrupt:
236
  print()
237
  pass # Swallow console spam.
 
 
src/display/utils.py CHANGED
@@ -61,6 +61,7 @@ class ColumnContent:
61
  never_hidden: bool = False
62
  dummy: bool = False
63
 
 
64
  auto_eval_column_dict = []
65
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
66
  auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
 
61
  never_hidden: bool = False
62
  dummy: bool = False
63
 
64
+
65
  auto_eval_column_dict = []
66
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
67
  auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
src/leaderboard/filter_models.py CHANGED
@@ -29,9 +29,9 @@ def flag_models(leaderboard_data: list[dict]):
29
  FLAGGED_MODELS[model_data["model_name_for_query"]],
30
  f"See discussion #{issue_num}",
31
  )
32
- model_data[
33
- AutoEvalColumn.model.name
34
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
35
 
36
 
37
  def remove_forbidden_models(leaderboard_data: list[dict]):
 
29
  FLAGGED_MODELS[model_data["model_name_for_query"]],
30
  f"See discussion #{issue_num}",
31
  )
32
+ model_data[AutoEvalColumn.model.name] = (
33
+ f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
34
+ )
35
 
36
 
37
  def remove_forbidden_models(leaderboard_data: list[dict]):
src/leaderboard/read_evals.py CHANGED
@@ -5,6 +5,7 @@ from tqdm import tqdm
5
  from dataclasses import dataclass
6
 
7
  import dateutil
 
8
  # import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
@@ -32,13 +33,13 @@ class EvalResult:
32
  revision: str # commit hash, "" if main
33
  results: dict
34
  precision: Precision = Precision.Unknown
35
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
36
- weight_type: WeightType = WeightType.Original # Original or Adapter
37
- architecture: str = "Unknown" # From config file
38
  license: str = "?"
39
  likes: int = 0
40
  num_params: int = 0
41
- date: str = "" # submission date of request file
42
  still_on_hub: bool = False
43
 
44
  @staticmethod
@@ -67,7 +68,9 @@ class EvalResult:
67
  result_key = f"{org}_{model}_{precision.value.name}"
68
  full_model = "/".join(org_and_model)
69
 
70
- still_on_hub, error, model_config = is_model_on_hub(full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False)
71
  architecture = "?"
72
  if model_config is not None:
73
  architectures = getattr(model_config, "architectures", None)
@@ -79,35 +82,43 @@ class EvalResult:
79
  # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
80
 
81
  results = {}
82
- for benchmark, benchmark_results in data['results'].items():
83
  if benchmark not in results:
84
  results[benchmark] = {}
85
 
86
  for metric, value in benchmark_results.items():
87
  to_add = True
88
- if '_stderr' in metric:
89
  to_add = False
90
- if 'alias' in metric:
91
  to_add = False
92
 
93
- if ',' in metric:
94
- metric = metric.split(',')[0]
95
  metric = metric.replace("exact_match", "em")
96
 
97
  if to_add is True:
98
  multiplier = 100.0
99
- if 'rouge' in metric and 'truthful' not in benchmark:
100
  multiplier = 1.0
101
- if 'squad' in benchmark:
102
  multiplier = 1.0
103
 
104
  # print('RESULTS', data['results'])
105
  # print('XXX', benchmark, metric, value, multiplier)
106
  results[benchmark][metric] = value * multiplier
107
 
108
- res = EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
109
- precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
110
- architecture=architecture)
111
 
112
  return res
113
 
@@ -183,6 +194,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
183
  request_file = tmp_request_file
184
  return request_file
185
 
 
186
  def get_request_file_for_model_open_llm(requests_path, model_name, precision):
187
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
188
  request_files = os.path.join(
@@ -197,16 +209,16 @@ def get_request_file_for_model_open_llm(requests_path, model_name, precision):
197
  for tmp_request_file in request_files:
198
  with open(tmp_request_file, "r") as f:
199
  req_content = json.load(f)
200
- if (
201
- req_content["status"] in ["FINISHED"]
202
- and req_content["precision"] == precision.split(".")[-1]
203
- ):
204
  request_file = tmp_request_file
205
  return request_file
206
 
 
207
  def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
208
  """Finds the relevant request file for the current model and updates info with it"""
209
- request_file = get_request_file_for_model_open_llm(open_llm_requests_path, result.full_model, result.precision.value.name)
210
 
211
  if request_file:
212
  try:
@@ -219,9 +231,8 @@ def update_model_type_with_open_llm_request_file(result, open_llm_requests_path)
219
  pass
220
  return result
221
 
222
- def get_raw_eval_results(results_path: str,
223
- requests_path: str,
224
- is_backend: bool = False) -> list[EvalResult]:
225
  """From the path of the results folder root, extract all needed info for results"""
226
  model_result_filepaths = []
227
 
 
5
  from dataclasses import dataclass
6
 
7
  import dateutil
8
+
9
  # import numpy as np
10
 
11
  from src.display.formatting import make_clickable_model
 
33
  revision: str # commit hash, "" if main
34
  results: dict
35
  precision: Precision = Precision.Unknown
36
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
37
+ weight_type: WeightType = WeightType.Original # Original or Adapter
38
+ architecture: str = "Unknown" # From config file
39
  license: str = "?"
40
  likes: int = 0
41
  num_params: int = 0
42
+ date: str = "" # submission date of request file
43
  still_on_hub: bool = False
44
 
45
  @staticmethod
 
68
  result_key = f"{org}_{model}_{precision.value.name}"
69
  full_model = "/".join(org_and_model)
70
 
71
+ still_on_hub, error, model_config = is_model_on_hub(
72
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
73
+ )
74
  architecture = "?"
75
  if model_config is not None:
76
  architectures = getattr(model_config, "architectures", None)
 
82
  # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
83
 
84
  results = {}
85
+ for benchmark, benchmark_results in data["results"].items():
86
  if benchmark not in results:
87
  results[benchmark] = {}
88
 
89
  for metric, value in benchmark_results.items():
90
  to_add = True
91
+ if "_stderr" in metric:
92
  to_add = False
93
+ if "alias" in metric:
94
  to_add = False
95
 
96
+ if "," in metric:
97
+ metric = metric.split(",")[0]
98
  metric = metric.replace("exact_match", "em")
99
 
100
  if to_add is True:
101
  multiplier = 100.0
102
+ if "rouge" in metric and "truthful" not in benchmark:
103
  multiplier = 1.0
104
+ if "squad" in benchmark:
105
  multiplier = 1.0
106
 
107
  # print('RESULTS', data['results'])
108
  # print('XXX', benchmark, metric, value, multiplier)
109
  results[benchmark][metric] = value * multiplier
110
 
111
+ res = EvalResult(
112
+ eval_name=result_key,
113
+ full_model=full_model,
114
+ org=org,
115
+ model=model,
116
+ results=results,
117
+ precision=precision,
118
+ revision=config.get("model_sha", ""),
119
+ still_on_hub=still_on_hub,
120
+ architecture=architecture,
121
+ )
122
 
123
  return res
124
 
 
194
  request_file = tmp_request_file
195
  return request_file
196
 
197
+
198
  def get_request_file_for_model_open_llm(requests_path, model_name, precision):
199
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
200
  request_files = os.path.join(
 
209
  for tmp_request_file in request_files:
210
  with open(tmp_request_file, "r") as f:
211
  req_content = json.load(f)
212
+ if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
213
  request_file = tmp_request_file
214
  return request_file
215
 
216
+
217
  def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
218
  """Finds the relevant request file for the current model and updates info with it"""
219
+ request_file = get_request_file_for_model_open_llm(
220
+ open_llm_requests_path, result.full_model, result.precision.value.name
221
+ )
222
 
223
  if request_file:
224
  try:
 
231
  pass
232
  return result
233
 
234
+
235
+ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
 
236
  """From the path of the results folder root, extract all needed info for results"""
237
  model_result_filepaths = []
238
 
src/populate.py CHANGED
@@ -13,17 +13,21 @@ from src.backend.envs import Tasks as BackendTasks
13
  from src.display.utils import Tasks
14
 
15
 
16
- def get_leaderboard_df(results_path: str,
17
- requests_path: str,
18
- requests_path_open_llm: str,
19
- cols: list,
20
- benchmark_cols: list,
21
- is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
22
  # Returns a list of EvalResult
23
  raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
24
  if requests_path_open_llm != "":
25
  for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
26
- raw_data[result_idx] = update_model_type_with_open_llm_request_file(raw_data[result_idx], requests_path_open_llm)
27
 
28
  all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
29
 
 
13
  from src.display.utils import Tasks
14
 
15
 
16
+ def get_leaderboard_df(
17
+ results_path: str,
18
+ requests_path: str,
19
+ requests_path_open_llm: str,
20
+ cols: list,
21
+ benchmark_cols: list,
22
+ is_backend: bool = False,
23
+ ) -> tuple[list[EvalResult], pd.DataFrame]:
24
  # Returns a list of EvalResult
25
  raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
26
  if requests_path_open_llm != "":
27
  for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
28
+ raw_data[result_idx] = update_model_type_with_open_llm_request_file(
29
+ raw_data[result_idx], requests_path_open_llm
30
+ )
31
 
32
  all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
33
 
src/submission/check_validity.py CHANGED
@@ -40,20 +40,34 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
40
  return True, ""
41
 
42
 
43
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, Optional[str], Optional[AutoConfig]]:
44
  try:
45
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
46
  if test_tokenizer:
47
  try:
48
- AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
49
  except ValueError as e:
50
  return False, f"uses a tokenizer which is not in a transformers release: {e}", None
51
  except Exception as e:
52
- return False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None
 
 
 
 
53
  return True, None, config
54
 
55
  except ValueError as e:
56
- return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.", None
 
 
 
 
57
 
58
  except Exception as e:
59
  return False, f"was not found on hub -- {str(e)}", None
@@ -63,7 +77,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
63
 size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
64
  try:
65
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError ):
67
  try:
68
  size_match = re.search(size_pattern, model_info.modelId.lower())
69
  model_size = size_match.group(0)
@@ -75,9 +89,11 @@ def get_model_size(model_info: ModelInfo, precision: str):
75
  model_size = size_factor * model_size
76
  return model_size
77
 
 
78
  def get_model_arch(model_info: ModelInfo):
79
  return model_info.config.get("architectures", "Unknown")
80
 
 
81
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
82
  if org_or_user not in users_to_submission_dates:
83
  return True, ""
 
40
  return True, ""
41
 
42
 
43
+ def is_model_on_hub(
44
+ model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
45
+ ) -> tuple[bool, Optional[str], Optional[AutoConfig]]:
46
  try:
47
+ config = AutoConfig.from_pretrained(
48
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
49
+ )
50
  if test_tokenizer:
51
  try:
52
+ AutoTokenizer.from_pretrained(
53
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
+ )
55
  except ValueError as e:
56
  return False, f"uses a tokenizer which is not in a transformers release: {e}", None
57
  except Exception as e:
58
+ return (
59
+ False,
60
+ "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
61
+ None,
62
+ )
63
  return True, None, config
64
 
65
  except ValueError as e:
66
+ return (
67
+ False,
68
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
69
+ None,
70
+ )
71
 
72
  except Exception as e:
73
  return False, f"was not found on hub -- {str(e)}", None
 
77
 size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
78
  try:
79
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
80
+ except (AttributeError, TypeError):
81
  try:
82
  size_match = re.search(size_pattern, model_info.modelId.lower())
83
  model_size = size_match.group(0)
 
89
  model_size = size_factor * model_size
90
  return model_size
91
 
92
+
93
  def get_model_arch(model_info: ModelInfo):
94
  return model_info.config.get("architectures", "Unknown")
95
 
96
+
97
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
98
  if org_or_user not in users_to_submission_dates:
99
  return True, ""
src/submission/submit.py CHANGED
@@ -61,7 +61,9 @@ def add_new_eval(
61
 
62
  # Is the model on the hub?
63
  if weight_type in ["Delta", "Adapter"]:
64
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False)
65
  if not base_model_on_hub:
66
  return styled_error(f'Base model "{base_model}" {error}')
67
 
 
61
 
62
  # Is the model on the hub?
63
  if weight_type in ["Delta", "Adapter"]:
64
+ base_model_on_hub, error, _ = is_model_on_hub(
65
+ model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False
66
+ )
67
  if not base_model_on_hub:
68
  return styled_error(f'Base model "{base_model}" {error}')
69
 
src/utils.py CHANGED
@@ -5,18 +5,21 @@ from huggingface_hub import snapshot_download
5
  def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
6
  for i in range(10):
7
  try:
8
- snapshot_download(repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers)
9
  return
10
  except Exception as e:
11
  print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
12
  import time
 
13
  time.sleep(60)
14
  return
15
 
16
 
17
  def get_dataset_url(row):
18
- dataset_name = row['Benchmark']
19
- dataset_url = row['Dataset Link']
20
  benchmark = f'<a target="_blank" href="{dataset_url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{dataset_name}</a>'
21
  return benchmark
22
 
@@ -24,8 +27,8 @@ def get_dataset_url(row):
24
  def get_dataset_summary_table(file_path):
25
  df = pd.read_csv(file_path)
26
 
27
- df['Benchmark'] = df.apply(lambda x: get_dataset_url(x), axis=1)
28
 
29
- df = df[['Category', 'Benchmark', 'Data Split', 'Data Size', 'Language']]
30
 
31
  return df
 
5
  def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
6
  for i in range(10):
7
  try:
8
+ snapshot_download(
9
+ repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers
10
+ )
11
  return
12
  except Exception as e:
13
  print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
14
  import time
15
+
16
  time.sleep(60)
17
  return
18
 
19
 
20
  def get_dataset_url(row):
21
+ dataset_name = row["Benchmark"]
22
+ dataset_url = row["Dataset Link"]
23
  benchmark = f'<a target="_blank" href="{dataset_url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{dataset_name}</a>'
24
  return benchmark
25
 
 
27
  def get_dataset_summary_table(file_path):
28
  df = pd.read_csv(file_path)
29
 
30
+ df["Benchmark"] = df.apply(lambda x: get_dataset_url(x), axis=1)
31
 
32
+ df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
33
 
34
  return df