Commit 7c5b405
Parent(s): eaf281b

include_path debugged

Files changed:
- .gitignore +5 -1
- backend-cli.py +20 -0
- manage_repos.ipynb +2 -2
- requirements.txt +1 -1
- src/__pycache__/populate.cpython-310.pyc +0 -0
- src/backend/__pycache__/envs.cpython-310.pyc +0 -0
- src/backend/__pycache__/manage_requests.cpython-310.pyc +0 -0
- src/backend/__pycache__/run_eval_suite.cpython-310.pyc +0 -0
- src/backend/envs.py +11 -4
- src/backend/manage_requests.py +6 -1
- src/backend/run_eval_suite.py +28 -4
- src/display/__pycache__/utils.cpython-310.pyc +0 -0
- src/display/utils.py +7 -9
- src/populate.py +21 -7
.gitignore
CHANGED
@@ -6,4 +6,8 @@ eval-results-bk/
 eval-queue-bk/
 
 src/backend/tasks/
-
+src/backend/probing_tasks/
+hub/
+offload/
+
+token
backend-cli.py
CHANGED
@@ -23,6 +23,26 @@ import time
 
 import logging
 import pprint
+import argparse
+
+
+# def get_subdirectories(path):
+#     subdirectories = []
+#     # Get all entries in the directory
+#     entries = os.listdir(path)
+#     for entry in entries:
+#         # Check if the entry is a directory
+#         if os.path.isdir(os.path.join(path, entry)):
+#             subdirectories.append(entry)
+#     return subdirectories
+
+# parser = argparse.ArgumentParser(description="Get subdirectory names")
+# parser.add_argument("include_path", help="Path to the directory", nargs='?', default=None)
+# args = parser.parse_args()
+
+# # = get_subdirectories(args.include_path)
+
+
 
 
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
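For reference, a runnable version of the argparse scaffolding that the commented-out block above sketches. The include_path argument name comes from the diff; the __main__ guard and the final print are illustrative additions, not part of the commit.

import argparse
import os


def get_subdirectories(path):
    """Return the names of the immediate subdirectories of `path`."""
    subdirectories = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirectories.append(entry)
    return subdirectories


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get subdirectory names")
    # Optional positional argument, matching the commented-out sketch above.
    parser.add_argument("include_path", help="Path to the directory", nargs="?", default=None)
    args = parser.parse_args()

    if args.include_path is not None:
        print(get_subdirectories(args.include_path))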
manage_repos.ipynb
CHANGED
@@ -18,7 +18,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/
+      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/huggingface_home_cache\n"
      ]
     },
     {
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
+    "res = API.delete_folder(path_in_repo='EleutherAI/pythia-70m_biolama_umls_eval_request_False_float32_Original.json', repo_id=QUEUE_REPO, repo_type='dataset')\n",
    "# res = API.delete_folder(path_in_repo='mistralai/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
    "\n",
    "# res = API.delete_file(path_in_repo=\"EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json\", repo_id=QUEUE_REPO, repo_type='dataset')\n"
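The notebook cell above uses huggingface_hub's HfApi to prune stale entries from the request queue. A standalone sketch of the same calls; the repo id is a placeholder (the real QUEUE_REPO comes from src/envs.py), and the delete calls are destructive, so they are left commented out.

from huggingface_hub import HfApi

API = HfApi()  # assumes a write-access token is already configured (e.g. via `huggingface-cli login`)

QUEUE_REPO = "some-org/requests"  # placeholder for the Space's actual queue dataset

# Delete a single stale eval-request file from the queue dataset:
# API.delete_file(
#     path_in_repo="EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json",
#     repo_id=QUEUE_REPO,
#     repo_type="dataset",
# )

# Or delete a whole folder of requests at once:
# API.delete_folder(path_in_repo="EleutherAI/", repo_id=QUEUE_REPO, repo_type="dataset")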
requirements.txt
CHANGED
@@ -17,7 +17,7 @@ semantic-version
 tqdm
 transformers>=4.36.0,<4.37.0
 tokenizers>=0.15.0
-lm_eval
+lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
 accelerate
 sentencepiece
 langdetect
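The Git pin matters because the backend below relies on the harness's newer TaskManager-based API. A quick sanity check for the installed version, assuming the 0.4-style package layout:

# Fails with ImportError on older lm_eval releases that predate TaskManager.
from lm_eval import evaluator          # noqa: F401
from lm_eval.tasks import TaskManager

print(TaskManager)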
src/__pycache__/populate.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ

src/backend/__pycache__/envs.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/envs.cpython-310.pyc and b/src/backend/__pycache__/envs.cpython-310.pyc differ

src/backend/__pycache__/manage_requests.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/manage_requests.cpython-310.pyc and b/src/backend/__pycache__/manage_requests.cpython-310.pyc differ

src/backend/__pycache__/run_eval_suite.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/run_eval_suite.cpython-310.pyc and b/src/backend/__pycache__/run_eval_suite.cpython-310.pyc differ
src/backend/envs.py
CHANGED
@@ -11,7 +11,7 @@ from src.envs import CACHE_PATH
 @dataclass
 class Task:
     benchmark: str
-    metric: str
+    # metric: str # yeah i don't think we need this.
     col_name: str
     num_fewshot: int
 
@@ -21,15 +21,22 @@ class Tasks(Enum):
     # task0 = Task("pubmedqa", "acc", "PubMedQA", 0) # 64, as in the ATLAS paper
     # task1 = Task("hellaswag", "acc_norm", "HellaSwag", 0) # 64, as in the ATLAS paper
     # task0 = Task("medqa", "acc_norm", "MedQA", 0) # medqa_4options?
-    task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
-    task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
+    # task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
+    # task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
+
+    task0 = Task("medmcqa", "MedMCQA", 0)
+    task1 = Task("pubmedqa", "PubMedQA", 0)
+    task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
+    task3 = Task("biolama_umls", "BioLAMA-UMLS", 0)
 
 
 
 num_fewshots = {
     "medqa": 0,
     "medmcqa": 0,
-    "pubmedqa": 0
+    "pubmedqa": 0,
+    "pubmedqa_no_context":0,
+    "biolama_umls":0,
 }
 
 
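With metric dropped from the dataclass, a backend Task now carries only the harness task name, a display column name, and a few-shot count. A minimal sketch of how the enum and the num_fewshots map are meant to be read together (trimmed to two tasks, purely illustrative):

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # lm-eval task name, e.g. "pubmedqa_no_context"
    col_name: str    # leaderboard column label
    num_fewshot: int


class Tasks(Enum):
    task0 = Task("medmcqa", "MedMCQA", 0)
    task1 = Task("pubmedqa", "PubMedQA", 0)


num_fewshots = {"medmcqa": 0, "pubmedqa": 0}

# e.g. iterate the enum to pair each harness task name with its few-shot setting
for t in Tasks:
    print(t.value.benchmark, num_fewshots[t.value.benchmark])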
src/backend/manage_requests.py
CHANGED
@@ -45,7 +45,12 @@ class EvalRequest:
 
 
     def get_model_args(self) -> str:
-        model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
+
+        ## added
+        if "gpt" in self.model:
+            model_args = f"model={self.model},revision={self.revision},parallelize=True"
+        else:
+            model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
 
         if self.precision in ["float16", "float32", "bfloat16"]:
             model_args += f",dtype={self.precision}"
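A standalone sketch of the branching that get_model_args now does, handy for eyeballing the strings handed to the harness. The EvalRequest here is trimmed to the fields the method actually touches; the real class has more.

from dataclasses import dataclass


@dataclass
class EvalRequest:
    model: str
    revision: str = "main"
    precision: str = "float32"

    def get_model_args(self) -> str:
        # OpenAI-style models take `model=...`; local HF checkpoints take `pretrained=...`.
        if "gpt" in self.model:
            model_args = f"model={self.model},revision={self.revision},parallelize=True"
        else:
            model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"

        if self.precision in ["float16", "float32", "bfloat16"]:
            model_args += f",dtype={self.precision}"
        return model_args


print(EvalRequest("EleutherAI/pythia-70m").get_model_args())
# pretrained=EleutherAI/pythia-70m,revision=main,parallelize=True,dtype=float32
print(EvalRequest("gpt-3.5-turbo").get_model_args())
# model=gpt-3.5-turbo,revision=main,parallelize=True,dtype=float32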
src/backend/run_eval_suite.py
CHANGED
@@ -1,5 +1,10 @@
 from lm_eval import tasks, evaluator, utils
-from lm_eval.tasks import initialize_tasks,
+from lm_eval.tasks import initialize_tasks, TaskManager
+
+try:
+    from lm_eval.tasks import include_task_folder
+except:
+    from lm_eval.tasks import include_path
 
 from src.backend.manage_requests import EvalRequest
 
@@ -17,15 +22,31 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
-    include_task_folder("src/backend/tasks/")
-    initialize_tasks('INFO')
+
+    # try:
+    #     include_task_folder("src/backend/tasks/")
+    # except:
+    #     include_path("src/backend/tasks")
+
+    # initialize_tasks('INFO')
+    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+    # indexes all tasks from the `lm_eval/tasks` subdirectory.
+    # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
+    # to include a set of tasks in a separate directory.
+    task_manager = TaskManager(include_path="src/backend/probing_tasks")
+
+    if "gpt" in eval_request.model:
+        model = "openai-chat-completions"
+    else:
+        model = "hf-auto"
 
     print(f"Considered Tasks (after overriding): {task_names}")
 
     print(f"model_args: {eval_request.get_model_args()}")
 
-    results = evaluator.simple_evaluate(model=
+    results = evaluator.simple_evaluate(model=model, # "hf-causal-experimental", # "hf-causal" how can i make this work for
                                         model_args=eval_request.get_model_args(),
+                                        task_manager=task_manager,
                                         tasks=task_names,
                                         num_fewshot=num_fewshot,
                                         batch_size=batch_size,
@@ -33,6 +54,9 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
                                         device=device,
                                         use_cache=use_cache,
                                         limit=limit,
+
+                                        # task_manager=task_manager,
+                                        # include_path="/Users/chaeeunlee/Documents/VSC_workspaces/biomed_probing_leaderboard/src/backend/tasks",
                                         write_out=True)
 
     results["config"]["model_dtype"] = eval_request.precision
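This is the change the commit message refers to: custom task YAMLs are now registered through TaskManager(include_path=...) and passed to simple_evaluate, replacing the older include_task_folder / initialize_tasks calls. A minimal end-to-end sketch of that pattern under the 0.4-style harness API; the model, task, path, and limit are illustrative, not the Space's actual configuration.

from lm_eval import evaluator
from lm_eval.tasks import TaskManager

# Index the built-in tasks plus any custom YAML task configs found in this folder.
task_manager = TaskManager(include_path="src/backend/probing_tasks")

results = evaluator.simple_evaluate(
    model="hf",                                              # HF backend; the diff above uses "hf-auto"
    model_args="pretrained=EleutherAI/pythia-70m,dtype=float32",
    tasks=["pubmedqa"],
    num_fewshot=0,
    batch_size=1,
    device="cpu",
    limit=8,                                                 # small limit: testing only
    task_manager=task_manager,
    write_out=True,
)
print(results["results"])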
src/display/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ
src/display/utils.py
CHANGED
@@ -16,18 +16,15 @@ class Task:
 
 
 class Tasks(Enum):
-    # arc = Task("arc:challenge", "acc_norm", "ARC")
-    # hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
 
     # medqa = Task("medqa", "acc_norm", "MedQA") # medqa_4options?
+    # am i just manually going to include everything? hmm for display, idk how easily do i want to be able to tick this on and off?
+    # where does the acc_norm come from
     medmcqa = Task("medmcqa", "acc_norm", "MedMCQA")
-    # mmlu = Task("hendrycksTest", "acc", "MMLU")
-    # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    # winogrande = Task("winogrande", "acc", "Winogrande")
-    # gsm8k = Task("gsm8k", "acc", "GSM8K")
-    # drop = Task("drop", "f1", "DROP")
-
     pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
+    # task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
+    pubmedqa_no_context = Task("pubmedqa_no_context", "acc", "PubMedQA_no_context") # adding this throws an error. -> value=leaderboard_df[
+    biolama_umls = Task("biolama_umls", "acc", "BioLAMA-UMLS")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -40,6 +37,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dummy: bool = False
+    is_task: bool = False
 
 auto_eval_column_dict = []
 # Init
@@ -48,7 +46,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) # hidden was true by default
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, is_task=True)]) # hidden was true by default
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
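A small sketch of what the new is_task flag buys: each per-task score column is tagged at construction time, so downstream code (see src/populate.py below) can discover task columns by reflection instead of hard-coding their names. The make_dataclass step mirrors the usual leaderboard-template pattern and is an assumption here; only the is_task field itself comes from this diff.

from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
    is_task: bool = False  # new flag: marks per-task score columns


auto_eval_column_dict = [
    ["average", ColumnContent, ColumnContent("Avg", "number", True)],
    ["pubmedqa", ColumnContent, ColumnContent("PubMedQA", "number", True, is_task=True)],
    ["model_type", ColumnContent, ColumnContent("Type", "str", False)],
]

# Assumed construction step (the standard leaderboard-template pattern):
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.pubmedqa.is_task)    # True
print(AutoEvalColumn.model_type.is_task)  # False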
src/populate.py
CHANGED
@@ -22,8 +22,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     raw_data = get_raw_eval_results(results_path, requests_path)
 
-    # print(f"@@raw_data = {raw_data}")
-
     all_data_json = [v.to_dict() for v in raw_data] # if v.is_complete()]
     # all_data_json.append(baseline_row)
     filter_models(all_data_json)
@@ -31,12 +29,28 @@
     print(f"all_data_json = {all_data_json}")
 
     df = pd.DataFrame.from_records(all_data_json)
-    # if AutoEvalColumn.average.name in df:
-    #     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    # df = df[cols].round(decimals=2)
 
-
-
+    task_attributes = []
+
+    # Iterate over all attributes of AutoEvalColumn class
+    for attr_name in dir(AutoEvalColumn):
+        # Retrieve the attribute object
+        attr = getattr(AutoEvalColumn, attr_name)
+        # Check if the attribute has 'is_task' attribute and it is True
+        if hasattr(attr, 'is_task') and getattr(attr, 'is_task'):
+            task_attributes.append(attr)
+
+    # Now task_attributes contains all attributes where is_task=True
+    # print(task_attributes)
+    task_col_names_all = [str(item.name) for item in task_attributes]
+
+    # import pdb; pdb.set_trace()
+
+    # Add empty columns with specified names
+    for col_name in task_col_names_all:
+        if col_name not in df.columns:
+            df[col_name] = None
+
     return raw_data, df
 
 
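In isolation, the new column-filling logic boils down to: scan AutoEvalColumn for fields tagged is_task, then make sure the leaderboard DataFrame has a column for each of them even when no result exists yet. A toy end-to-end sketch; the stand-in class and the sample row are made up for illustration.

import pandas as pd
from dataclasses import dataclass


@dataclass(frozen=True)
class Col:
    name: str
    is_task: bool = False


class AutoEvalColumn:  # stand-in for the generated class in src/display/utils.py
    model = Col("Model")
    pubmedqa = Col("PubMedQA", is_task=True)
    biolama_umls = Col("BioLAMA-UMLS", is_task=True)


df = pd.DataFrame.from_records([{"Model": "pythia-70m", "PubMedQA": 0.55}])

# Collect the display names of every column tagged as a task score...
task_cols = []
for attr_name in dir(AutoEvalColumn):
    attr = getattr(AutoEvalColumn, attr_name)
    if hasattr(attr, "is_task") and attr.is_task:
        task_cols.append(attr.name)

# ...and add any that are missing, so later per-task column selection cannot KeyError.
for col_name in task_cols:
    if col_name not in df.columns:
        df[col_name] = None

print(df.columns.tolist())  # ['Model', 'PubMedQA', 'BioLAMA-UMLS']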