chaeeunlee committed
Commit 7c5b405 · 1 Parent(s): eaf281b

include_path debugged

.gitignore CHANGED
@@ -6,4 +6,8 @@ eval-results-bk/
 eval-queue-bk/
 
 src/backend/tasks/
-hub/
+src/backend/probing_tasks/
+hub/
+offload/
+
+token
backend-cli.py CHANGED
@@ -23,6 +23,26 @@ import time
 
 import logging
 import pprint
+import argparse
+
+
+# def get_subdirectories(path):
+#     subdirectories = []
+#     # Get all entries in the directory
+#     entries = os.listdir(path)
+#     for entry in entries:
+#         # Check if the entry is a directory
+#         if os.path.isdir(os.path.join(path, entry)):
+#             subdirectories.append(entry)
+#     return subdirectories
+
+# parser = argparse.ArgumentParser(description="Get subdirectory names")
+# parser.add_argument("include_path", help="Path to the directory", nargs='?', default=None)
+# args = parser.parse_args()
+
+# # = get_subdirectories(args.include_path)
+
+
 
 
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
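
Note: the block added above is entirely commented out; it only sketches a CLI hook for passing an include path to the backend. A minimal, runnable version of that draft (hypothetical wiring, backend-cli.py does not currently parse this argument):

import argparse
import os

def get_subdirectories(path):
    """Return the names of the immediate subdirectories of `path`."""
    return [entry for entry in os.listdir(path)
            if os.path.isdir(os.path.join(path, entry))]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get subdirectory names")
    parser.add_argument("include_path", help="Path to the directory", nargs="?", default=None)
    args = parser.parse_args()
    if args.include_path is not None:
        print(get_subdirectories(args.include_path))
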
manage_repos.ipynb CHANGED
@@ -18,7 +18,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/test_leaderboard\n"
+      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/huggingface_home_cache\n"
      ]
     },
     {
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
+    "res = API.delete_folder(path_in_repo='EleutherAI/pythia-70m_biolama_umls_eval_request_False_float32_Original.json', repo_id=QUEUE_REPO, repo_type='dataset')\n",
     "# res = API.delete_folder(path_in_repo='mistralai/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
     "\n",
     "# res = API.delete_file(path_in_repo=\"EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json\", repo_id=QUEUE_REPO, repo_type='dataset')\n"
requirements.txt CHANGED
@@ -17,7 +17,7 @@ semantic-version
 tqdm
 transformers>=4.36.0,<4.37.0
 tokenizers>=0.15.0
-lm_eval # [ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
+lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
 accelerate
 sentencepiece
 langdetect
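
Note: this is more than a cosmetic edit. In a requirements file everything after `#` is a comment, so the old line effectively installed lm_eval from PyPI and ignored the git reference. The new line is a direct reference, equivalent to `pip install "lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git"`, so the backend gets the harness version that exposes the TaskManager API imported in src/backend/run_eval_suite.py below.
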
src/__pycache__/populate.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
 
src/backend/__pycache__/envs.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/envs.cpython-310.pyc and b/src/backend/__pycache__/envs.cpython-310.pyc differ
 
src/backend/__pycache__/manage_requests.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/manage_requests.cpython-310.pyc and b/src/backend/__pycache__/manage_requests.cpython-310.pyc differ
 
src/backend/__pycache__/run_eval_suite.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/run_eval_suite.cpython-310.pyc and b/src/backend/__pycache__/run_eval_suite.cpython-310.pyc differ
 
src/backend/envs.py CHANGED
@@ -11,7 +11,7 @@ from src.envs import CACHE_PATH
 @dataclass
 class Task:
     benchmark: str
-    metric: str
+    # metric: str # yeah i don't think we need this.
     col_name: str
     num_fewshot: int
 
@@ -21,15 +21,22 @@ class Tasks(Enum):
     # task0 = Task("pubmedqa", "acc", "PubMedQA", 0) # 64, as in the ATLAS paper
     # task1 = Task("hellaswag", "acc_norm", "HellaSwag", 0) # 64, as in the ATLAS paper
     # task0 = Task("medqa", "acc_norm", "MedQA", 0) # medqa_4options?
-    task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
-    task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
+    # task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
+    # task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
+
+    task0 = Task("medmcqa", "MedMCQA", 0)
+    task1 = Task("pubmedqa", "PubMedQA", 0)
+    task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
+    task3 = Task("biolama_umls", "BioLAMA-UMLS", 0)
 
 
 
 num_fewshots = {
     "medqa": 0,
     "medmcqa": 0,
-    "pubmedqa": 0
+    "pubmedqa": 0,
+    "pubmedqa_no_context":0,
+    "biolama_umls":0,
 }
 
 
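Note: after this change a Task carries only (benchmark, col_name, num_fewshot), leaving the metric to the harness task config, and the num_fewshots dict keys off the lm-eval benchmark name. A minimal sketch of how the two pieces fit together (the fewshot_for helper is illustrative, not part of the repo):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str    # lm-eval task name, e.g. "biolama_umls"
    col_name: str     # display name used by the leaderboard UI
    num_fewshot: int

class Tasks(Enum):
    task0 = Task("medmcqa", "MedMCQA", 0)
    task1 = Task("pubmedqa", "PubMedQA", 0)
    task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
    task3 = Task("biolama_umls", "BioLAMA-UMLS", 0)

num_fewshots = {
    "medqa": 0,
    "medmcqa": 0,
    "pubmedqa": 0,
    "pubmedqa_no_context": 0,
    "biolama_umls": 0,
}

def fewshot_for(task: Task) -> int:
    # Illustrative lookup: prefer the dict, fall back to the Task's own field.
    return num_fewshots.get(task.benchmark, task.num_fewshot)

assert fewshot_for(Tasks.task3.value) == 0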
 
src/backend/manage_requests.py CHANGED
@@ -45,7 +45,12 @@ class EvalRequest:
 
 
     def get_model_args(self) -> str:
-        model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
+
+        ## added
+        if "gpt" in self.model:
+            model_args = f"model={self.model},revision={self.revision},parallelize=True"
+        else:
+            model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
 
         if self.precision in ["float16", "float32", "bfloat16"]:
             model_args += f",dtype={self.precision}"
src/backend/run_eval_suite.py CHANGED
@@ -1,5 +1,10 @@
 from lm_eval import tasks, evaluator, utils
-from lm_eval.tasks import initialize_tasks, include_task_folder
+from lm_eval.tasks import initialize_tasks, TaskManager
+
+try:
+    from lm_eval.tasks import include_task_folder
+except:
+    from lm_eval.tasks import include_path
 
 from src.backend.manage_requests import EvalRequest
 
@@ -17,15 +22,31 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
-    include_task_folder("src/backend/tasks/")
-    initialize_tasks('INFO')
+
+    # try:
+    #     include_task_folder("src/backend/tasks/")
+    # except:
+    #     include_path("src/backend/tasks")
+
+    # initialize_tasks('INFO')
+    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+    # indexes all tasks from the `lm_eval/tasks` subdirectory.
+    # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
+    # to include a set of tasks in a separate directory.
+    task_manager = TaskManager(include_path="src/backend/probing_tasks")
+
+    if "gpt" in eval_request.model:
+        model = "openai-chat-completions"
+    else:
+        model = "hf-auto"
 
     print(f"Considered Tasks (after overriding): {task_names}")
 
     print(f"model_args: {eval_request.get_model_args()}")
 
-    results = evaluator.simple_evaluate(model="hf-auto", # "hf-causal-experimental", # "hf-causal"
+    results = evaluator.simple_evaluate(model=model, # "hf-causal-experimental", # "hf-causal" how can i make this work for
                                         model_args=eval_request.get_model_args(),
+                                        task_manager=task_manager,
                                         tasks=task_names,
                                         num_fewshot=num_fewshot,
                                         batch_size=batch_size,
@@ -33,6 +54,9 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
                                         device=device,
                                         use_cache=use_cache,
                                         limit=limit,
+
+                                        # task_manager=task_manager,
+                                        # include_path="/Users/chaeeunlee/Documents/VSC_workspaces/biomed_probing_leaderboard/src/backend/tasks",
                                         write_out=True)
 
     results["config"]["model_dtype"] = eval_request.precision
src/display/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ
 
src/display/utils.py CHANGED
@@ -16,18 +16,15 @@ class Task:
 
 
 class Tasks(Enum):
-    # arc = Task("arc:challenge", "acc_norm", "ARC")
-    # hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
 
     # medqa = Task("medqa", "acc_norm", "MedQA") # medqa_4options?
+    # am i just manually going to include everything? hmm for display, idk how easily do i want to be able to tick this on and off?
+    # where does the acc_norm come from
     medmcqa = Task("medmcqa", "acc_norm", "MedMCQA")
-    # mmlu = Task("hendrycksTest", "acc", "MMLU")
-    # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    # winogrande = Task("winogrande", "acc", "Winogrande")
-    # gsm8k = Task("gsm8k", "acc", "GSM8K")
-    # drop = Task("drop", "f1", "DROP")
-
     pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
+    # task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
+    pubmedqa_no_context = Task("pubmedqa_no_context", "acc", "PubMedQA_no_context") # adding this throws an error. -> value=leaderboard_df[
+    biolama_umls = Task("biolama_umls", "acc", "BioLAMA-UMLS")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -40,6 +37,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dummy: bool = False
+    is_task: bool = False
 
 auto_eval_column_dict = []
 # Init
@@ -48,7 +46,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) # hidden was true by default
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, is_task=True)]) # hidden was true by default
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
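
Note: because is_task is added with a default and passed as a keyword, the existing positional ColumnContent calls keep working and hidden/never_hidden/dummy stay at their defaults. A small sketch assuming the template's usual leading fields (name, type, displayed_by_default), which are not shown in this hunk:

from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                  # assumed leading fields, as in the leaderboard template
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
    is_task: bool = False      # new flag marking benchmark score columns

col = ColumnContent("MedMCQA", "number", True, is_task=True)
assert col.is_task and not col.hidden  # keyword use leaves the other defaults untouched
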
src/populate.py CHANGED
@@ -22,8 +22,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     raw_data = get_raw_eval_results(results_path, requests_path)
 
-    # print(f"@@raw_data = {raw_data}")
-
     all_data_json = [v.to_dict() for v in raw_data] # if v.is_complete()]
     # all_data_json.append(baseline_row)
     filter_models(all_data_json)
@@ -31,12 +29,28 @@
     print(f"all_data_json = {all_data_json}")
 
     df = pd.DataFrame.from_records(all_data_json)
-    # if AutoEvalColumn.average.name in df:
-    #     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    #     df = df[cols].round(decimals=2)
 
-    # # filter out if any of the benchmarks have not been produced
-    # df = df[has_no_nan_values(df, benchmark_cols)]
+    task_attributes = []
+
+    # Iterate over all attributes of AutoEvalColumn class
+    for attr_name in dir(AutoEvalColumn):
+        # Retrieve the attribute object
+        attr = getattr(AutoEvalColumn, attr_name)
+        # Check if the attribute has 'is_task' attribute and it is True
+        if hasattr(attr, 'is_task') and getattr(attr, 'is_task'):
+            task_attributes.append(attr)
+
+    # Now task_attributes contains all attributes where is_task=True
+    # print(task_attributes)
+    task_col_names_all = [str(item.name) for item in task_attributes]
+
+    # import pdb; pdb.set_trace()
+
+    # Add empty columns with specified names
+    for col_name in task_col_names_all:
+        if col_name not in df.columns:
+            df[col_name] = None
+
     return raw_data, df
 
 
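
Note: the dir()/getattr loop works because AutoEvalColumn is (in the standard leaderboard template) built with dataclasses.make_dataclass from auto_eval_column_dict, so each field's default, a ColumnContent instance, is reachable as a plain class attribute, is_task included. A self-contained sketch of that mechanism under those assumptions:

from dataclasses import dataclass, make_dataclass
import pandas as pd

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    is_task: bool = False

# Stand-in for auto_eval_column_dict / AutoEvalColumn as built in src/display/utils.py.
auto_eval_column_dict = [
    ["average", ColumnContent, ColumnContent("Avg", "number", True)],
    ["medmcqa", ColumnContent, ColumnContent("MedMCQA", "number", True, is_task=True)],
    ["pubmedqa", ColumnContent, ColumnContent("PubMedQA", "number", True, is_task=True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

df = pd.DataFrame.from_records([{"Avg": 0.5}])

# Same selection as the loop in the diff, written as a comprehension.
task_col_names_all = [
    getattr(AutoEvalColumn, attr).name
    for attr in dir(AutoEvalColumn)
    if getattr(getattr(AutoEvalColumn, attr), "is_task", False)
]
for col_name in task_col_names_all:
    if col_name not in df.columns:
        df[col_name] = None  # make sure every task column exists before display

print(task_col_names_all)   # ['MedMCQA', 'PubMedQA']
print(list(df.columns))     # ['Avg', 'MedMCQA', 'PubMedQA']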