Commit 7c5b405
Parent(s): eaf281b

include_path debugged

Files changed:
- .gitignore +5 -1
- backend-cli.py +20 -0
- manage_repos.ipynb +2 -2
- requirements.txt +1 -1
- src/__pycache__/populate.cpython-310.pyc +0 -0
- src/backend/__pycache__/envs.cpython-310.pyc +0 -0
- src/backend/__pycache__/manage_requests.cpython-310.pyc +0 -0
- src/backend/__pycache__/run_eval_suite.cpython-310.pyc +0 -0
- src/backend/envs.py +11 -4
- src/backend/manage_requests.py +6 -1
- src/backend/run_eval_suite.py +28 -4
- src/display/__pycache__/utils.cpython-310.pyc +0 -0
- src/display/utils.py +7 -9
- src/populate.py +21 -7
.gitignore
CHANGED
@@ -6,4 +6,8 @@ eval-results-bk/
 eval-queue-bk/
 
 src/backend/tasks/
-
+src/backend/probing_tasks/
+hub/
+offload/
+
+token
backend-cli.py
CHANGED
@@ -23,6 +23,26 @@ import time
 
 import logging
 import pprint
+import argparse
+
+
+# def get_subdirectories(path):
+#     subdirectories = []
+#     # Get all entries in the directory
+#     entries = os.listdir(path)
+#     for entry in entries:
+#         # Check if the entry is a directory
+#         if os.path.isdir(os.path.join(path, entry)):
+#             subdirectories.append(entry)
+#     return subdirectories
+
+# parser = argparse.ArgumentParser(description="Get subdirectory names")
+# parser.add_argument("include_path", help="Path to the directory", nargs='?', default=None)
+# args = parser.parse_args()
+
+# # = get_subdirectories(args.include_path)
+
+
 
 
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
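For reference, a runnable version of the argparse scaffolding that the commented-out block above sketches. The include_path argument name comes from the diff; the __main__ guard and the final print are illustrative additions, not part of the commit.

import argparse
import os


def get_subdirectories(path):
    """Return the names of the immediate subdirectories of `path`."""
    subdirectories = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirectories.append(entry)
    return subdirectories


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get subdirectory names")
    # Optional positional argument, matching the commented-out sketch above.
    parser.add_argument("include_path", help="Path to the directory", nargs="?", default=None)
    args = parser.parse_args()

    if args.include_path is not None:
        print(get_subdirectories(args.include_path))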
manage_repos.ipynb
CHANGED
@@ -18,7 +18,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/
+      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/huggingface_home_cache\n"
      ]
     },
     {
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
+    "res = API.delete_folder(path_in_repo='EleutherAI/pythia-70m_biolama_umls_eval_request_False_float32_Original.json', repo_id=QUEUE_REPO, repo_type='dataset')\n",
    "# res = API.delete_folder(path_in_repo='mistralai/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
    "\n",
    "# res = API.delete_file(path_in_repo=\"EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json\", repo_id=QUEUE_REPO, repo_type='dataset')\n"
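The notebook cell above uses huggingface_hub's HfApi to prune stale entries from the request queue. A standalone sketch of the same calls; the repo id is a placeholder (the real QUEUE_REPO comes from src/envs.py), and the delete calls are destructive, so they are left commented out.

from huggingface_hub import HfApi

API = HfApi()  # assumes a write-access token is already configured (e.g. via `huggingface-cli login`)

QUEUE_REPO = "some-org/requests"  # placeholder for the Space's actual queue dataset

# Delete a single stale eval-request file from the queue dataset:
# API.delete_file(
#     path_in_repo="EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json",
#     repo_id=QUEUE_REPO,
#     repo_type="dataset",
# )

# Or delete a whole folder of requests at once:
# API.delete_folder(path_in_repo="EleutherAI/", repo_id=QUEUE_REPO, repo_type="dataset")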
requirements.txt
CHANGED
@@ -17,7 +17,7 @@ semantic-version
 tqdm
 transformers>=4.36.0,<4.37.0
 tokenizers>=0.15.0
-lm_eval
+lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
 accelerate
 sentencepiece
 langdetect
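The Git pin matters because the backend below relies on the harness's newer TaskManager-based API. A quick sanity check for the installed version, assuming the 0.4-style package layout:

# Fails with ImportError on older lm_eval releases that predate TaskManager.
from lm_eval import evaluator          # noqa: F401
from lm_eval.tasks import TaskManager

print(TaskManager)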
src/__pycache__/populate.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ

src/backend/__pycache__/envs.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/envs.cpython-310.pyc and b/src/backend/__pycache__/envs.cpython-310.pyc differ

src/backend/__pycache__/manage_requests.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/manage_requests.cpython-310.pyc and b/src/backend/__pycache__/manage_requests.cpython-310.pyc differ

src/backend/__pycache__/run_eval_suite.cpython-310.pyc
CHANGED
Binary files a/src/backend/__pycache__/run_eval_suite.cpython-310.pyc and b/src/backend/__pycache__/run_eval_suite.cpython-310.pyc differ
src/backend/envs.py
CHANGED
@@ -11,7 +11,7 @@ from src.envs import CACHE_PATH
 @dataclass
 class Task:
     benchmark: str
-    metric: str
+    # metric: str # yeah i don't think we need this.
     col_name: str
     num_fewshot: int
 
@@ -21,15 +21,22 @@ class Tasks(Enum):
     # task0 = Task("pubmedqa", "acc", "PubMedQA", 0) # 64, as in the ATLAS paper
     # task1 = Task("hellaswag", "acc_norm", "HellaSwag", 0) # 64, as in the ATLAS paper
     # task0 = Task("medqa", "acc_norm", "MedQA", 0) # medqa_4options?
-    task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
-    task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
+    # task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
+    # task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
+
+    task0 = Task("medmcqa", "MedMCQA", 0)
+    task1 = Task("pubmedqa", "PubMedQA", 0)
+    task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
+    task3 = Task("biolama_umls", "BioLAMA-UMLS", 0)
 
 
 
 num_fewshots = {
     "medqa": 0,
     "medmcqa": 0,
-    "pubmedqa": 0
+    "pubmedqa": 0,
+    "pubmedqa_no_context":0,
+    "biolama_umls":0,
 }
 
 
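With metric dropped from the dataclass, a backend Task now carries only the harness task name, a display column name, and a few-shot count. A minimal sketch of how the enum and the num_fewshots map are meant to be read together (trimmed to two tasks, purely illustrative):

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # lm-eval task name, e.g. "pubmedqa_no_context"
    col_name: str    # leaderboard column label
    num_fewshot: int


class Tasks(Enum):
    task0 = Task("medmcqa", "MedMCQA", 0)
    task1 = Task("pubmedqa", "PubMedQA", 0)


num_fewshots = {"medmcqa": 0, "pubmedqa": 0}

# e.g. iterate the enum to pair each harness task name with its few-shot setting
for t in Tasks:
    print(t.value.benchmark, num_fewshots[t.value.benchmark])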
src/backend/manage_requests.py
CHANGED
@@ -45,7 +45,12 @@ class EvalRequest:
 
 
     def get_model_args(self) -> str:
-        model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
+
+        ## added
+        if "gpt" in self.model:
+            model_args = f"model={self.model},revision={self.revision},parallelize=True"
+        else:
+            model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
 
         if self.precision in ["float16", "float32", "bfloat16"]:
             model_args += f",dtype={self.precision}"
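A standalone sketch of the branching that get_model_args now does, handy for eyeballing the strings handed to the harness. The EvalRequest here is trimmed to the fields the method actually touches; the real class has more.

from dataclasses import dataclass


@dataclass
class EvalRequest:
    model: str
    revision: str = "main"
    precision: str = "float32"

    def get_model_args(self) -> str:
        # OpenAI-style models take `model=...`; local HF checkpoints take `pretrained=...`.
        if "gpt" in self.model:
            model_args = f"model={self.model},revision={self.revision},parallelize=True"
        else:
            model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"

        if self.precision in ["float16", "float32", "bfloat16"]:
            model_args += f",dtype={self.precision}"
        return model_args


print(EvalRequest("EleutherAI/pythia-70m").get_model_args())
# pretrained=EleutherAI/pythia-70m,revision=main,parallelize=True,dtype=float32
print(EvalRequest("gpt-3.5-turbo").get_model_args())
# model=gpt-3.5-turbo,revision=main,parallelize=True,dtype=float32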
src/backend/run_eval_suite.py
CHANGED
@@ -1,5 +1,10 @@
 from lm_eval import tasks, evaluator, utils
-from lm_eval.tasks import initialize_tasks,
+from lm_eval.tasks import initialize_tasks, TaskManager
+
+try:
+    from lm_eval.tasks import include_task_folder
+except:
+    from lm_eval.tasks import include_path
 
 from src.backend.manage_requests import EvalRequest
 
@@ -17,15 +22,31 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
-    include_task_folder("src/backend/tasks/")
-    initialize_tasks('INFO')
+
+    # try:
+    #     include_task_folder("src/backend/tasks/")
+    # except:
+    #     include_path("src/backend/tasks")
+
+    # initialize_tasks('INFO')
+    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+    # indexes all tasks from the `lm_eval/tasks` subdirectory.
+    # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
+    # to include a set of tasks in a separate directory.
+    task_manager = TaskManager(include_path="src/backend/probing_tasks")
+
+    if "gpt" in eval_request.model:
+        model = "openai-chat-completions"
+    else:
+        model = "hf-auto"
 
     print(f"Considered Tasks (after overriding): {task_names}")
 
     print(f"model_args: {eval_request.get_model_args()}")
 
-    results = evaluator.simple_evaluate(model=
+    results = evaluator.simple_evaluate(model=model, # "hf-causal-experimental", # "hf-causal" how can i make this work for
                                         model_args=eval_request.get_model_args(),
+                                        task_manager=task_manager,
                                         tasks=task_names,
                                         num_fewshot=num_fewshot,
                                         batch_size=batch_size,
@@ -33,6 +54,9 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
                                         device=device,
                                         use_cache=use_cache,
                                         limit=limit,
+
+                                        # task_manager=task_manager,
+                                        # include_path="/Users/chaeeunlee/Documents/VSC_workspaces/biomed_probing_leaderboard/src/backend/tasks",
                                         write_out=True)
 
     results["config"]["model_dtype"] = eval_request.precision
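This is the change the commit message refers to: custom task YAMLs are now registered through TaskManager(include_path=...) and passed to simple_evaluate, replacing the older include_task_folder / initialize_tasks calls. A minimal end-to-end sketch of that pattern under the 0.4-style harness API; the model, task, path, and limit are illustrative, not the Space's actual configuration.

from lm_eval import evaluator
from lm_eval.tasks import TaskManager

# Index the built-in tasks plus any custom YAML task configs found in this folder.
task_manager = TaskManager(include_path="src/backend/probing_tasks")

results = evaluator.simple_evaluate(
    model="hf",                                              # HF backend; the diff above uses "hf-auto"
    model_args="pretrained=EleutherAI/pythia-70m,dtype=float32",
    tasks=["pubmedqa"],
    num_fewshot=0,
    batch_size=1,
    device="cpu",
    limit=8,                                                 # small limit: testing only
    task_manager=task_manager,
    write_out=True,
)
print(results["results"])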
src/display/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ
src/display/utils.py
CHANGED
@@ -16,18 +16,15 @@ class Task:
 
 
 class Tasks(Enum):
-    # arc = Task("arc:challenge", "acc_norm", "ARC")
-    # hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
 
     # medqa = Task("medqa", "acc_norm", "MedQA") # medqa_4options?
+    # am i just manually going to include everything? hmm for display, idk how easily do i want to be able to tick this on and off?
+    # where does the acc_norm come from
     medmcqa = Task("medmcqa", "acc_norm", "MedMCQA")
-    # mmlu = Task("hendrycksTest", "acc", "MMLU")
-    # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    # winogrande = Task("winogrande", "acc", "Winogrande")
-    # gsm8k = Task("gsm8k", "acc", "GSM8K")
-    # drop = Task("drop", "f1", "DROP")
-
     pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
+    # task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
+    pubmedqa_no_context = Task("pubmedqa_no_context", "acc", "PubMedQA_no_context") # adding this throws an error. -> value=leaderboard_df[
+    biolama_umls = Task("biolama_umls", "acc", "BioLAMA-UMLS")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -40,6 +37,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dummy: bool = False
+    is_task: bool = False
 
 auto_eval_column_dict = []
 # Init
@@ -48,7 +46,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) # hidden was true by default
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, is_task=True)]) # hidden was true by default
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
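A small sketch of what the new is_task flag buys: each per-task score column is tagged at construction time, so downstream code (see src/populate.py below) can discover task columns by reflection instead of hard-coding their names. The make_dataclass step mirrors the usual leaderboard-template pattern and is an assumption here; only the is_task field itself comes from this diff.

from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
    is_task: bool = False  # new flag: marks per-task score columns


auto_eval_column_dict = [
    ["average", ColumnContent, ColumnContent("Avg", "number", True)],
    ["pubmedqa", ColumnContent, ColumnContent("PubMedQA", "number", True, is_task=True)],
    ["model_type", ColumnContent, ColumnContent("Type", "str", False)],
]

# Assumed construction step (the standard leaderboard-template pattern):
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.pubmedqa.is_task)    # True
print(AutoEvalColumn.model_type.is_task)  # False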
src/populate.py
CHANGED
@@ -22,8 +22,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     raw_data = get_raw_eval_results(results_path, requests_path)
 
-    # print(f"@@raw_data = {raw_data}")
-
     all_data_json = [v.to_dict() for v in raw_data] # if v.is_complete()]
     # all_data_json.append(baseline_row)
     filter_models(all_data_json)
@@ -31,12 +29,28 @@
     print(f"all_data_json = {all_data_json}")
 
     df = pd.DataFrame.from_records(all_data_json)
-    # if AutoEvalColumn.average.name in df:
-    #     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    # df = df[cols].round(decimals=2)
 
-
-
+    task_attributes = []
+
+    # Iterate over all attributes of AutoEvalColumn class
+    for attr_name in dir(AutoEvalColumn):
+        # Retrieve the attribute object
+        attr = getattr(AutoEvalColumn, attr_name)
+        # Check if the attribute has 'is_task' attribute and it is True
+        if hasattr(attr, 'is_task') and getattr(attr, 'is_task'):
+            task_attributes.append(attr)
+
+    # Now task_attributes contains all attributes where is_task=True
+    # print(task_attributes)
+    task_col_names_all = [str(item.name) for item in task_attributes]
+
+    # import pdb; pdb.set_trace()
+
+    # Add empty columns with specified names
+    for col_name in task_col_names_all:
+        if col_name not in df.columns:
+            df[col_name] = None
+
     return raw_data, df
 
 
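In isolation, the new column-filling logic boils down to: scan AutoEvalColumn for fields tagged is_task, then make sure the leaderboard DataFrame has a column for each of them even when no result exists yet. A toy end-to-end sketch; the stand-in class and the sample row are made up for illustration.

import pandas as pd
from dataclasses import dataclass


@dataclass(frozen=True)
class Col:
    name: str
    is_task: bool = False


class AutoEvalColumn:  # stand-in for the generated class in src/display/utils.py
    model = Col("Model")
    pubmedqa = Col("PubMedQA", is_task=True)
    biolama_umls = Col("BioLAMA-UMLS", is_task=True)


df = pd.DataFrame.from_records([{"Model": "pythia-70m", "PubMedQA": 0.55}])

# Collect the display names of every column tagged as a task score...
task_cols = []
for attr_name in dir(AutoEvalColumn):
    attr = getattr(AutoEvalColumn, attr_name)
    if hasattr(attr, "is_task") and attr.is_task:
        task_cols.append(attr.name)

# ...and add any that are missing, so later per-task column selection cannot KeyError.
for col_name in task_cols:
    if col_name not in df.columns:
        df[col_name] = None

print(df.columns.tolist())  # ['Model', 'PubMedQA', 'BioLAMA-UMLS']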