Spaces:
Sleeping
Sleeping
eval_harness_v043_updates (#10)
Browse files
- Updates for newest lm_eval version, 0.4.3 (67a80c3deaeedce017896a57cc8c49a3d1a05bd2)
- Minor import reordering (62c033216a47fba3781e9e05df0f05d1a3e66c3d)
Co-authored-by: Margaret Mitchell <meg@users.noreply.huggingface.co>
- app.py +2 -2
- main_backend_harness.py +1 -2
- requirements.txt +2 -2
- src/backend/run_eval_suite_harness.py +12 -9
app.py
CHANGED
@@ -8,8 +8,8 @@ configure_root_logger()
|
|
8 |
from functools import partial
|
9 |
|
10 |
import gradio as gr
|
11 |
-
from main_backend_lighteval import run_auto_eval
|
12 |
-
|
13 |
from src.display.log_visualizer import log_file_to_html_string
|
14 |
from src.display.css_html_js import dark_mode_gradio_js
|
15 |
from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
|
|
|
8 |
from functools import partial
|
9 |
|
10 |
import gradio as gr
|
11 |
+
# from main_backend_lighteval import run_auto_eval
|
12 |
+
from main_backend_harness import run_auto_eval
|
13 |
from src.display.log_visualizer import log_file_to_html_string
|
14 |
from src.display.css_html_js import dark_mode_gradio_js
|
15 |
from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
|
main_backend_harness.py
CHANGED
@@ -70,9 +70,8 @@ def run_auto_eval():
|
|
70 |
num_fewshot=NUM_FEWSHOT,
|
71 |
local_dir=EVAL_RESULTS_PATH_BACKEND,
|
72 |
results_repo=RESULTS_REPO,
|
73 |
-
batch_size=
|
74 |
device=DEVICE,
|
75 |
-
no_cache=True,
|
76 |
limit=LIMIT
|
77 |
)
|
78 |
|
|
|
70 |
num_fewshot=NUM_FEWSHOT,
|
71 |
local_dir=EVAL_RESULTS_PATH_BACKEND,
|
72 |
results_repo=RESULTS_REPO,
|
73 |
+
batch_size="auto",
|
74 |
device=DEVICE,
|
|
|
75 |
limit=LIMIT
|
76 |
)
|
77 |
|
requirements.txt
CHANGED
@@ -5,12 +5,12 @@ huggingface-hub>=0.18.0
|
|
5 |
python-dateutil==2.8.2
|
6 |
requests==2.28.2
|
7 |
tqdm==4.65.0
|
8 |
-
accelerate
|
9 |
sentencepiece
|
10 |
|
11 |
# Evaluation suites
|
12 |
lighteval
|
13 |
-
lm_eval
|
14 |
|
15 |
# Log Visualizer
|
16 |
BeautifulSoup4==4.12.2
|
|
|
5 |
python-dateutil==2.8.2
|
6 |
requests==2.28.2
|
7 |
tqdm==4.65.0
|
8 |
+
accelerate>=0.26.0
|
9 |
sentencepiece
|
10 |
|
11 |
# Evaluation suites
|
12 |
lighteval
|
13 |
+
lm_eval==0.4.3
|
14 |
|
15 |
# Log Visualizer
|
16 |
BeautifulSoup4==4.12.2
|
src/backend/run_eval_suite_harness.py
CHANGED
@@ -4,26 +4,29 @@ import logging
|
|
4 |
from datetime import datetime
|
5 |
|
6 |
from lm_eval import tasks, evaluator, utils
|
|
|
7 |
|
8 |
from src.envs import RESULTS_REPO, API
|
9 |
from src.backend.manage_requests import EvalRequest
|
10 |
from src.logging import setup_logger
|
11 |
|
|
|
|
|
12 |
logging.getLogger("openai").setLevel(logging.WARNING)
|
13 |
logger = setup_logger(__name__)
|
14 |
|
15 |
-
def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
|
16 |
"""Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
|
17 |
|
18 |
Args:
|
19 |
eval_request (EvalRequest): Input evaluation request file representation
|
20 |
task_names (list): Tasks to launch
|
21 |
num_fewshot (int): Number of few shots to use
|
22 |
-
batch_size (int): Selected batch size
|
23 |
-
device (str): "cpu" or "
|
24 |
local_dir (str): Where to save the results locally
|
25 |
results_repo (str): To which repository to upload the results
|
26 |
-
no_cache (bool, optional): Whether to use a cache or not
|
27 |
limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
|
28 |
|
29 |
Returns:
|
@@ -34,21 +37,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
|
|
34 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
35 |
)
|
36 |
|
37 |
-
|
|
|
|
|
38 |
|
39 |
logger.info(f"Selected Tasks: {task_names}")
|
40 |
|
41 |
results = evaluator.simple_evaluate(
|
42 |
-
model="hf
|
43 |
model_args=eval_request.get_model_args(),
|
44 |
tasks=task_names,
|
45 |
num_fewshot=num_fewshot,
|
46 |
batch_size=batch_size,
|
47 |
device=device,
|
48 |
-
no_cache=no_cache,
|
49 |
limit=limit,
|
50 |
-
write_out=True,
|
51 |
-
output_base_path="logs"
|
52 |
)
|
53 |
|
54 |
results["config"]["model_dtype"] = eval_request.precision
|
|
|
4 |
from datetime import datetime
|
5 |
|
6 |
from lm_eval import tasks, evaluator, utils
|
7 |
+
from lm_eval.tasks import TaskManager
|
8 |
|
9 |
from src.envs import RESULTS_REPO, API
|
10 |
from src.backend.manage_requests import EvalRequest
|
11 |
from src.logging import setup_logger
|
12 |
|
13 |
+
from typing import Union
|
14 |
+
|
15 |
logging.getLogger("openai").setLevel(logging.WARNING)
|
16 |
logger = setup_logger(__name__)
|
17 |
|
18 |
+
def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: Union[int, str], device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
|
19 |
"""Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
|
20 |
|
21 |
Args:
|
22 |
eval_request (EvalRequest): Input evaluation request file representation
|
23 |
task_names (list): Tasks to launch
|
24 |
num_fewshot (int): Number of few shots to use
|
25 |
+
batch_size (int or str): Selected batch size or 'auto'
|
26 |
+
device (str): "cpu" or "cuda:0", depending on what you assigned to the space
|
27 |
local_dir (str): Where to save the results locally
|
28 |
results_repo (str): To which repository to upload the results
|
29 |
+
no_cache (bool, optional): Whether to use a cache or not
|
30 |
limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
|
31 |
|
32 |
Returns:
|
|
|
37 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
38 |
)
|
39 |
|
40 |
+
task_manager = TaskManager()
|
41 |
+
all_tasks = task_manager.all_tasks
|
42 |
+
task_names = utils.pattern_match(task_names, all_tasks)
|
43 |
|
44 |
logger.info(f"Selected Tasks: {task_names}")
|
45 |
|
46 |
results = evaluator.simple_evaluate(
|
47 |
+
model="hf",
|
48 |
model_args=eval_request.get_model_args(),
|
49 |
tasks=task_names,
|
50 |
num_fewshot=num_fewshot,
|
51 |
batch_size=batch_size,
|
52 |
device=device,
|
|
|
53 |
limit=limit,
|
54 |
+
write_out=True # Whether to write out an example document and model input, for checking task integrity
|
|
|
55 |
)
|
56 |
|
57 |
results["config"]["model_dtype"] = eval_request.precision
|