Quentin Gallouédec committed
Commit 74e3b17
Parent: e462d51

back and front!

.gitignore CHANGED
@@ -12,4 +12,6 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
-output.log
+output.log
+env
+.DS_Store
Makefile CHANGED
@@ -2,12 +2,12 @@
 
 
 style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
+	python -m black --line-length 119 scripts src app.py
+	python -m isort scripts src app.py
+	ruff check --fix scripts src app.py
 
 
 quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
+	python -m black --check --line-length 119 scripts src app.py
+	python -m isort --check-only scripts src app.py
+	ruff check scripts src app.py
app.py CHANGED
@@ -1,62 +1,262 @@
-import logging
-from src.logging import configure_root_logger
-
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-logging.getLogger("absl").setLevel(logging.WARNING)
-configure_root_logger()
-
-from functools import partial
-
-import gradio as gr
-from main_backend_harness import run_auto_eval
-from src.display.log_visualizer import log_file_to_html_string
-from src.display.css_html_js import dark_mode_gradio_js
-from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
-from src.logging import setup_logger, log_file
-
-logging.basicConfig(level=logging.INFO)
-logger = setup_logger(__name__)
-
-
-intro_md = f"""
-# Intro
-This is a visual for the auto evaluator.
-"""
-
-links_md = f"""
-# Important links
-| Description | Link |
-|-----------------|------|
-| Leaderboard | [{REPO_ID}](https://huggingface.co/spaces/{REPO_ID}) |
-| Queue Repo | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
-| Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
-"""
-
-
-def button_auto_eval():
-    logger.info("Manually triggering Auto Eval")
-    run_auto_eval()
-
-
-reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
-
-with gr.Blocks(js=dark_mode_gradio_js) as demo:
-    gr.Markdown(intro_md)
-    with gr.Tab("Application"):
-        output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
-        with gr.Row():
-            download_button = gr.DownloadButton("Download Log File", value=log_file)
-            with gr.Accordion("Log View Configuration", open=False):
-                reverse_order_checkbox.render()
-        # Add a button that when pressed, triggers run_auto_eval
-        button = gr.Button("Manually Run Evaluation")
-    gr.Markdown(links_md)
-
-    dummy = gr.Markdown(run_auto_eval, every=REFRESH_RATE, visible=False)
-
-    button.click(fn=button_auto_eval, inputs=[], outputs=[])
-
-
-if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
+import fnmatch
+import glob
+import json
+import logging
+import os
+import pprint
+
+import gradio as gr
+import gymnasium as gym
+import numpy as np
+import pandas as pd
+import torch
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+from src.css_html_js import dark_mode_gradio_js
+from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.logging import configure_root_logger, setup_logger
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+logger = setup_logger(__name__)
+
+configure_root_logger()
+logger = setup_logger(__name__)
+
+pp = pprint.PrettyPrinter(width=80)
+
+
+ALL_ENV_IDS = [
+    "CartPole-v1",
+    # "BreakoutNoFrameskip-v4",
+]
+
+
+def model_hyperlink(link, model_id):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
+
+
+def make_clickable_model(model_id):
+    link = f"https://huggingface.co/{model_id}"
+    return model_hyperlink(link, model_id)
+
+
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    env_ids = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            env_ids.add(matching)
+    return sorted(list(env_ids))
+
+
+def evaluate(model_id, revision):
+    tags = API.model_info(model_id, revision=revision).tags
+
+    # Extract the environment IDs from the tags (usually only one)
+    env_ids = pattern_match(tags, ALL_ENV_IDS)
+    logger.info(f"Selected environments: {env_ids}")
+
+    results = {}
+
+    # Check if the agent exists
+    try:
+        agent_path = hf_hub_download(repo_id=model_id, filename="agent.pt")
+    except EntryNotFoundError:
+        logger.error("Agent not found")
+        return None
+
+    # Check safety
+    security = next(iter(API.list_files_info(model_id, "agent.pt", expand=True))).security
+    if security is None or "safe" not in security:
+        logger.error("Agent safety not available")
+        return None
+    elif not security["safe"]:
+        logger.error("Agent not safe")
+        return None
+
+    # Load the agent
+    try:
+        agent = torch.jit.load(agent_path)
+    except Exception as e:
+        logger.error(f"Error loading agent: {e}")
+        return None
+
+    # Evaluate the agent on the environments
+    for env_id in env_ids:
+        episodic_rewards = []
+        env = gym.make(env_id)
+        for _ in range(10):
+            episodic_reward = 0.0
+            observation, info = env.reset()
+            done = False
+            while not done:
+                torch_observation = torch.from_numpy(np.array([observation]))
+                action = agent(torch_observation).numpy()[0]
+                observation, reward, terminated, truncated, info = env.step(action)
+                done = terminated or truncated
+                episodic_reward += reward
+
+            episodic_rewards.append(episodic_reward)
+
+        mean_reward = np.mean(episodic_rewards)
+        results[env_id] = {"episodic_return": mean_reward}
+    return results
+
+
+def _backend_routine():
+    # List the reinforcement learning models
+    rl_models = list(API.list_models(filter="reinforcement-learning"))
+    logger.info(f"Found {len(rl_models)} RL models")
+    compatible_models = []
+    for model in rl_models:
+        filenames = [sib.rfilename for sib in model.siblings]
+        if "agent.pt" in filenames:
+            compatible_models.append((model.modelId, model.sha))
+
+    logger.info(f"Found {len(compatible_models)} compatible models")
+
+    # Get the results
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        revision="main",
+        local_dir=RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+
+    evaluated_models = set()
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))
+
+    # Find the models that are not associated with any results
+    pending_models = set(compatible_models) - evaluated_models
+    logger.info(f"Found {len(pending_models)} pending models")
+
+    # Run an evaluation on the models
+    for model_id, sha in pending_models:
+        logger.info(f"Running evaluation on {model_id}")
+        report = {"config": {"model_id": model_id, "model_sha": sha}}
+        evaluations = evaluate(model_id, revision=sha)
+        if evaluations is not None:
+            report["results"] = evaluations
+            report["status"] = "DONE"
+        else:
+            report["status"] = "FAILED"
+
+        # Update the results
+        dumped = json.dumps(report, indent=2)
+        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        with open(output_path, "w") as f:
+            f.write(dumped)
+
+        # Upload the results to the results repo
+        API.upload_file(
+            path_or_fileobj=output_path,
+            path_in_repo=f"{model_id}/results_{sha}.json",
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+        )
+
+
+def backend_routine():
+    try:
+        _backend_routine()
+    except Exception as e:
+        logger.error(f"{e.__class__.__name__}: {str(e)}")
+
+
+def get_leaderboard_df():
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        revision="main",
+        local_dir=RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+
+    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+    data = []
+
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            report = json.load(fp)
+        model_id = report["config"]["model_id"]
+        row = {"Agent": model_id, "Status": report["status"]}
+        if report["status"] == "DONE":
+            results = {env_id: result["episodic_return"] for env_id, result in report["results"].items()}
+            row.update(results)
+        data.append(row)
+
+    # Create DataFrame
+    df = pd.DataFrame(data)
+    # Replace NaN values with empty strings
+    df = df.fillna("")
+    return df
+
+
+TITLE = """
+🚀 Open RL Leaderboard
+"""
+
+INTRODUCTION_TEXT = """
+Welcome to the Open RL Leaderboard! This is a community-driven benchmark for reinforcement learning models.
+"""
+
+ABOUT_TEXT = """
+The Open RL Leaderboard is a community-driven benchmark for reinforcement learning models.
+"""
+
+
+def select_column(column_names, data):
+    column_names = [col for col in column_names if col in data.columns]
+    column_names = ["Agent"] + column_names  # add model name column
+    df = data[column_names]
+
+    def check_row(row):
+        return not (row.drop("Agent") == "").all()
+
+    mask = df.apply(check_row, axis=1)
+    df = df[mask]
+    return df
+
+
+with gr.Blocks(js=dark_mode_gradio_js) as demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+            full_df = get_leaderboard_df()
+            hidden_df = gr.components.Dataframe(full_df, visible=False)  # hidden dataframe
+
+            env_checkboxes = gr.components.CheckboxGroup(
+                label="Environments",
+                choices=ALL_ENV_IDS,
+                value=[ALL_ENV_IDS[0]],
+                interactive=True,
+            )
+            leaderboard = gr.components.Dataframe(select_column([ALL_ENV_IDS[0]], full_df))
+
+            # Events
+            env_checkboxes.change(select_column, [env_checkboxes, hidden_df], leaderboard)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(ABOUT_TEXT)
+
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(func=backend_routine, trigger="interval", seconds=30)
+scheduler.start()
+
+
+if __name__ == "__main__":
+    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860
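
Note on the agent format expected by the new `evaluate` function above: it downloads `agent.pt` from the model repo, loads it with `torch.jit.load`, and calls `agent(batched_observations).numpy()[0]` to get an action. A minimal sketch of an export that would satisfy this contract is shown below; the `Agent` class, its layer sizes (CartPole-v1's 4 observation features and 2 discrete actions), and the file name handling are illustrative assumptions, not part of this commit.

import torch
from torch import nn


class Agent(nn.Module):  # hypothetical example policy, not part of this repo
    def __init__(self):
        super().__init__()
        # 4 observation features -> 2 discrete actions (CartPole-v1 sizes, assumed)
        self.net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        # The backend calls agent(batched_observations) and takes .numpy()[0],
        # so forward returns one action per observation in the batch.
        return self.net(observations.float()).argmax(dim=-1)


torch.jit.script(Agent()).save("agent.pt")  # this is the file the backend downloads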
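
For reference, a results file written by `_backend_routine` and read back by `get_leaderboard_df` has roughly this shape (a sketch inferred from the code above; the model id, sha, and return value are made up):

import json

report = {
    "config": {"model_id": "some-user/some-agent", "model_sha": "0123abcd"},
    "results": {"CartPole-v1": {"episodic_return": 500.0}},
    "status": "DONE",  # or "FAILED", in which case "results" is absent
}
print(json.dumps(report, indent=2))
# stored as <RESULTS_PATH>/<model_id>/results_<sha>.json and uploaded to the results dataset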
main_backend_harness.py DELETED
@@ -1,102 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import (
-    QUEUE_REPO,
-    EVAL_REQUESTS_PATH_BACKEND,
-    RESULTS_REPO,
-    EVAL_RESULTS_PATH_BACKEND,
-    DEVICE,
-    API,
-    LIMIT,
-    TOKEN,
-)
-from src.about import Tasks, NUM_FEWSHOT
-from src.logging import setup_logger
-
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-
-# logging.basicConfig(level=logging.ERROR)
-logger = setup_logger(__name__)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(
-    repo_id=RESULTS_REPO,
-    revision="main",
-    local_dir=EVAL_RESULTS_PATH_BACKEND,
-    repo_type="dataset",
-    max_workers=60,
-    token=TOKEN,
-)
-snapshot_download(
-    repo_id=QUEUE_REPO,
-    revision="main",
-    local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    repo_type="dataset",
-    max_workers=60,
-    token=TOKEN,
-)
-
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
-        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
-    )
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-    )
-
-
-if __name__ == "__main__":
-    run_auto_eval()
scripts/create_request_file.py DELETED
@@ -1,73 +0,0 @@
-import json
-import os
-import pprint
-from datetime import datetime, timezone
-
-import click
-from colorama import Fore
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
-
-
-def main():
-    api = HfApi()
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(
-        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
-    )
-
-    model_name = click.prompt("Enter model name")
-    revision = click.prompt("Enter revision", default="main")
-    status = click.prompt("Enter status", default="FINISHED")
-
-    try:
-        model_info = api.model_info(repo_id=model_name, revision=revision)
-    except Exception as e:
-        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
-        return 1
-
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        license = "?"
-
-    eval_entry = {
-        "model": model_name,
-        "revision": revision,
-        "status": status,
-        "submitted_time": current_time,
-        "likes": model_info.likes,
-        "license": license,
-    }
-
-    user_name = ""
-    model_path = model_name
-    if "/" in model_name:
-        user_name = model_name.split("/")[0]
-        model_path = model_name.split("/")[1]
-
-    pprint.pprint(eval_entry)
-
-    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
-        click.echo("continuing...")
-
-        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
-        os.makedirs(out_dir, exist_ok=True)
-        out_path = f"{out_dir}/{model_path}_eval_request.json"
-
-        with open(out_path, "w") as f:
-            f.write(json.dumps(eval_entry))
-
-        api.upload_file(
-            path_or_fileobj=out_path,
-            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            commit_message=f"Add {model_name} to eval queue",
-        )
-    else:
-        click.echo("aborting...")
-
-
-if __name__ == "__main__":
-    main()
scripts/fix_harness_import.py DELETED
@@ -1,11 +0,0 @@
-"""This file should be used after pip install -r requirements.
-It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
-It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
-"""
-import os
-
-import lm_eval
-
-if __name__ == "__main__":
-    lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py DELETED
@@ -1,27 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Change for your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task0 = Task("PongNoFrameskip-v4", "episodic_return", "PongNoFrameskip-v4")
-    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
-    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
-
-
-NUM_FEWSHOT = 0  # Change with your few shot
-
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-# ---------------------------------------------------
-
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-# custom|myothertask|0|0
src/backend/manage_requests.py DELETED
@@ -1,107 +0,0 @@
-import glob
-import json
-from dataclasses import dataclass
-from typing import Optional
-
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-
-@dataclass
-class EvalRequest:
-    model: str
-    status: str
-    json_filepath: str
-    revision: str = "main"  # commit
-    submitted_time: Optional[
-        str
-    ] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    likes: Optional[int] = 0
-    license: Optional[str] = ""
-
-
-def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
-    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
-    json_filepath = eval_request.json_filepath
-
-    with open(json_filepath) as fp:
-        data = json.load(fp)
-
-    data["status"] = set_to_status
-
-    with open(json_filepath, "w") as f:
-        f.write(json.dumps(data))
-
-    api.upload_file(
-        path_or_fileobj=json_filepath,
-        path_in_repo=json_filepath.replace(local_dir, ""),
-        repo_id=hf_repo,
-        repo_type="dataset",
-    )
-
-
-def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
-    """Get all pending evaluation requests and return a list in which private
-    models appearing first, followed by public models sorted by the number of
-    likes.
-
-    Returns:
-        `list[EvalRequest]`: a list of model info dicts.
-    """
-    snapshot_download(
-        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
-    )
-    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
-
-    eval_requests = []
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        if data["status"] in job_status:
-            data["json_filepath"] = json_filepath
-            eval_request = EvalRequest(**data)
-            eval_requests.append(eval_request)
-
-    return eval_requests
-
-
-def check_completed_evals(
-    api: HfApi,
-    hf_repo: str,
-    local_dir: str,
-    checked_status: str,
-    completed_status: str,
-    failed_status: str,
-    hf_repo_results: str,
-    local_dir_results: str,
-):
-    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(
-        repo_id=hf_repo_results,
-        revision="main",
-        local_dir=local_dir_results,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-
-    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
-
-    for eval_request in running_evals:
-        model = eval_request.model
-        logger.info("====================================")
-        logger.info(f"Checking {model}")
-
-        output_path = model
-        output_file = f"{local_dir_results}/{output_path}/results*.json"
-        output_file_exists = len(glob.glob(output_file)) > 0
-
-        if output_file_exists:
-            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
-            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
-        else:
-            logger.info(f"No result file found for {model} setting it to {failed_status}")
-            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite_harness.py DELETED
@@ -1,91 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-import fnmatch
-import torch
-from torch import nn
-from huggingface_hub.utils._errors import EntryNotFoundError
-
-import gymnasium as gym
-
-
-import numpy as np
-from typing import List
-from huggingface_hub import hf_hub_download
-from src.backend.manage_requests import EvalRequest
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-
-def pattern_match(patterns, source_list):
-    if isinstance(patterns, str):
-        patterns = [patterns]
-
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
-def run_evaluation(eval_request: EvalRequest, task_names, local_dir: str, results_repo: str):
-    tags = API.model_info(eval_request.model).tags
-    task_names = pattern_match(tags, task_names)
-
-    logger.info(f"Selected Tasks: {task_names}")
-
-    results = {
-        "config": {
-            "model_name": eval_request.model,
-            "model_sha": eval_request.revision,
-        },
-        "results": {},
-    }
-    try:
-        agent_path = hf_hub_download(repo_id=eval_request.model, filename="agent.pt")
-    except EntryNotFoundError:
-        logger.error("Agent not found")
-        return
-    agent = torch.jit.load(agent_path)
-
-    episodic_rewards = []
-    for task_name in task_names:
-        env = gym.make(task_name)
-        for _ in range(10):
-            episodic_reward = 0.0
-            observation, info = env.reset()
-            done = False
-            while not done:
-                torch_observation = torch.from_numpy(np.array([observation]))
-                action = agent(torch_observation).numpy()[0]
-                observation, reward, terminated, truncated, info = env.step(action)
-                done = terminated or truncated
-                episodic_reward += reward
-
-            episodic_rewards.append(episodic_reward)
-
-        mean_reward = np.mean(episodic_rewards)
-        results[task_name] = {"episodic_return": mean_reward}
-
-    dumped = json.dumps(results, indent=2)
-    logger.info(dumped)
-
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-
-    return results
src/backend/sort_queue.py DELETED
@@ -1,23 +0,0 @@
-import re
-from dataclasses import dataclass
-
-from huggingface_hub import HfApi
-
-from src.backend.manage_requests import EvalRequest
-
-
-@dataclass
-class ModelMetadata:
-    likes: int = 0
-
-
-def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    return sort_by_submit_date(models)
-
-
-def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
-
-
-def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/{display/css_html_js.py → css_html_js.py} RENAMED
File without changes
src/display/log_visualizer.py DELETED
@@ -1,40 +0,0 @@
-from io import StringIO
-from pathlib import Path
-
-from bs4 import BeautifulSoup
-from rich.console import Console
-from rich.syntax import Syntax
-
-from src.display.css_html_js import style_content
-from src.envs import NUM_LINES_VISUALIZE
-from src.logging import log_file
-
-
-def log_file_to_html_string(reverse=True):
-    with open(log_file, "rt") as f:
-        lines = f.readlines()
-    lines = lines[-NUM_LINES_VISUALIZE:]
-
-    if reverse:
-        lines = reversed(lines)
-
-    output = "".join(lines)
-    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
-
-    console = Console(record=True, width=150, style="#272822", file=StringIO())
-    console.print(syntax)
-    html_content = console.export_html(inline_styles=True)
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, "lxml")
-
-    # Modify the <pre> tag and add custom styles
-    pre_tag = soup.pre
-    pre_tag["class"] = "scrollable"
-    del pre_tag["style"]
-
-    # Add your custom styles and the .scrollable CSS to the <style> tag
-    style_tag = soup.style
-    style_tag.append(style_content)
-
-    return soup.prettify()
src/envs.py CHANGED
@@ -8,8 +8,8 @@ TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
 OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
 
-# For harness evaluations
-DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+# For evaluations
+DEVICE = "cpu"  # "cuda:0" if you add compute, for evaluations
 LIMIT = 20  # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
@@ -19,17 +19,13 @@ VENDOR = "aws"
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/backend"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 REFRESH_RATE = 1 * 60  # 1 min
 NUM_LINES_VISUALIZE = 300
src/logging.py CHANGED
@@ -1,4 +1,3 @@
-import sys
 from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
src/populate.py DELETED
@@ -1,56 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]