Quentin Gallouédec committed
Commit 74e3b17
Parent: e462d51

back and front!

.gitignore CHANGED
@@ -12,4 +12,6 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
-output.log
+output.log
+env
+.DS_Store
Makefile CHANGED
@@ -2,12 +2,12 @@
 
 
 style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
+	python -m black --line-length 119 scripts src app.py
+	python -m isort scripts src app.py
+	ruff check --fix scripts src app.py
 
 
 quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
+	python -m black --check --line-length 119 scripts src app.py
+	python -m isort --check-only scripts src app.py
+	ruff check scripts src app.py
app.py CHANGED
@@ -1,62 +1,262 @@
-import logging
-from src.logging import configure_root_logger
-
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-logging.getLogger("absl").setLevel(logging.WARNING)
-configure_root_logger()
-
-from functools import partial
-
-import gradio as gr
-from main_backend_harness import run_auto_eval
-from src.display.log_visualizer import log_file_to_html_string
-from src.display.css_html_js import dark_mode_gradio_js
-from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
-from src.logging import setup_logger, log_file
-
-logging.basicConfig(level=logging.INFO)
-logger = setup_logger(__name__)
-
-
-intro_md = f"""
-# Intro
-This is a visual for the auto evaluator.
-"""
-
-links_md = f"""
-# Important links
-| Description | Link |
-|-----------------|------|
-| Leaderboard | [{REPO_ID}](https://huggingface.co/spaces/{REPO_ID}) |
-| Queue Repo | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
-| Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
-"""
-
-
-def button_auto_eval():
-    logger.info("Manually triggering Auto Eval")
-    run_auto_eval()
-
-
-reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
-
-with gr.Blocks(js=dark_mode_gradio_js) as demo:
-    gr.Markdown(intro_md)
-    with gr.Tab("Application"):
-        output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
-        with gr.Row():
-            download_button = gr.DownloadButton("Download Log File", value=log_file)
-            with gr.Accordion("Log View Configuration", open=False):
-                reverse_order_checkbox.render()
-        # Add a button that when pressed, triggers run_auto_eval
-        button = gr.Button("Manually Run Evaluation")
-    gr.Markdown(links_md)
-
-    dummy = gr.Markdown(run_auto_eval, every=REFRESH_RATE, visible=False)
-
-    button.click(fn=button_auto_eval, inputs=[], outputs=[])
-
-
-if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
+import fnmatch
+import glob
+import json
+import logging
+import os
+import pprint
+
+import gradio as gr
+import gymnasium as gym
+import numpy as np
+import pandas as pd
+import torch
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+from src.css_html_js import dark_mode_gradio_js
+from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.logging import configure_root_logger, setup_logger
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+logger = setup_logger(__name__)
+
+configure_root_logger()
+logger = setup_logger(__name__)
+
+pp = pprint.PrettyPrinter(width=80)
+
+
+ALL_ENV_IDS = [
+    "CartPole-v1",
+    # "BreakoutNoFrameskip-v4",
+]
+
+
+def model_hyperlink(link, model_id):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
+
+
+def make_clickable_model(model_id):
+    link = f"https://huggingface.co/{model_id}"
+    return model_hyperlink(link, model_id)
+
+
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    env_ids = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            env_ids.add(matching)
+    return sorted(list(env_ids))
+
+
+def evaluate(model_id, revision):
+    tags = API.model_info(model_id, revision=revision).tags
+
+    # Extract the environment IDs from the tags (usually only one)
+    env_ids = pattern_match(tags, ALL_ENV_IDS)
+    logger.info(f"Selected environments: {env_ids}")
+
+    results = {}
+
+    # Check if the agent exists
+    try:
+        agent_path = hf_hub_download(repo_id=model_id, filename="agent.pt")
+    except EntryNotFoundError:
+        logger.error("Agent not found")
+        return None
+
+    # Check safety
+    security = next(iter(API.list_files_info(model_id, "agent.pt", expand=True))).security
+    if security is None or "safe" not in security:
+        logger.error("Agent safety not available")
+        return None
+    elif not security["safe"]:
+        logger.error("Agent not safe")
+        return None
+
+    # Load the agent
+    try:
+        agent = torch.jit.load(agent_path)
+    except Exception as e:
+        logger.error(f"Error loading agent: {e}")
+        return None
+
+    # Evaluate the agent on the environments
+    for env_id in env_ids:
+        episodic_rewards = []
+        env = gym.make(env_id)
+        for _ in range(10):
+            episodic_reward = 0.0
+            observation, info = env.reset()
+            done = False
+            while not done:
+                torch_observation = torch.from_numpy(np.array([observation]))
+                action = agent(torch_observation).numpy()[0]
+                observation, reward, terminated, truncated, info = env.step(action)
+                done = terminated or truncated
+                episodic_reward += reward
+
+            episodic_rewards.append(episodic_reward)
+
+        mean_reward = np.mean(episodic_rewards)
+        results[env_id] = {"episodic_return": mean_reward}
+    return results
+
+
+def _backend_routine():
+    # List the reinforcement learning models
+    rl_models = list(API.list_models(filter="reinforcement-learning"))
+    logger.info(f"Found {len(rl_models)} RL models")
+    compatible_models = []
+    for model in rl_models:
+        filenames = [sib.rfilename for sib in model.siblings]
+        if "agent.pt" in filenames:
+            compatible_models.append((model.modelId, model.sha))
+
+    logger.info(f"Found {len(compatible_models)} compatible models")
+
+    # Get the results
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        revision="main",
+        local_dir=RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+
+    evaluated_models = set()
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))
+
+    # Find the models that are not associated with any results
+    pending_models = set(compatible_models) - evaluated_models
+    logger.info(f"Found {len(pending_models)} pending models")
+
+    # Run an evaluation on the models
+    for model_id, sha in pending_models:
+        logger.info(f"Running evaluation on {model_id}")
+        report = {"config": {"model_id": model_id, "model_sha": sha}}
+        evaluations = evaluate(model_id, revision=sha)
+        if evaluations is not None:
+            report["results"] = evaluations
+            report["status"] = "DONE"
+        else:
+            report["status"] = "FAILED"
+
+        # Update the results
+        dumped = json.dumps(report, indent=2)
+        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        with open(output_path, "w") as f:
+            f.write(dumped)
+
+        # Upload the results to the results repo
+        API.upload_file(
+            path_or_fileobj=output_path,
+            path_in_repo=f"{model_id}/results_{sha}.json",
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+        )
+
+
+def backend_routine():
+    try:
+        _backend_routine()
+    except Exception as e:
+        logger.error(f"{e.__class__.__name__}: {str(e)}")
+
+
+def get_leaderboard_df():
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        revision="main",
+        local_dir=RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+
+    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+    data = []
+
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            report = json.load(fp)
+        model_id = report["config"]["model_id"]
+        row = {"Agent": model_id, "Status": report["status"]}
+        if report["status"] == "DONE":
+            results = {env_id: result["episodic_return"] for env_id, result in report["results"].items()}
+            row.update(results)
+        data.append(row)
+
+    # Create DataFrame
+    df = pd.DataFrame(data)
+    # Replace NaN values with empty strings
+    df = df.fillna("")
+    return df
+
+
+TITLE = """
+🚀 Open RL Leaderboard
+"""
+
+INTRODUCTION_TEXT = """
+Welcome to the Open RL Leaderboard! This is a community-driven benchmark for reinforcement learning models.
+"""
+
+ABOUT_TEXT = """
+The Open RL Leaderboard is a community-driven benchmark for reinforcement learning models.
+"""
+
+
+def select_column(column_names, data):
+    column_names = [col for col in column_names if col in data.columns]
+    column_names = ["Agent"] + column_names  # add model name column
+    df = data[column_names]
+
+    def check_row(row):
+        return not (row.drop("Agent") == "").all()
+
+    mask = df.apply(check_row, axis=1)
+    df = df[mask]
+    return df
+
+
+with gr.Blocks(js=dark_mode_gradio_js) as demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+            full_df = get_leaderboard_df()
+            hidden_df = gr.components.Dataframe(full_df, visible=False)  # hidden dataframe
+
+            env_checkboxes = gr.components.CheckboxGroup(
+                label="Environments",
+                choices=ALL_ENV_IDS,
+                value=[ALL_ENV_IDS[0]],
+                interactive=True,
+            )
+            leaderboard = gr.components.Dataframe(select_column([ALL_ENV_IDS[0]], full_df))
+
+            # Events
+            env_checkboxes.change(select_column, [env_checkboxes, hidden_df], leaderboard)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(ABOUT_TEXT)
+
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(func=backend_routine, trigger="interval", seconds=30)
+scheduler.start()
+
+
+if __name__ == "__main__":
+    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860
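
Note on the agent format expected by the new `evaluate` function above: it downloads `agent.pt` from the model repo, loads it with `torch.jit.load`, and calls `agent(batched_observations).numpy()[0]` to get an action. A minimal sketch of an export that would satisfy this contract is shown below; the `Agent` class, its layer sizes (CartPole-v1's 4 observation features and 2 discrete actions), and the file name handling are illustrative assumptions, not part of this commit.

import torch
from torch import nn


class Agent(nn.Module):  # hypothetical example policy, not part of this repo
    def __init__(self):
        super().__init__()
        # 4 observation features -> 2 discrete actions (CartPole-v1 sizes, assumed)
        self.net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        # The backend calls agent(batched_observations) and takes .numpy()[0],
        # so forward returns one action per observation in the batch.
        return self.net(observations.float()).argmax(dim=-1)


torch.jit.script(Agent()).save("agent.pt")  # this is the file the backend downloads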
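
For reference, a results file written by `_backend_routine` and read back by `get_leaderboard_df` has roughly this shape (a sketch inferred from the code above; the model id, sha, and return value are made up):

import json

report = {
    "config": {"model_id": "some-user/some-agent", "model_sha": "0123abcd"},
    "results": {"CartPole-v1": {"episodic_return": 500.0}},
    "status": "DONE",  # or "FAILED", in which case "results" is absent
}
print(json.dumps(report, indent=2))
# stored as <RESULTS_PATH>/<model_id>/results_<sha>.json and uploaded to the results dataset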
main_backend_harness.py DELETED
@@ -1,102 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import (
-    QUEUE_REPO,
-    EVAL_REQUESTS_PATH_BACKEND,
-    RESULTS_REPO,
-    EVAL_RESULTS_PATH_BACKEND,
-    DEVICE,
-    API,
-    LIMIT,
-    TOKEN,
-)
-from src.about import Tasks, NUM_FEWSHOT
-from src.logging import setup_logger
-
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-
-# logging.basicConfig(level=logging.ERROR)
-logger = setup_logger(__name__)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(
-    repo_id=RESULTS_REPO,
-    revision="main",
-    local_dir=EVAL_RESULTS_PATH_BACKEND,
-    repo_type="dataset",
-    max_workers=60,
-    token=TOKEN,
-)
-snapshot_download(
-    repo_id=QUEUE_REPO,
-    revision="main",
-    local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    repo_type="dataset",
-    max_workers=60,
-    token=TOKEN,
-)
-
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
-        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
-    )
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-    )
-
-
-if __name__ == "__main__":
-    run_auto_eval()
scripts/create_request_file.py DELETED
@@ -1,73 +0,0 @@
-import json
-import os
-import pprint
-from datetime import datetime, timezone
-
-import click
-from colorama import Fore
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
-
-
-def main():
-    api = HfApi()
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(
-        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
-    )
-
-    model_name = click.prompt("Enter model name")
-    revision = click.prompt("Enter revision", default="main")
-    status = click.prompt("Enter status", default="FINISHED")
-
-    try:
-        model_info = api.model_info(repo_id=model_name, revision=revision)
-    except Exception as e:
-        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
-        return 1
-
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        license = "?"
-
-    eval_entry = {
-        "model": model_name,
-        "revision": revision,
-        "status": status,
-        "submitted_time": current_time,
-        "likes": model_info.likes,
-        "license": license,
-    }
-
-    user_name = ""
-    model_path = model_name
-    if "/" in model_name:
-        user_name = model_name.split("/")[0]
-        model_path = model_name.split("/")[1]
-
-    pprint.pprint(eval_entry)
-
-    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
-        click.echo("continuing...")
-
-        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
-        os.makedirs(out_dir, exist_ok=True)
-        out_path = f"{out_dir}/{model_path}_eval_request.json"
-
-        with open(out_path, "w") as f:
-            f.write(json.dumps(eval_entry))
-
-        api.upload_file(
-            path_or_fileobj=out_path,
-            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            commit_message=f"Add {model_name} to eval queue",
-        )
-    else:
-        click.echo("aborting...")
-
-
-if __name__ == "__main__":
-    main()
scripts/fix_harness_import.py DELETED
@@ -1,11 +0,0 @@
-"""This file should be used after pip install -r requirements.
-It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
-It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
-"""
-import os
-
-import lm_eval
-
-if __name__ == "__main__":
-    lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py DELETED
@@ -1,27 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Change for your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task0 = Task("PongNoFrameskip-v4", "episodic_return", "PongNoFrameskip-v4")
-    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
-    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
-
-
-NUM_FEWSHOT = 0  # Change with your few shot
-
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-# ---------------------------------------------------
-
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-# custom|myothertask|0|0
src/backend/manage_requests.py DELETED
@@ -1,107 +0,0 @@
-import glob
-import json
-from dataclasses import dataclass
-from typing import Optional
-
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-
-@dataclass
-class EvalRequest:
-    model: str
-    status: str
-    json_filepath: str
-    revision: str = "main"  # commit
-    submitted_time: Optional[
-        str
-    ] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    likes: Optional[int] = 0
-    license: Optional[str] = ""
-
-
-def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
-    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
-    json_filepath = eval_request.json_filepath
-
-    with open(json_filepath) as fp:
-        data = json.load(fp)
-
-    data["status"] = set_to_status
-
-    with open(json_filepath, "w") as f:
-        f.write(json.dumps(data))
-
-    api.upload_file(
-        path_or_fileobj=json_filepath,
-        path_in_repo=json_filepath.replace(local_dir, ""),
-        repo_id=hf_repo,
-        repo_type="dataset",
-    )
-
-
-def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
-    """Get all pending evaluation requests and return a list in which private
-    models appearing first, followed by public models sorted by the number of
-    likes.
-
-    Returns:
-        `list[EvalRequest]`: a list of model info dicts.
-    """
-    snapshot_download(
-        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
-    )
-    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
-
-    eval_requests = []
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        if data["status"] in job_status:
-            data["json_filepath"] = json_filepath
-            eval_request = EvalRequest(**data)
-            eval_requests.append(eval_request)
-
-    return eval_requests
-
-
-def check_completed_evals(
-    api: HfApi,
-    hf_repo: str,
-    local_dir: str,
-    checked_status: str,
-    completed_status: str,
-    failed_status: str,
-    hf_repo_results: str,
-    local_dir_results: str,
-):
-    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(
-        repo_id=hf_repo_results,
-        revision="main",
-        local_dir=local_dir_results,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-
-    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
-
-    for eval_request in running_evals:
-        model = eval_request.model
-        logger.info("====================================")
-        logger.info(f"Checking {model}")
-
-        output_path = model
-        output_file = f"{local_dir_results}/{output_path}/results*.json"
-        output_file_exists = len(glob.glob(output_file)) > 0
-
-        if output_file_exists:
-            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
-            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
-        else:
-            logger.info(f"No result file found for {model} setting it to {failed_status}")
-            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite_harness.py DELETED
@@ -1,91 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-import fnmatch
-import torch
-from torch import nn
-from huggingface_hub.utils._errors import EntryNotFoundError
-
-import gymnasium as gym
-
-
-import numpy as np
-from typing import List
-from huggingface_hub import hf_hub_download
-from src.backend.manage_requests import EvalRequest
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-
-def pattern_match(patterns, source_list):
-    if isinstance(patterns, str):
-        patterns = [patterns]
-
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
-def run_evaluation(eval_request: EvalRequest, task_names, local_dir: str, results_repo: str):
-    tags = API.model_info(eval_request.model).tags
-    task_names = pattern_match(tags, task_names)
-
-    logger.info(f"Selected Tasks: {task_names}")
-
-    results = {
-        "config": {
-            "model_name": eval_request.model,
-            "model_sha": eval_request.revision,
-        },
-        "results": {},
-    }
-    try:
-        agent_path = hf_hub_download(repo_id=eval_request.model, filename="agent.pt")
-    except EntryNotFoundError:
-        logger.error("Agent not found")
-        return
-    agent = torch.jit.load(agent_path)
-
-    episodic_rewards = []
-    for task_name in task_names:
-        env = gym.make(task_name)
-        for _ in range(10):
-            episodic_reward = 0.0
-            observation, info = env.reset()
-            done = False
-            while not done:
-                torch_observation = torch.from_numpy(np.array([observation]))
-                action = agent(torch_observation).numpy()[0]
-                observation, reward, terminated, truncated, info = env.step(action)
-                done = terminated or truncated
-                episodic_reward += reward
-
-            episodic_rewards.append(episodic_reward)
-
-        mean_reward = np.mean(episodic_rewards)
-        results[task_name] = {"episodic_return": mean_reward}
-
-    dumped = json.dumps(results, indent=2)
-    logger.info(dumped)
-
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-
-    return results
src/backend/sort_queue.py DELETED
@@ -1,23 +0,0 @@
-import re
-from dataclasses import dataclass
-
-from huggingface_hub import HfApi
-
-from src.backend.manage_requests import EvalRequest
-
-
-@dataclass
-class ModelMetadata:
-    likes: int = 0
-
-
-def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    return sort_by_submit_date(models)
-
-
-def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
-
-
-def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/{display/css_html_js.py → css_html_js.py} RENAMED
File without changes
src/display/log_visualizer.py DELETED
@@ -1,40 +0,0 @@
-from io import StringIO
-from pathlib import Path
-
-from bs4 import BeautifulSoup
-from rich.console import Console
-from rich.syntax import Syntax
-
-from src.display.css_html_js import style_content
-from src.envs import NUM_LINES_VISUALIZE
-from src.logging import log_file
-
-
-def log_file_to_html_string(reverse=True):
-    with open(log_file, "rt") as f:
-        lines = f.readlines()
-    lines = lines[-NUM_LINES_VISUALIZE:]
-
-    if reverse:
-        lines = reversed(lines)
-
-    output = "".join(lines)
-    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
-
-    console = Console(record=True, width=150, style="#272822", file=StringIO())
-    console.print(syntax)
-    html_content = console.export_html(inline_styles=True)
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, "lxml")
-
-    # Modify the <pre> tag and add custom styles
-    pre_tag = soup.pre
-    pre_tag["class"] = "scrollable"
-    del pre_tag["style"]
-
-    # Add your custom styles and the .scrollable CSS to the <style> tag
-    style_tag = soup.style
-    style_tag.append(style_content)
-
-    return soup.prettify()
src/envs.py CHANGED
@@ -8,8 +8,8 @@ TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
 OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
 
-# For harness evaluations
-DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+# For evaluations
+DEVICE = "cpu"  # "cuda:0" if you add compute, for evaluations
 LIMIT = 20  # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
@@ -19,17 +19,13 @@ VENDOR = "aws"
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/backend"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 REFRESH_RATE = 1 * 60  # 1 min
 NUM_LINES_VISUALIZE = 300
src/logging.py CHANGED
@@ -1,4 +1,3 @@
-import sys
 from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
src/populate.py DELETED
@@ -1,56 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]