Spaces:
Paused
Paused
Clémentine
committed on
Commit
·
df66f6e
1
Parent(s):
bb17be3
refacto style + rate limit
Browse files- app.py +30 -22
- scripts/create_request_file.py +4 -3
- src/display/formatting.py +1 -0
- src/display/utils.py +2 -1
- src/envs.py +2 -0
- src/leaderboard/read_evals.py +9 -7
- src/populate.py +2 -2
- src/submission/check_validity.py +9 -4
- src/submission/submit.py +8 -8
- src/tools/collections.py +3 -3
- src/tools/plots.py +5 -3
app.py
CHANGED
@@ -6,18 +6,6 @@ import pandas as pd
|
|
6 |
from apscheduler.schedulers.background import BackgroundScheduler
|
7 |
from huggingface_hub import snapshot_download
|
8 |
|
9 |
-
from src.display.utils import (
|
10 |
-
COLS,
|
11 |
-
TYPES,
|
12 |
-
BENCHMARK_COLS,
|
13 |
-
EVAL_COLS,
|
14 |
-
EVAL_TYPES,
|
15 |
-
AutoEvalColumn,
|
16 |
-
ModelType,
|
17 |
-
NUMERIC_INTERVALS,
|
18 |
-
fields,
|
19 |
-
)
|
20 |
-
from src.display.css_html_js import custom_css, get_window_url_params
|
21 |
from src.display.about import (
|
22 |
CITATION_BUTTON_LABEL,
|
23 |
CITATION_BUTTON_TEXT,
|
@@ -26,17 +14,29 @@ from src.display.about import (
|
|
26 |
LLM_BENCHMARKS_TEXT,
|
27 |
TITLE,
|
28 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
from src.tools.plots import (
|
|
|
30 |
create_metric_plot_obj,
|
31 |
-
create_scores_df,
|
32 |
create_plot_df,
|
|
|
33 |
join_model_info_with_results,
|
34 |
-
HUMAN_BASELINES,
|
35 |
)
|
36 |
-
from src.tools.collections import update_collections
|
37 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
38 |
-
from src.envs import H4_TOKEN, QUEUE_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, API, REPO_ID, IS_PUBLIC
|
39 |
-
from src.submission.submit import add_new_eval
|
40 |
|
41 |
|
42 |
def restart_space():
|
@@ -61,9 +61,9 @@ original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
|
|
61 |
update_collections(original_df.copy())
|
62 |
leaderboard_df = original_df.copy()
|
63 |
|
64 |
-
#models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
|
65 |
# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
|
66 |
-
#to_be_dumped = f"models = {repr(models)}\n"
|
67 |
|
68 |
(
|
69 |
finished_eval_queue_df,
|
@@ -173,8 +173,16 @@ with demo:
|
|
173 |
)
|
174 |
with gr.Row():
|
175 |
shown_columns = gr.CheckboxGroup(
|
176 |
-
choices=[
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
label="Select columns to show",
|
179 |
elem_id="column-select",
|
180 |
interactive=True,
|
|
|
6 |
from apscheduler.schedulers.background import BackgroundScheduler
|
7 |
from huggingface_hub import snapshot_download
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
from src.display.about import (
|
10 |
CITATION_BUTTON_LABEL,
|
11 |
CITATION_BUTTON_TEXT,
|
|
|
14 |
LLM_BENCHMARKS_TEXT,
|
15 |
TITLE,
|
16 |
)
|
17 |
+
from src.display.css_html_js import custom_css, get_window_url_params
|
18 |
+
from src.display.utils import (
|
19 |
+
BENCHMARK_COLS,
|
20 |
+
COLS,
|
21 |
+
EVAL_COLS,
|
22 |
+
EVAL_TYPES,
|
23 |
+
NUMERIC_INTERVALS,
|
24 |
+
TYPES,
|
25 |
+
AutoEvalColumn,
|
26 |
+
ModelType,
|
27 |
+
fields,
|
28 |
+
)
|
29 |
+
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
30 |
+
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
31 |
+
from src.submission.submit import add_new_eval
|
32 |
+
from src.tools.collections import update_collections
|
33 |
from src.tools.plots import (
|
34 |
+
HUMAN_BASELINES,
|
35 |
create_metric_plot_obj,
|
|
|
36 |
create_plot_df,
|
37 |
+
create_scores_df,
|
38 |
join_model_info_with_results,
|
|
|
39 |
)
|
|
|
|
|
|
|
|
|
40 |
|
41 |
|
42 |
def restart_space():
|
|
|
61 |
update_collections(original_df.copy())
|
62 |
leaderboard_df = original_df.copy()
|
63 |
|
64 |
+
# models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
|
65 |
# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
|
66 |
+
# to_be_dumped = f"models = {repr(models)}\n"
|
67 |
|
68 |
(
|
69 |
finished_eval_queue_df,
|
|
|
173 |
)
|
174 |
with gr.Row():
|
175 |
shown_columns = gr.CheckboxGroup(
|
176 |
+
choices=[
|
177 |
+
c.name
|
178 |
+
for c in fields(AutoEvalColumn)
|
179 |
+
if not c.hidden and not c.never_hidden and not c.dummy
|
180 |
+
],
|
181 |
+
value=[
|
182 |
+
c.name
|
183 |
+
for c in fields(AutoEvalColumn)
|
184 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
185 |
+
],
|
186 |
label="Select columns to show",
|
187 |
elem_id="column-select",
|
188 |
interactive=True,
|
scripts/create_request_file.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
-
from datetime import datetime, timezone
|
2 |
import json
|
3 |
import os
|
|
|
4 |
import re
|
|
|
|
|
5 |
import click
|
6 |
-
from huggingface_hub import HfApi, snapshot_download
|
7 |
from colorama import Fore
|
8 |
-
import
|
9 |
|
10 |
EVAL_REQUESTS_PATH = "eval-queue"
|
11 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import pprint
|
4 |
import re
|
5 |
+
from datetime import datetime, timezone
|
6 |
+
|
7 |
import click
|
|
|
8 |
from colorama import Fore
|
9 |
+
from huggingface_hub import HfApi, snapshot_download
|
10 |
|
11 |
EVAL_REQUESTS_PATH = "eval-queue"
|
12 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
src/display/formatting.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import os
|
|
|
2 |
from huggingface_hub import HfApi
|
3 |
|
4 |
API = HfApi()
|
|
|
1 |
import os
|
2 |
+
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
API = HfApi()
|
src/display/utils.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
from dataclasses import dataclass
|
2 |
-
import pandas as pd
|
3 |
from enum import Enum
|
4 |
|
|
|
|
|
5 |
|
6 |
# These classes are for user facing column names,
|
7 |
# to avoid having to change them all around the code
|
|
|
1 |
from dataclasses import dataclass
|
|
|
2 |
from enum import Enum
|
3 |
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
|
7 |
# These classes are for user facing column names,
|
8 |
# to avoid having to change them all around the code
|
src/envs.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import os
|
|
|
2 |
from huggingface_hub import HfApi
|
3 |
|
4 |
# clone / pull the lmeh eval data
|
@@ -24,5 +25,6 @@ PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c796
|
|
24 |
# Rate limit variables
|
25 |
RATE_LIMIT_PERIOD = 7
|
26 |
RATE_LIMIT_QUOTA = 5
|
|
|
27 |
|
28 |
API = HfApi(token=H4_TOKEN)
|
|
|
1 |
import os
|
2 |
+
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
# clone / pull the lmeh eval data
|
|
|
25 |
# Rate limit variables
|
26 |
RATE_LIMIT_PERIOD = 7
|
27 |
RATE_LIMIT_QUOTA = 5
|
28 |
+
HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
|
29 |
|
30 |
API = HfApi(token=H4_TOKEN)
|
src/leaderboard/read_evals.py
CHANGED
@@ -1,15 +1,15 @@
|
|
|
|
1 |
import json
|
2 |
-
import os
|
3 |
import math
|
4 |
-
import
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Dict, List, Tuple
|
7 |
|
8 |
import dateutil
|
9 |
import numpy as np
|
10 |
|
11 |
-
from src.display.utils import AutoEvalColumn, ModelType, Tasks
|
12 |
from src.display.formatting import make_clickable_model
|
|
|
13 |
from src.submission.check_validity import is_model_on_hub
|
14 |
|
15 |
|
@@ -56,7 +56,9 @@ class EvalResult:
|
|
56 |
model = org_and_model[1]
|
57 |
result_key = f"{org}_{model}_{precision}"
|
58 |
|
59 |
-
still_on_hub = is_model_on_hub(
|
|
|
|
|
60 |
|
61 |
# Extract results available in this file (some results are split in several files)
|
62 |
results = {}
|
@@ -73,8 +75,8 @@ class EvalResult:
|
|
73 |
continue
|
74 |
|
75 |
# Some truthfulQA values are NaNs
|
76 |
-
if task.benchmark == "truthfulqa:mc" and
|
77 |
-
if math.isnan(float(data["results"][
|
78 |
results[task.benchmark] = 0.0
|
79 |
continue
|
80 |
|
@@ -191,7 +193,7 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
|
|
191 |
for v in eval_results.values():
|
192 |
try:
|
193 |
results.append(v.to_dict())
|
194 |
-
except KeyError:
|
195 |
continue
|
196 |
|
197 |
return results
|
|
|
1 |
+
import glob
|
2 |
import json
|
|
|
3 |
import math
|
4 |
+
import os
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Dict, List, Tuple
|
7 |
|
8 |
import dateutil
|
9 |
import numpy as np
|
10 |
|
|
|
11 |
from src.display.formatting import make_clickable_model
|
12 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks
|
13 |
from src.submission.check_validity import is_model_on_hub
|
14 |
|
15 |
|
|
|
56 |
model = org_and_model[1]
|
57 |
result_key = f"{org}_{model}_{precision}"
|
58 |
|
59 |
+
still_on_hub = is_model_on_hub(
|
60 |
+
"/".join(org_and_model), config.get("model_sha", "main"), trust_remote_code=True
|
61 |
+
)[0]
|
62 |
|
63 |
# Extract results available in this file (some results are split in several files)
|
64 |
results = {}
|
|
|
75 |
continue
|
76 |
|
77 |
# Some truthfulQA values are NaNs
|
78 |
+
if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
|
79 |
+
if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
|
80 |
results[task.benchmark] = 0.0
|
81 |
continue
|
82 |
|
|
|
193 |
for v in eval_results.values():
|
194 |
try:
|
195 |
results.append(v.to_dict())
|
196 |
+
except KeyError: # not all eval values present
|
197 |
continue
|
198 |
|
199 |
return results
|
src/populate.py
CHANGED
@@ -3,10 +3,10 @@ import os
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
|
|
|
|
6 |
from src.leaderboard.filter_models import filter_models
|
7 |
from src.leaderboard.read_evals import get_eval_results
|
8 |
-
from src.display.formatting import make_clickable_model, has_no_nan_values
|
9 |
-
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
|
10 |
|
11 |
|
12 |
def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
+
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
+
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
|
8 |
from src.leaderboard.filter_models import filter_models
|
9 |
from src.leaderboard.read_evals import get_eval_results
|
|
|
|
|
10 |
|
11 |
|
12 |
def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
src/submission/check_validity.py
CHANGED
@@ -1,13 +1,15 @@
|
|
1 |
-
import huggingface_hub
|
2 |
-
import os
|
3 |
import json
|
|
|
4 |
import re
|
5 |
from collections import defaultdict
|
6 |
-
from
|
|
|
|
|
7 |
from huggingface_hub import ModelCard
|
|
|
8 |
from transformers import AutoConfig
|
9 |
|
10 |
-
from
|
11 |
|
12 |
|
13 |
# ht to @Wauplin, thank you for the snippet!
|
@@ -76,6 +78,9 @@ def user_submission_permission(submission_name, users_to_submission_dates, rate_
|
|
76 |
submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
|
77 |
|
78 |
num_models_submitted_in_period = len(submissions_after_timelimit)
|
|
|
|
|
|
|
79 |
if num_models_submitted_in_period > rate_limit_quota:
|
80 |
error_msg = f"Organisation or user `{org_or_user}`"
|
81 |
error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
|
|
|
|
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
import re
|
4 |
from collections import defaultdict
|
5 |
+
from datetime import datetime, timedelta, timezone
|
6 |
+
|
7 |
+
import huggingface_hub
|
8 |
from huggingface_hub import ModelCard
|
9 |
+
from huggingface_hub.hf_api import ModelInfo
|
10 |
from transformers import AutoConfig
|
11 |
|
12 |
+
from src.envs import HAS_HIGHER_RATE_LIMIT
|
13 |
|
14 |
|
15 |
# ht to @Wauplin, thank you for the snippet!
|
|
|
78 |
submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
|
79 |
|
80 |
num_models_submitted_in_period = len(submissions_after_timelimit)
|
81 |
+
if org_or_user in HAS_HIGHER_RATE_LIMIT:
|
82 |
+
rate_limit_quota = 2 * rate_limit_quota
|
83 |
+
|
84 |
if num_models_submitted_in_period > rate_limit_quota:
|
85 |
error_msg = f"Organisation or user `{org_or_user}`"
|
86 |
error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
|
src/submission/submit.py
CHANGED
@@ -1,17 +1,17 @@
|
|
1 |
-
import
|
2 |
-
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
-
from src.display.formatting import styled_error,
|
|
|
6 |
from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
|
7 |
from src.submission.check_validity import (
|
8 |
-
user_submission_permission,
|
9 |
-
is_model_on_hub,
|
10 |
-
get_model_size,
|
11 |
-
check_model_card,
|
12 |
already_submitted_models,
|
|
|
|
|
|
|
|
|
13 |
)
|
14 |
-
from src.envs import RATE_LIMIT_QUOTA, RATE_LIMIT_PERIOD, H4_TOKEN, EVAL_REQUESTS_PATH, API, QUEUE_REPO
|
15 |
|
16 |
requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
|
17 |
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
+
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
+
from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
|
7 |
from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
|
8 |
from src.submission.check_validity import (
|
|
|
|
|
|
|
|
|
9 |
already_submitted_models,
|
10 |
+
check_model_card,
|
11 |
+
get_model_size,
|
12 |
+
is_model_on_hub,
|
13 |
+
user_submission_permission,
|
14 |
)
|
|
|
15 |
|
16 |
requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
|
17 |
|
src/tools/collections.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
import os
|
|
|
2 |
import pandas as pd
|
3 |
-
from
|
4 |
-
from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
|
5 |
from huggingface_hub.utils._errors import HfHubHTTPError
|
|
|
6 |
|
7 |
from src.display.utils import AutoEvalColumn, ModelType
|
8 |
-
|
9 |
from src.envs import H4_TOKEN, PATH_TO_COLLECTION
|
10 |
|
11 |
# Specific intervals for the collections
|
|
|
1 |
import os
|
2 |
+
|
3 |
import pandas as pd
|
4 |
+
from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
|
|
|
5 |
from huggingface_hub.utils._errors import HfHubHTTPError
|
6 |
+
from pandas import DataFrame
|
7 |
|
8 |
from src.display.utils import AutoEvalColumn, ModelType
|
|
|
9 |
from src.envs import H4_TOKEN, PATH_TO_COLLECTION
|
10 |
|
11 |
# Specific intervals for the collections
|
src/tools/plots.py
CHANGED
@@ -1,9 +1,11 @@
|
|
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import plotly.express as px
|
3 |
from plotly.graph_objs import Figure
|
4 |
-
|
5 |
-
from datetime import datetime, timezone
|
6 |
-
from typing import List, Dict, Tuple, Any
|
7 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
8 |
|
9 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
|
|
1 |
+
import pickle
|
2 |
+
from datetime import datetime, timezone
|
3 |
+
from typing import Any, Dict, List, Tuple
|
4 |
+
|
5 |
import pandas as pd
|
6 |
import plotly.express as px
|
7 |
from plotly.graph_objs import Figure
|
8 |
+
|
|
|
|
|
9 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
10 |
|
11 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|