Nathan Habib committed
Commit 0f4fbd6 • 2 Parent(s): 624b3c8 6e56e0d

Merge branch 'main' of https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard

app.py CHANGED
@@ -1,6 +1,5 @@
1
  import json
2
  import os
3
- import re
4
  from datetime import datetime, timezone
5
 
6
  import gradio as gr
@@ -17,9 +16,17 @@ from src.assets.text_content import (
17
  LLM_BENCHMARKS_TEXT,
18
  TITLE,
19
  )
20
- from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
21
- from src.display_models.modelcard_filter import check_model_card
22
- from src.display_models.utils import (
23
  AutoEvalColumn,
24
  EvalQueueColumn,
25
  fields,
@@ -27,8 +34,9 @@ from src.display_models.utils import (
27
  styled_message,
28
  styled_warning,
29
  )
30
- from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub
31
- from src.rate_limiting import user_submission_permission
 
32
 
33
  pd.set_option("display.precision", 1)
34
 
@@ -88,9 +96,11 @@ snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="
88
  requested_models, users_to_submission_dates = get_all_requested_models(EVAL_REQUESTS_PATH)
89
 
90
  original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 
91
  leaderboard_df = original_df.copy()
92
 
93
  models = original_df["model_name_for_query"].tolist() # needed for model backlinks to the leaderboard
 
94
  to_be_dumped = f"models = {repr(models)}\n"
95
 
96
  (
@@ -117,14 +127,8 @@ def add_new_eval(
117
  return styled_error("Please select a model type.")
118
 
119
  # Is the user rate limited?
120
- num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
121
- if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
122
- error_msg = f"Organisation or user `{model.split('/')[0]}`"
123
- error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
124
- error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
125
- error_msg += (
126
- "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
127
- )
128
  return styled_error(error_msg)
129
 
130
  # Did the model authors forbid its submission to the leaderboard?
@@ -145,28 +149,19 @@ def add_new_eval(
145
  if not model_on_hub:
146
  return styled_error(f'Model "{model}" {error}')
147
 
148
- model_info = api.model_info(repo_id=model, revision=revision)
149
-
150
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
151
  try:
152
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
153
- except AttributeError:
154
- try:
155
- size_match = re.search(size_pattern, model.lower())
156
- model_size = size_match.group(0)
157
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
158
- except AttributeError:
159
- return 65
160
-
161
- size_factor = 8 if (precision == "GPTQ" or "GPTQ" in model) else 1
162
- model_size = size_factor * model_size
163
 
 
 
 
164
  try:
165
  license = model_info.cardData["license"]
166
  except Exception:
167
- license = "?"
168
 
169
- # Were the model card and license filled?
170
  modelcard_OK, error_msg = check_model_card(model)
171
  if not modelcard_OK:
172
  return styled_error(error_msg)
@@ -269,13 +264,13 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
269
 
270
  NUMERIC_INTERVALS = {
271
  "?": pd.Interval(-1, 0, closed="right"),
272
- "0~1.5": pd.Interval(0, 1.5, closed="right"),
273
- "1.5~3": pd.Interval(1.5, 3, closed="right"),
274
- "3~7": pd.Interval(3, 7, closed="right"),
275
- "7~13": pd.Interval(7, 13, closed="right"),
276
- "13~35": pd.Interval(13, 35, closed="right"),
277
- "35~60": pd.Interval(35, 60, closed="right"),
278
- "60+": pd.Interval(60, 10000, closed="right"),
279
  }
280
 
281
 
@@ -513,6 +508,25 @@ with demo:
513
  leaderboard_table,
514
  queue=True,
515
  )
516
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
517
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
518
 
 
1
  import json
2
  import os
 
3
  from datetime import datetime, timezone
4
 
5
  import gradio as gr
 
16
  LLM_BENCHMARKS_TEXT,
17
  TITLE,
18
  )
19
+ from src.plots.plot_results import (
20
+ create_metric_plot_obj,
21
+ create_scores_df,
22
+ create_plot_df,
23
+ join_model_info_with_results,
24
+ HUMAN_BASELINES,
25
+ )
26
+ from src.get_model_info.apply_metadata_to_df import DO_NOT_SUBMIT_MODELS, ModelType
27
+ from src.get_model_info.get_metadata_from_hub import get_model_size
28
+ from src.filters import check_model_card
29
+ from src.get_model_info.utils import (
30
  AutoEvalColumn,
31
  EvalQueueColumn,
32
  fields,
 
34
  styled_message,
35
  styled_warning,
36
  )
37
+ from src.manage_collections import update_collections
38
+ from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df
39
+ from src.filters import is_model_on_hub, user_submission_permission
40
 
41
  pd.set_option("display.precision", 1)
42
 
 
96
  requested_models, users_to_submission_dates = get_all_requested_models(EVAL_REQUESTS_PATH)
97
 
98
  original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
99
+ update_collections(original_df.copy())
100
  leaderboard_df = original_df.copy()
101
 
102
  models = original_df["model_name_for_query"].tolist() # needed for model backlinks to the leaderboard
103
+ plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
104
  to_be_dumped = f"models = {repr(models)}\n"
105
 
106
  (
 
127
  return styled_error("Please select a model type.")
128
 
129
  # Is the user rate limited?
130
+ user_can_submit, error_msg = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA)
131
+ if not user_can_submit:
132
  return styled_error(error_msg)
133
 
134
  # Did the model authors forbid its submission to the leaderboard?
 
149
  if not model_on_hub:
150
  return styled_error(f'Model "{model}" {error}')
151
 
 
 
 
152
  try:
153
+ model_info = api.model_info(repo_id=model, revision=revision)
154
+ except Exception:
155
+ return styled_error("Could not get your model information. Please fill it up properly.")
156
 
157
+ model_size = get_model_size(model_info=model_info, precision=precision)
158
+
159
+ # Were the model card and license filled?
160
  try:
161
  license = model_info.cardData["license"]
162
  except Exception:
163
+ return styled_error("Please select a license for your model")
164
 
 
165
  modelcard_OK, error_msg = check_model_card(model)
166
  if not modelcard_OK:
167
  return styled_error(error_msg)
 
264
 
265
  NUMERIC_INTERVALS = {
266
  "?": pd.Interval(-1, 0, closed="right"),
267
+ "~1.5": pd.Interval(0, 2, closed="right"),
268
+ "~3": pd.Interval(2, 4, closed="right"),
269
+ "~7": pd.Interval(4, 9, closed="right"),
270
+ "~13": pd.Interval(9, 20, closed="right"),
271
+ "~35": pd.Interval(20, 45, closed="right"),
272
+ "~60": pd.Interval(45, 70, closed="right"),
273
+ "70+": pd.Interval(70, 10000, closed="right"),
274
  }
275
 
276
 
 
508
  leaderboard_table,
509
  queue=True,
510
  )
511
+
512
+ with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
513
+ with gr.Row():
514
+ with gr.Column():
515
+ chart = create_metric_plot_obj(
516
+ plot_df,
517
+ ["Average ⬆️"],
518
+ HUMAN_BASELINES,
519
+ title="Average of Top Scores and Human Baseline Over Time",
520
+ )
521
+ gr.Plot(value=chart, interactive=False, width=500, height=500)
522
+ with gr.Column():
523
+ chart = create_metric_plot_obj(
524
+ plot_df,
525
+ ["ARC", "HellaSwag", "MMLU", "TruthfulQA"],
526
+ HUMAN_BASELINES,
527
+ title="Top Scores and Human Baseline Over Time",
528
+ )
529
+ gr.Plot(value=chart, interactive=False, width=500, height=500)
530
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
531
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
532
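Note: the reworked NUMERIC_INTERVALS above use wider, right-closed buckets keyed by approximate size labels, and unknown sizes (get_model_size now reports them as 0) land in the "?" bucket. A minimal sketch of how such buckets resolve a parameter count; the size_label helper is hypothetical and not part of this commit:

```python
import pandas as pd

# Same buckets as the updated app.py; labels are approximate sizes in billions of parameters.
NUMERIC_INTERVALS = {
    "?": pd.Interval(-1, 0, closed="right"),
    "~1.5": pd.Interval(0, 2, closed="right"),
    "~3": pd.Interval(2, 4, closed="right"),
    "~7": pd.Interval(4, 9, closed="right"),
    "~13": pd.Interval(9, 20, closed="right"),
    "~35": pd.Interval(20, 45, closed="right"),
    "~60": pd.Interval(45, 70, closed="right"),
    "70+": pd.Interval(70, 10000, closed="right"),
}

def size_label(params_in_billions: float) -> str:
    # Return the first bucket whose interval contains the value; 0 (unknown) lands in "?".
    for label, interval in NUMERIC_INTERVALS.items():
        if params_in_billions in interval:
            return label
    return "?"

print(size_label(0))    # ?   (unknown size, reported as 0 by get_model_size)
print(size_label(6.7))  # ~7
print(size_label(34))   # ~35
```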
 
requirements.txt CHANGED
@@ -19,13 +19,13 @@ ffmpy==0.3.0
19
  filelock==3.11.0
20
  fonttools==4.39.3
21
  frozenlist==1.3.3
22
- fsspec==2023.4.0
23
  gradio==3.43.2
24
  gradio-client==0.5.0
25
  h11==0.14.0
26
  httpcore==0.17.0
27
  httpx==0.24.0
28
- huggingface-hub==0.16.4
29
  idna==3.4
30
  Jinja2==3.1.2
31
  jsonschema==4.17.3
@@ -60,7 +60,7 @@ sniffio==1.3.0
60
  starlette==0.26.1
61
  toolz==0.12.0
62
  tqdm==4.65.0
63
- transformers==4.34.0
64
  typing_extensions==4.5.0
65
  tzdata==2023.3
66
  tzlocal==4.3
 
19
  filelock==3.11.0
20
  fonttools==4.39.3
21
  frozenlist==1.3.3
22
+ fsspec==2023.5.0
23
  gradio==3.43.2
24
  gradio-client==0.5.0
25
  h11==0.14.0
26
  httpcore==0.17.0
27
  httpx==0.24.0
28
+ huggingface-hub==0.18.0
29
  idna==3.4
30
  Jinja2==3.1.2
31
  jsonschema==4.17.3
 
60
  starlette==0.26.1
61
  toolz==0.12.0
62
  tqdm==4.65.0
63
+ transformers
64
  typing_extensions==4.5.0
65
  tzdata==2023.3
66
  tzlocal==4.3
src/assets/hardcoded_evals.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
 
3
  gpt4_values = {
4
  AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
 
1
+ from src.get_model_info.utils import AutoEvalColumn, model_hyperlink
2
 
3
  gpt4_values = {
4
  AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
src/assets/text_content.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.display_models.model_metadata_type import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
@@ -14,13 +14,14 @@ LLM_BENCHMARKS_TEXT = f"""
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
  ## Icons
17
- {ModelType.PT.to_str(" : ")} model
18
- {ModelType.FT.to_str(" : ")} model
19
- {ModelType.IFT.to_str(" : ")} model
20
- {ModelType.RL.to_str(" : ")} model
 
21
  If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
22
 
23
- πŸ΄β€β˜ οΈ indicates that this model has been flagged by the community, and should probably be ignored! Clicking the icon will redirect you to the discussion about the model.
24
  (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
25
 
26
  ## How it works
 
1
+ from src.get_model_info.hardocded_metadata.types import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
 
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
  ## Icons
17
+ {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpus
18
+ {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
19
+ Specific fine-tune subcategories (more adapted to chat):
20
+ {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are models fine-tuned specifically on datasets of task instructions
21
+ {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
22
  If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
23
 
24
+ "Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
25
  (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
26
 
27
  ## How it works
src/{display_models/modelcard_filter.py → filters.py} RENAMED
@@ -1,5 +1,8 @@
1
  import huggingface_hub
2
  from huggingface_hub import ModelCard
 
 
 
3
 
4
 
5
  # ht to @Wauplin, thank you for the snippet!
@@ -24,3 +27,40 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
24
  return False, "Please add a description to your model card, it is too short."
25
 
26
  return True, ""
1
  import huggingface_hub
2
  from huggingface_hub import ModelCard
3
+ from transformers import AutoConfig
4
+
5
+ from datetime import datetime, timedelta, timezone
6
 
7
 
8
  # ht to @Wauplin, thank you for the snippet!
 
27
  return False, "Please add a description to your model card, it is too short."
28
 
29
  return True, ""
30
+
31
+
32
+ def is_model_on_hub(model_name: str, revision: str) -> bool:
33
+ try:
34
+ AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
35
+ return True, None
36
+
37
+ except ValueError:
38
+ return (
39
+ False,
40
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
41
+ )
42
+
43
+ except Exception:
44
+ return False, "was not found on hub!"
45
+
46
+
47
+ def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period, rate_limit_quota):
48
+ org_or_user, _ = submission_name.split("/")
49
+ if org_or_user not in users_to_submission_dates:
50
+ return True, ""
51
+ submission_dates = sorted(users_to_submission_dates[org_or_user])
52
+
53
+ time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
54
+ submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
55
+
56
+ num_models_submitted_in_period = len(submissions_after_timelimit)
57
+ if num_models_submitted_in_period > rate_limit_quota:
58
+ error_msg = f"Organisation or user `{org_or_user}` "
59
+ error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
60
+ error_msg += f"in the last {rate_limit_period} days.\n"
61
+ error_msg += (
62
+ "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
63
+ )
64
+ return False, error_msg
65
+ return True, ""
66
+
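Note: user_submission_permission now returns an (allowed, error message) pair instead of a raw submission count, and is_model_on_hub lives here rather than in src/load_from_hub.py. A hedged sketch of how a caller such as add_new_eval in app.py consumes them; the repo id, dates, and limits below are placeholders:

```python
from src.filters import is_model_on_hub, user_submission_permission

# Illustrative submission history: one earlier request from this org.
users_to_submission_dates = {"example-org": ["2023-10-20T10:00:00Z"]}

user_can_submit, error_msg = user_submission_permission(
    "example-org/example-model",  # placeholder repo id
    users_to_submission_dates,
    rate_limit_period=7,          # app.py passes RATE_LIMIT_PERIOD (days)
    rate_limit_quota=5,           # app.py passes RATE_LIMIT_QUOTA
)
if not user_can_submit:
    print(error_msg)  # the rate-limit message is now built inside the helper

model_on_hub, error = is_model_on_hub("example-org/example-model", revision="main")
if not model_on_hub:
    print(f'Model "example-org/example-model" {error}')
```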
src/{display_models/get_model_metadata.py → get_model_info/apply_metadata_to_df.py} RENAMED
@@ -6,9 +6,9 @@ from typing import List
6
  from huggingface_hub import HfApi
7
  from tqdm import tqdm
8
 
9
- from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
10
- from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
11
- from src.display_models.utils import AutoEvalColumn, model_hyperlink
12
 
13
  api = HfApi(token=os.environ.get("H4_TOKEN", None))
14
 
@@ -45,10 +45,7 @@ def get_model_metadata(leaderboard_data: List[dict]):
45
  model_data[AutoEvalColumn.license.name] = request.get("license", "?")
46
  model_data[AutoEvalColumn.likes.name] = request.get("likes", 0)
47
  model_data[AutoEvalColumn.params.name] = request.get("params", 0)
48
- except Exception as e:
49
- print(f"Could not find request file for {model_data['model_name_for_query']}: {e}")
50
- print(f"{request_file=}")
51
- print(f"{request_files=}")
52
  if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
53
  model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
54
  model_data["model_name_for_query"]
 
6
  from huggingface_hub import HfApi
7
  from tqdm import tqdm
8
 
9
+ from src.get_model_info.hardocded_metadata.flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
10
+ from src.get_model_info.hardocded_metadata.types import MODEL_TYPE_METADATA, ModelType, model_type_from_str
11
+ from src.get_model_info.utils import AutoEvalColumn, model_hyperlink
12
 
13
  api = HfApi(token=os.environ.get("H4_TOKEN", None))
14
 
 
45
  model_data[AutoEvalColumn.license.name] = request.get("license", "?")
46
  model_data[AutoEvalColumn.likes.name] = request.get("likes", 0)
47
  model_data[AutoEvalColumn.params.name] = request.get("params", 0)
48
+ except Exception:
49
  if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
50
  model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
51
  model_data["model_name_for_query"]
src/get_model_info/get_metadata_from_hub.py ADDED
@@ -0,0 +1,19 @@
1
+ import re
2
+ from huggingface_hub.hf_api import ModelInfo
3
+
4
+
5
+ def get_model_size(model_info: ModelInfo, precision: str):
6
+ size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
7
+ try:
8
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
9
+ except AttributeError:
10
+ try:
11
+ size_match = re.search(size_pattern, model_info.modelId.lower())
12
+ model_size = size_match.group(0)
13
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
14
+ except AttributeError:
15
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
16
+
17
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
18
+ model_size = size_factor * model_size
19
+ return model_size
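Note: get_model_size first reads the safetensors parameter count, falls back to parsing a size suffix such as "7b" out of the repo id, returns 0 when neither is available, and multiplies the result by 8 for GPTQ checkpoints. A hedged usage sketch; the repo id and precision string are placeholders, not values from this commit:

```python
from huggingface_hub import HfApi

from src.get_model_info.get_metadata_from_hub import get_model_size

api = HfApi()
model_info = api.model_info(repo_id="example-org/example-model-7b")  # placeholder repo id
model_size = get_model_size(model_info=model_info, precision="float16")
# With safetensors metadata: total parameter count / 1e9, rounded to 3 decimals.
# Without it: the "7b" suffix of the repo id is parsed, giving 7.0.
# If neither works: 0, which app.py maps to the "?" size bucket.
print(model_size)
```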
src/{display_models/model_metadata_flags.py → get_model_info/hardocded_metadata/flags.py} RENAMED
File without changes
src/{display_models/model_metadata_type.py → get_model_info/hardocded_metadata/types.py} RENAMED
File without changes
src/{display_models → get_model_info}/utils.py RENAMED
File without changes
src/load_from_hub.py CHANGED
@@ -3,12 +3,11 @@ import os
3
  from collections import defaultdict
4
 
5
  import pandas as pd
6
- from transformers import AutoConfig
7
 
8
  from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
9
- from src.display_models.get_model_metadata import apply_metadata
10
- from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
11
- from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
12
 
13
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
14
 
@@ -90,17 +89,3 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
90
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
91
  return df_finished[cols], df_running[cols], df_pending[cols]
92
 
93
-
94
- def is_model_on_hub(model_name: str, revision: str) -> bool:
95
- try:
96
- AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
97
- return True, None
98
-
99
- except ValueError:
100
- return (
101
- False,
102
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
103
- )
104
-
105
- except Exception:
106
- return False, "was not found on hub!"
 
3
  from collections import defaultdict
4
 
5
  import pandas as pd
 
6
 
7
  from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
8
+ from src.get_model_info.apply_metadata_to_df import apply_metadata
9
+ from src.plots.read_results import get_eval_results_dicts, make_clickable_model
10
+ from src.get_model_info.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
11
 
12
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
13
 
 
89
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
90
  return df_finished[cols], df_running[cols], df_pending[cols]
91
src/manage_collections.py ADDED
@@ -0,0 +1,75 @@
1
+ import os
2
+ import pandas as pd
3
+ from pandas import DataFrame
4
+ from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
5
+ from huggingface_hub.utils._errors import HfHubHTTPError
6
+
7
+ from src.get_model_info.hardocded_metadata.types import ModelType
8
+ from src.get_model_info.utils import AutoEvalColumn
9
+
10
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
11
+
12
+ path_to_collection = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
13
+ intervals = {
14
+ "1B": pd.Interval(0, 1.5, closed="right"),
15
+ "3B": pd.Interval(2.5, 3.5, closed="neither"),
16
+ "7B": pd.Interval(6, 8, closed="neither"),
17
+ "13B": pd.Interval(10, 14, closed="neither"),
18
+ "30B": pd.Interval(25, 35, closed="neither"),
19
+ "65B": pd.Interval(60, 70, closed="neither"),
20
+ }
21
+
22
+ def update_collections(df: DataFrame):
23
+ """This function updates the Open LLM Leaderboard model collection with the latest best models for
24
+ each size category and type.
25
+ """
26
+ collection = get_collection(collection_slug=path_to_collection, token=H4_TOKEN)
27
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
28
+
29
+ cur_best_models = []
30
+
31
+ ix = 0
32
+ for type in ModelType:
33
+ if type.value.name == "": continue
34
+ for size in intervals:
35
+ # We filter the df to gather the relevant models
36
+ type_emoji = [t[0] for t in type.value.symbol]
37
+ filtered_df = df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
38
+
39
+ numeric_interval = pd.IntervalIndex([intervals[size]])
40
+ mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
41
+ filtered_df = filtered_df.loc[mask]
42
+
43
+ best_models = list(filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name])
44
+ print(type.value.symbol, size, best_models[:10])
45
+
46
+ # We add them one by one to the leaderboard
47
+ for model in best_models:
48
+ ix += 1
49
+ cur_len_collection = len(collection.items)
50
+ try:
51
+ collection = add_collection_item(
52
+ path_to_collection,
53
+ item_id=model,
54
+ item_type="model",
55
+ exists_ok=True,
56
+ note=f"Best {type.to_str(' ')} model of around {size} on the leaderboard today!",
57
+ token=H4_TOKEN
58
+ )
59
+ if len(collection.items) > cur_len_collection: # we added an item - we make sure its position is correct
60
+ item_object_id = collection.items[-1].item_object_id
61
+ update_collection_item(collection_slug=path_to_collection, item_object_id=item_object_id, position=ix)
62
+ cur_len_collection = len(collection.items)
63
+ cur_best_models.append(model)
64
+ break
65
+ except HfHubHTTPError:
66
+ continue
67
+
68
+ collection = get_collection(path_to_collection, token=H4_TOKEN)
69
+ for item in collection.items:
70
+ if item.item_id not in cur_best_models:
71
+ try:
72
+ delete_collection_item(collection_slug=path_to_collection, item_object_id=item.item_object_id, token=H4_TOKEN)
73
+ except HfHubHTTPError:
74
+ continue
75
+
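Note: for every (model type, size bucket) pair, update_collections keeps only the rows whose parameter count falls inside the bucket, then adds the top-ranked survivor to the collection. A minimal sketch of just that IntervalIndex membership test, with illustrative numbers:

```python
import pandas as pd

# One of the buckets defined above: the open 6B-8B interval used for "7B" models.
numeric_interval = pd.IntervalIndex([pd.Interval(6, 8, closed="neither")])

params_column = pd.Series([1.3, 6.74, 13.0])  # illustrative parameter counts, in billions
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
print(mask.tolist())  # [False, True, False] -> only the ~7B model survives the filter
```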
src/plots/plot_results.py ADDED
@@ -0,0 +1,223 @@
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ from plotly.graph_objs import Figure
4
+ import pickle
5
+ from datetime import datetime, timezone
6
+ from typing import List, Dict, Tuple, Any
7
+ from src.get_model_info.hardocded_metadata.flags import FLAGGED_MODELS
8
+
9
+ # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
10
+ # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
11
+ # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
12
+ # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
13
+ # TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
14
+ # Define the human baselines
15
+ HUMAN_BASELINES = {
16
+ "Average ⬆️": 0.897 * 100,
17
+ "ARC": 0.80 * 100,
18
+ "HellaSwag": 0.95 * 100,
19
+ "MMLU": 0.898 * 100,
20
+ "TruthfulQA": 0.94 * 100,
21
+ }
22
+
23
+
24
+ def to_datetime(model_info: Tuple[str, Any]) -> datetime:
25
+ """
26
+ Converts the lastModified attribute of the object to datetime.
27
+
28
+ :param model_info: A tuple containing the name and object.
29
+ The object must have a lastModified attribute
30
+ with a string representing the date and time.
31
+ :return: A datetime object converted from the lastModified attribute of the input object.
32
+ """
33
+ name, obj = model_info
34
+ return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
35
+
36
+
37
+ def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
38
+ """
39
+ Integrates model information with the results DataFrame by matching 'Model sha'.
40
+ :param results_df: A DataFrame containing results information including 'Model sha' column.
41
+ :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
42
+ """
43
+ # copy dataframe to avoid modifying the original
44
+ df = results_df.copy(deep=True)
45
+
46
+ # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
47
+ df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
48
+
49
+ # load cache from disk
50
+ try:
51
+ with open("model_info_cache.pkl", "rb") as f:
52
+ model_info_cache = pickle.load(f)
53
+ except (EOFError, FileNotFoundError):
54
+ model_info_cache = {}
55
+
56
+ # Sort date strings using datetime objects as keys
57
+ sorted_dates = sorted(list(model_info_cache.items()), key=to_datetime, reverse=True)
58
+ df["Results Date"] = datetime.now().replace(tzinfo=timezone.utc)
59
+
60
+ # Define the date format string
61
+ date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
62
+
63
+ # Iterate over sorted_dates and update the dataframe
64
+ for name, obj in sorted_dates:
65
+ # Convert the lastModified string to a datetime object
66
+ last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
67
+
68
+ # Update the "Results Date" column where "Model sha" equals obj.sha
69
+ df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
70
+ return df
71
+
72
+
73
+ def create_scores_df(results_df: pd.DataFrame) -> pd.DataFrame:
74
+ """
75
+ Generates a DataFrame containing the maximum scores until each result date.
76
+
77
+ :param results_df: A DataFrame containing result information including metric scores and result dates.
78
+ :return: A new DataFrame containing the maximum scores until each result date for every metric.
79
+ """
80
+ # Step 1: Ensure 'Results Date' is in datetime format and sort the DataFrame by it
81
+ results_df["Results Date"] = pd.to_datetime(results_df["Results Date"])
82
+ results_df.sort_values(by="Results Date", inplace=True)
83
+
84
+ # Step 2: Initialize the scores dictionary
85
+ scores = {
86
+ "Average ⬆️": [],
87
+ "ARC": [],
88
+ "HellaSwag": [],
89
+ "MMLU": [],
90
+ "TruthfulQA": [],
91
+ "Result Date": [],
92
+ "Model Name": [],
93
+ }
94
+
95
+ # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
96
+ for i, row in results_df.iterrows():
97
+ date = row["Results Date"]
98
+ for column in scores.keys():
99
+ if column == "Result Date":
100
+ if not scores[column] or scores[column][-1] <= date:
101
+ scores[column].append(date)
102
+ continue
103
+ if column == "Model Name":
104
+ scores[column].append(row["model_name_for_query"])
105
+ continue
106
+ current_max = scores[column][-1] if scores[column] else float("-inf")
107
+ scores[column].append(max(current_max, row[column]))
108
+
109
+ # Step 4: Convert the dictionary to a DataFrame
110
+ return pd.DataFrame(scores)
111
+
112
+
113
+ def create_plot_df(scores_df: pd.DataFrame) -> pd.DataFrame:
114
+ """
115
+ Transforms the scores DataFrame into a new format suitable for plotting.
116
+
117
+ :param scores_df: A DataFrame containing metric scores and result dates.
118
+ :return: A new DataFrame reshaped for plotting purposes.
119
+ """
120
+ # Sample columns
121
+ cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
122
+
123
+ # Initialize the list to store DataFrames
124
+ dfs = []
125
+
126
+ # Iterate over the cols and create a new DataFrame for each column
127
+ for col in cols:
128
+ d = scores_df[[col, "Model Name", "Result Date"]].copy().reset_index(drop=True)
129
+ d["Metric Name"] = col
130
+ d.rename(columns={col: "Metric Value"}, inplace=True)
131
+ dfs.append(d)
132
+
133
+ # Concatenate all the created DataFrames
134
+ concat_df = pd.concat(dfs, ignore_index=True)
135
+
136
+ # Sort values by 'Result Date'
137
+ concat_df.sort_values(by="Result Date", inplace=True)
138
+ concat_df.reset_index(drop=True, inplace=True)
139
+
140
+ # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
141
+ concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
142
+
143
+ concat_df.reset_index(drop=True, inplace=True)
144
+ return concat_df
145
+
146
+
147
+ def create_metric_plot_obj(
148
+ df: pd.DataFrame, metrics: List[str], human_baselines: Dict[str, float], title: str
149
+ ) -> Figure:
150
+ """
151
+ Create a Plotly figure object with lines representing different metrics
152
+ and horizontal dotted lines representing human baselines.
153
+
154
+ :param df: The DataFrame containing the metric values, names, and dates.
155
+ :param metrics: A list of strings representing the names of the metrics
156
+ to be included in the plot.
157
+ :param human_baselines: A dictionary where keys are metric names
158
+ and values are human baseline values for the metrics.
159
+ :param title: A string representing the title of the plot.
160
+ :return: A Plotly figure object with lines representing metrics and
161
+ horizontal dotted lines representing human baselines.
162
+ """
163
+
164
+ # Filter the DataFrame based on the specified metrics
165
+ df = df[df["Metric Name"].isin(metrics)]
166
+
167
+ # Filter the human baselines based on the specified metrics
168
+ filtered_human_baselines = {k: v for k, v in human_baselines.items() if k in metrics}
169
+
170
+ # Create a line figure using plotly express with specified markers and custom data
171
+ fig = px.line(
172
+ df,
173
+ x="Result Date",
174
+ y="Metric Value",
175
+ color="Metric Name",
176
+ markers=True,
177
+ custom_data=["Metric Name", "Metric Value", "Model Name"],
178
+ title=title,
179
+ )
180
+
181
+ # Update hovertemplate for better hover interaction experience
182
+ fig.update_traces(
183
+ hovertemplate="<br>".join(
184
+ [
185
+ "Model Name: %{customdata[2]}",
186
+ "Metric Name: %{customdata[0]}",
187
+ "Date: %{x}",
188
+ "Metric Value: %{y}",
189
+ ]
190
+ )
191
+ )
192
+
193
+ # Update the range of the y-axis
194
+ fig.update_layout(yaxis_range=[0, 100])
195
+
196
+ # Create a dictionary to hold the color mapping for each metric
197
+ metric_color_mapping = {}
198
+
199
+ # Map each metric name to its color in the figure
200
+ for trace in fig.data:
201
+ metric_color_mapping[trace.name] = trace.line.color
202
+
203
+ # Iterate over filtered human baselines and add horizontal lines to the figure
204
+ for metric, value in filtered_human_baselines.items():
205
+ color = metric_color_mapping.get(metric, "blue") # Retrieve color from mapping; default to blue if not found
206
+ location = "top left" if metric == "HellaSwag" else "bottom left" # Set annotation position
207
+ # Add horizontal line with matched color and positioned annotation
208
+ fig.add_hline(
209
+ y=value,
210
+ line_dash="dot",
211
+ annotation_text=f"{metric} human baseline",
212
+ annotation_position=location,
213
+ annotation_font_size=10,
214
+ annotation_font_color=color,
215
+ line_color=color,
216
+ )
217
+
218
+ return fig
219
+
220
+
221
+ # Example Usage:
222
+ # human_baselines dictionary is defined.
223
+ # chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")
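Note: app.py wires these helpers together as create_plot_df(create_scores_df(join_model_info_with_results(original_df))) and hands the resulting figures to gr.Plot. A hedged end-to-end sketch, assuming original_df is the leaderboard DataFrame produced earlier by get_leaderboard_df:

```python
import gradio as gr

from src.plots.plot_results import (
    HUMAN_BASELINES,
    create_metric_plot_obj,
    create_plot_df,
    create_scores_df,
    join_model_info_with_results,
)

# original_df: leaderboard DataFrame from get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))

chart = create_metric_plot_obj(
    plot_df,
    ["ARC", "HellaSwag", "MMLU", "TruthfulQA"],
    HUMAN_BASELINES,
    title="Top Scores and Human Baseline Over Time",
)
gr.Plot(value=chart, interactive=False, width=500, height=500)  # as in the new tab in app.py
```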
src/{display_models → plots}/read_results.py RENAMED
@@ -6,7 +6,7 @@ from typing import Dict, List, Tuple
6
  import dateutil
7
  import numpy as np
8
 
9
- from src.display_models.utils import AutoEvalColumn, make_clickable_model
10
 
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
  BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
@@ -31,7 +31,7 @@ class EvalResult:
31
  date: str = ""
32
 
33
  def to_dict(self):
34
- from src.load_from_hub import is_model_on_hub
35
 
36
  if self.org is not None:
37
  base_model = f"{self.org}/{self.model}"
 
6
  import dateutil
7
  import numpy as np
8
 
9
+ from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
10
 
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
  BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 
31
  date: str = ""
32
 
33
  def to_dict(self):
34
+ from src.filters import is_model_on_hub
35
 
36
  if self.org is not None:
37
  base_model = f"{self.org}/{self.model}"
src/rate_limiting.py DELETED
@@ -1,13 +0,0 @@
1
- from datetime import datetime, timedelta, timezone
2
-
3
-
4
- def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period):
5
- org_or_user, _ = submission_name.split("/")
6
- if org_or_user not in users_to_submission_dates:
7
- return 0
8
- submission_dates = sorted(users_to_submission_dates[org_or_user])
9
-
10
- time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
11
- submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
12
-
13
- return len(submissions_after_timelimit)