Clémentine committed on
Commit
6e56e0d
·
1 Parent(s): c212cb7

reorg to simplify nav in code base

Browse files
app.py CHANGED
@@ -17,16 +17,17 @@ from src.assets.text_content import (
17
  LLM_BENCHMARKS_TEXT,
18
  TITLE,
19
  )
20
- from src.display_models.plot_results import (
21
  create_metric_plot_obj,
22
  create_scores_df,
23
  create_plot_df,
24
  join_model_info_with_results,
25
  HUMAN_BASELINES,
26
  )
27
- from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
28
- from src.display_models.modelcard_filter import check_model_card
29
- from src.display_models.utils import (
 
30
  AutoEvalColumn,
31
  EvalQueueColumn,
32
  fields,
@@ -35,8 +36,8 @@ from src.display_models.utils import (
35
  styled_warning,
36
  )
37
  from src.manage_collections import update_collections
38
- from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub
39
- from src.rate_limiting import user_submission_permission
40
 
41
  pd.set_option("display.precision", 1)
42
 
@@ -127,14 +128,8 @@ def add_new_eval(
127
  return styled_error("Please select a model type.")
128
 
129
  # Is the user rate limited?
130
- num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
131
- if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
132
- error_msg = f"Organisation or user `{model.split('/')[0]}`"
133
- error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
134
- error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
135
- error_msg += (
136
- "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
137
- )
138
  return styled_error(error_msg)
139
 
140
  # Did the model authors forbid its submission to the leaderboard?
@@ -155,28 +150,19 @@ def add_new_eval(
155
  if not model_on_hub:
156
  return styled_error(f'Model "{model}" {error}')
157
 
158
- model_info = api.model_info(repo_id=model, revision=revision)
159
-
160
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
161
  try:
162
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
163
- except AttributeError:
164
- try:
165
- size_match = re.search(size_pattern, model.lower())
166
- model_size = size_match.group(0)
167
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
168
- except AttributeError:
169
- return 65
170
-
171
- size_factor = 8 if (precision == "GPTQ" or "GPTQ" in model) else 1
172
- model_size = size_factor * model_size
173
 
 
174
  try:
175
  license = model_info.cardData["license"]
176
  except Exception:
177
- license = "?"
178
 
179
- # Were the model card and license filled?
180
  modelcard_OK, error_msg = check_model_card(model)
181
  if not modelcard_OK:
182
  return styled_error(error_msg)
@@ -279,13 +265,13 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
279
 
280
  NUMERIC_INTERVALS = {
281
  "?": pd.Interval(-1, 0, closed="right"),
282
- "0~1.5": pd.Interval(0, 1.5, closed="right"),
283
- "1.5~3": pd.Interval(1.5, 3, closed="right"),
284
- "3~7": pd.Interval(3, 7, closed="right"),
285
- "7~13": pd.Interval(7, 13, closed="right"),
286
- "13~35": pd.Interval(13, 35, closed="right"),
287
- "35~60": pd.Interval(35, 60, closed="right"),
288
- "60+": pd.Interval(60, 10000, closed="right"),
289
  }
290
 
291
 
 
17
  LLM_BENCHMARKS_TEXT,
18
  TITLE,
19
  )
20
+ from src.plots.plot_results import (
21
  create_metric_plot_obj,
22
  create_scores_df,
23
  create_plot_df,
24
  join_model_info_with_results,
25
  HUMAN_BASELINES,
26
  )
27
+ from src.get_model_info.apply_metadata_to_df import DO_NOT_SUBMIT_MODELS, ModelType
28
+ from src.get_model_info.get_metadata_from_hub import get_model_size
29
+ from src.filters import check_model_card
30
+ from src.get_model_info.utils import (
31
  AutoEvalColumn,
32
  EvalQueueColumn,
33
  fields,
 
36
  styled_warning,
37
  )
38
  from src.manage_collections import update_collections
39
+ from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df
40
+ from src.filters import is_model_on_hub, user_submission_permission
41
 
42
  pd.set_option("display.precision", 1)
43
 
 
128
  return styled_error("Please select a model type.")
129
 
130
  # Is the user rate limited?
131
+ user_can_submit, error_msg = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA)
132
+ if not user_can_submit:
 
 
 
 
 
 
133
  return styled_error(error_msg)
134
 
135
  # Did the model authors forbid its submission to the leaderboard?
 
150
  if not model_on_hub:
151
  return styled_error(f'Model "{model}" {error}')
152
 
 
 
 
153
  try:
154
+ model_info = api.model_info(repo_id=model, revision=revision)
155
+ except Exception:
156
+ return styled_error("Could not get your model information. Please fill it up properly.")
157
+
158
+ model_size = get_model_size(model_info=model_info , precision= precision)
 
 
 
 
 
 
159
 
160
+ # Were the model card and license filled?
161
  try:
162
  license = model_info.cardData["license"]
163
  except Exception:
164
+ return styled_error("Please select a license for your model")
165
 
 
166
  modelcard_OK, error_msg = check_model_card(model)
167
  if not modelcard_OK:
168
  return styled_error(error_msg)
 
265
 
266
  NUMERIC_INTERVALS = {
267
  "?": pd.Interval(-1, 0, closed="right"),
268
+ "~1.5": pd.Interval(0, 2, closed="right"),
269
+ "~3": pd.Interval(2, 4, closed="right"),
270
+ "~7": pd.Interval(4, 9, closed="right"),
271
+ "~13": pd.Interval(9, 20, closed="right"),
272
+ "~35": pd.Interval(20, 45, closed="right"),
273
+ "~60": pd.Interval(45, 70, closed="right"),
274
+ "70+": pd.Interval(70, 10000, closed="right"),
275
  }
276
 
277
 
src/assets/hardcoded_evals.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
 
3
  gpt4_values = {
4
  AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
 
1
+ from src.get_model_info.utils import AutoEvalColumn, model_hyperlink
2
 
3
  gpt4_values = {
4
  AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
src/assets/text_content.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.display_models.model_metadata_type import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
@@ -14,13 +14,14 @@ LLM_BENCHMARKS_TEXT = f"""
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
  ## Icons
17
- {ModelType.PT.to_str(" : ")} model
18
- {ModelType.FT.to_str(" : ")} model
19
- {ModelType.IFT.to_str(" : ")} model
20
- {ModelType.RL.to_str(" : ")} model
 
21
  If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
22
 
23
- 🏴‍☠️ indicates that this model has been flagged by the community, and should probably be ignored! Clicking the icon will redirect you to the discussion about the model.
24
  (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
25
 
26
  ## How it works
 
1
+ from src.get_model_info.hardocded_metadata.types import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
 
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
  ## Icons
17
+ {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
18
+ {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
19
+ Specific fine-tune subcategories (more adapted to chat):
20
+ {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
21
+ {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
22
  If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
23
 
24
+ "Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
25
  (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
26
 
27
  ## How it works
src/{display_models/modelcard_filter.py → filters.py} RENAMED
@@ -1,5 +1,8 @@
1
  import huggingface_hub
2
  from huggingface_hub import ModelCard
 
 
 
3
 
4
 
5
  # ht to @Wauplin, thank you for the snippet!
@@ -24,3 +27,40 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
24
  return False, "Please add a description to your model card, it is too short."
25
 
26
  return True, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import huggingface_hub
2
  from huggingface_hub import ModelCard
3
+ from transformers import AutoConfig
4
+
5
+ from datetime import datetime, timedelta, timezone
6
 
7
 
8
  # ht to @Wauplin, thank you for the snippet!
 
27
  return False, "Please add a description to your model card, it is too short."
28
 
29
  return True, ""
30
+
31
+
32
def is_model_on_hub(model_name: str, revision: str) -> "tuple[bool, str | None]":
    """Check that a model's config can be loaded without remote code.

    Args:
        model_name: Repo id of the model to look up.
        revision: Revision of the repo to load the config from.

    Returns:
        ``(True, None)`` when the config loads, otherwise ``(False, reason)``
        where ``reason`` is a sentence fragment that callers append after the
        model name to build the error shown to the user.
    """
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
    except ValueError:
        # NOTE(review): a ValueError is treated as "this repo requires
        # trust_remote_code=True" -- confirm no other ValueError can reach here.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
        )
    except Exception:
        # Any other failure is reported to the user as a missing model.
        return False, "was not found on hub!"

    return True, None
45
+
46
+
47
def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period, rate_limit_quota):
    """Tell whether an organisation or user may submit another model.

    Args:
        submission_name: Repo id of the submitted model, normally
            ``"<org_or_user>/<model>"``.
        users_to_submission_dates: Mapping from an organisation/user name to
            the timestamps of its earlier submissions, as strings.
        rate_limit_period: Length of the rate-limiting window, in days.
        rate_limit_quota: Number of submissions within the window that has to
            be exceeded before a new submission is refused.

    Returns:
        ``(True, "")`` when the submission is allowed, otherwise
        ``(False, error_message)``.
    """
    # Only the namespace matters. partition() never raises, whereas unpacking
    # split("/") failed on names with no slash or with more than one.
    org_or_user = submission_name.partition("/")[0]
    if org_or_user not in users_to_submission_dates:
        return True, ""

    # Timestamps are compared as strings, which is only correct when the stored
    # dates use this same fixed-width UTC format -- TODO confirm with the writer.
    window_start = datetime.now(timezone.utc) - timedelta(days=rate_limit_period)
    time_limit = window_start.strftime("%Y-%m-%dT%H:%M:%SZ")
    num_models_submitted_in_period = sum(
        1 for submitted_at in users_to_submission_dates[org_or_user] if submitted_at > time_limit
    )

    if num_models_submitted_in_period > rate_limit_quota:
        error_msg = (
            f"Organisation or user `{org_or_user}` "
            f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
            f"in the last {rate_limit_period} days.\n"
            "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
        )
        return False, error_msg
    return True, ""
66
+
src/{display_models/get_model_metadata.py → get_model_info/apply_metadata_to_df.py} RENAMED
@@ -6,9 +6,9 @@ from typing import List
6
  from huggingface_hub import HfApi
7
  from tqdm import tqdm
8
 
9
- from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
10
- from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
11
- from src.display_models.utils import AutoEvalColumn, model_hyperlink
12
 
13
  api = HfApi(token=os.environ.get("H4_TOKEN", None))
14
 
@@ -41,7 +41,7 @@ def get_model_metadata(leaderboard_data: List[dict]):
41
  request = json.load(f)
42
  model_type = model_type_from_str(request["model_type"])
43
  model_data[AutoEvalColumn.model_type.name] = model_type.value.name
44
- model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol # + ("🔺" if is_delta else "")
45
  model_data[AutoEvalColumn.license.name] = request["license"]
46
  model_data[AutoEvalColumn.likes.name] = request["likes"]
47
  model_data[AutoEvalColumn.params.name] = request["params"]
 
6
  from huggingface_hub import HfApi
7
  from tqdm import tqdm
8
 
9
+ from src.get_model_info.hardocded_metadata.flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
10
+ from src.get_model_info.hardocded_metadata.types import MODEL_TYPE_METADATA, ModelType, model_type_from_str
11
+ from src.get_model_info.utils import AutoEvalColumn, model_hyperlink
12
 
13
  api = HfApi(token=os.environ.get("H4_TOKEN", None))
14
 
 
41
  request = json.load(f)
42
  model_type = model_type_from_str(request["model_type"])
43
  model_data[AutoEvalColumn.model_type.name] = model_type.value.name
44
+ model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol
45
  model_data[AutoEvalColumn.license.name] = request["license"]
46
  model_data[AutoEvalColumn.likes.name] = request["likes"]
47
  model_data[AutoEvalColumn.params.name] = request["params"]
src/get_model_info/get_metadata_from_hub.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from huggingface_hub.hf_api import ModelInfo
3
+
4
+
5
# A parameter count embedded in a repo name, e.g. "7b", "10.7b" or "350m".
# The lookahead rejects tags such as "4bit" that merely start like a size.
_SIZE_IN_NAME = re.compile(r"\d+(?:\.\d+)?[bm](?![a-z])")


def get_model_size(model_info: "ModelInfo", precision: str):
    """Estimate a model's size in billions of parameters.

    The safetensors parameter count is used when it is available; otherwise
    the size is parsed from the repo name.

    Args:
        model_info: Hub metadata for the model; ``safetensors["total"]`` and
            ``modelId`` are read from it.
        precision: Precision the model was submitted with; ``"GPTQ"`` triggers
            the same x8 scaling as a "gptq" marker in the repo name.

    Returns:
        The size rounded to 3 decimals and multiplied by 8 for GPTQ models,
        or ``0`` when the size cannot be determined.
    """
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # No usable safetensors metadata (attribute missing, None or not
        # subscriptable, or no "total" entry): fall back to the repo name.
        size_match = _SIZE_IN_NAME.search(model_info.modelId.lower())
        if size_match is None:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
        size_text = size_match.group(0)
        amount = float(size_text[:-1])
        # An "m" suffix is in millions; convert it to billions.
        model_size = round(amount if size_text[-1] == "b" else amount / 1e3, 3)

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    return size_factor * model_size
src/{display_models/model_metadata_flags.py → get_model_info/hardocded_metadata/flags.py} RENAMED
File without changes
src/{display_models/model_metadata_type.py → get_model_info/hardocded_metadata/types.py} RENAMED
File without changes
src/{display_models → get_model_info}/utils.py RENAMED
File without changes
src/load_from_hub.py CHANGED
@@ -3,12 +3,11 @@ import os
3
  from collections import defaultdict
4
 
5
  import pandas as pd
6
- from transformers import AutoConfig
7
 
8
  from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
9
- from src.display_models.get_model_metadata import apply_metadata
10
- from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
11
- from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
12
 
13
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
14
 
@@ -90,17 +89,3 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
90
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
91
  return df_finished[cols], df_running[cols], df_pending[cols]
92
 
93
-
94
- def is_model_on_hub(model_name: str, revision: str) -> bool:
95
- try:
96
- AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
97
- return True, None
98
-
99
- except ValueError:
100
- return (
101
- False,
102
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
103
- )
104
-
105
- except Exception:
106
- return False, "was not found on hub!"
 
3
  from collections import defaultdict
4
 
5
  import pandas as pd
 
6
 
7
  from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
8
+ from src.get_model_info.apply_metadata_to_df import apply_metadata
9
+ from src.plots.read_results import get_eval_results_dicts, make_clickable_model
10
+ from src.get_model_info.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
11
 
12
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
13
 
 
89
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
90
  return df_finished[cols], df_running[cols], df_pending[cols]
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/manage_collections.py CHANGED
@@ -5,8 +5,8 @@ from requests.exceptions import HTTPError
5
  from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
6
  from huggingface_hub.utils._errors import HfHubHTTPError
7
 
8
- from src.display_models.model_metadata_type import ModelType
9
- from src.display_models.utils import AutoEvalColumn
10
 
11
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
12
 
 
5
  from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
6
  from huggingface_hub.utils._errors import HfHubHTTPError
7
 
8
+ from src.get_model_info.hardocded_metadata.types import ModelType
9
+ from src.get_model_info.utils import AutoEvalColumn
10
 
11
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
12
 
src/{display_models → plots}/plot_results.py RENAMED
@@ -4,7 +4,7 @@ from plotly.graph_objs import Figure
4
  import pickle
5
  from datetime import datetime, timezone
6
  from typing import List, Dict, Tuple, Any
7
- from src.display_models.model_metadata_flags import FLAGGED_MODELS
8
 
9
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
10
  # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
 
4
  import pickle
5
  from datetime import datetime, timezone
6
  from typing import List, Dict, Tuple, Any
7
+ from src.get_model_info.hardocded_metadata.flags import FLAGGED_MODELS
8
 
9
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
10
  # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
src/{display_models → plots}/read_results.py RENAMED
@@ -6,7 +6,7 @@ from typing import Dict, List, Tuple
6
  import dateutil
7
  import numpy as np
8
 
9
- from src.display_models.utils import AutoEvalColumn, make_clickable_model
10
 
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
  BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
@@ -31,7 +31,7 @@ class EvalResult:
31
  date: str = ""
32
 
33
  def to_dict(self):
34
- from src.load_from_hub import is_model_on_hub
35
 
36
  if self.org is not None:
37
  base_model = f"{self.org}/{self.model}"
 
6
  import dateutil
7
  import numpy as np
8
 
9
+ from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
10
 
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
  BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 
31
  date: str = ""
32
 
33
  def to_dict(self):
34
+ from src.filters import is_model_on_hub
35
 
36
  if self.org is not None:
37
  base_model = f"{self.org}/{self.model}"
src/rate_limiting.py DELETED
@@ -1,13 +0,0 @@
1
- from datetime import datetime, timedelta, timezone
2
-
3
-
4
- def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period):
5
- org_or_user, _ = submission_name.split("/")
6
- if org_or_user not in users_to_submission_dates:
7
- return 0
8
- submission_dates = sorted(users_to_submission_dates[org_or_user])
9
-
10
- time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
11
- submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
12
-
13
- return len(submissions_after_timelimit)