Clémentine committed on
Commit
ed1fdef
1 Parent(s): 6fefae4

Added 'forbidden models' submission, allowing orgs to request that their models not be submitted in case of contamination

Browse files
app.py CHANGED
@@ -10,7 +10,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
10
  from huggingface_hub import HfApi
11
  from transformers import AutoConfig
12
 
13
- from src.auto_leaderboard.get_model_metadata import apply_metadata
14
  from src.assets.text_content import *
15
  from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
  from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
@@ -227,9 +227,13 @@ def add_new_eval(
227
  os.makedirs(OUT_DIR, exist_ok=True)
228
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
229
 
 
 
 
 
230
  # Check for duplicate submission
231
  if out_path.split("eval-queue/")[1].lower() in requested_models:
232
- return styled_warning("This model has been already submitted.")
233
 
234
  with open(out_path, "w") as f:
235
  f.write(json.dumps(eval_entry))
 
10
  from huggingface_hub import HfApi
11
  from transformers import AutoConfig
12
 
13
+ from src.auto_leaderboard.get_model_metadata import apply_metadata, DO_NOT_SUBMIT_MODELS
14
  from src.assets.text_content import *
15
  from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
  from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 
227
  os.makedirs(OUT_DIR, exist_ok=True)
228
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
229
 
230
+ # Check if the model has been forbidden:
231
+ if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
232
+ return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
233
+
234
  # Check for duplicate submission
235
  if out_path.split("eval-queue/")[1].lower() in requested_models:
236
+ return styled_warning("This model has been already submitted.")
237
 
238
  with open(out_path, "w") as f:
239
  f.write(json.dumps(eval_entry))
src/auto_leaderboard/get_model_metadata.py CHANGED
@@ -8,7 +8,7 @@ from tqdm import tqdm
8
 
9
  from src.utils_display import AutoEvalColumn, model_hyperlink
10
  from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
11
- from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS
12
 
13
  from huggingface_hub import HfApi
14
  import huggingface_hub
@@ -106,7 +106,18 @@ def flag_models(leaderboard_data:List[dict]):
106
  issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
107
  model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
108
 
 
 
 
 
 
 
 
 
 
 
109
  def apply_metadata(leaderboard_data: List[dict]):
 
110
  get_model_type(leaderboard_data)
111
  get_model_infos_from_hub(leaderboard_data)
112
  flag_models(leaderboard_data)
 
8
 
9
  from src.utils_display import AutoEvalColumn, model_hyperlink
10
  from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
11
+ from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS, DO_NOT_SUBMIT_MODELS
12
 
13
  from huggingface_hub import HfApi
14
  import huggingface_hub
 
106
  issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
107
  model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
108
 
109
+ def remove_forbidden_models(leaderboard_data: List[dict]):
110
+ indices_to_remove = []
111
+ for ix, model in enumerate(leaderboard_data):
112
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
113
+ indices_to_remove.append(ix)
114
+
115
+ for ix in reversed(indices_to_remove):
116
+ leaderboard_data.pop(ix)
117
+ return leaderboard_data
118
+
119
  def apply_metadata(leaderboard_data: List[dict]):
120
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
121
  get_model_type(leaderboard_data)
122
  get_model_infos_from_hub(leaderboard_data)
123
  flag_models(leaderboard_data)
src/auto_leaderboard/model_metadata_flags.py CHANGED
@@ -1,6 +1,12 @@
1
- # Model name to forum discussion id
 
2
  FLAGGED_MODELS = {
3
  "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
4
  "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
5
  "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
6
- }
 
 
 
 
 
 
1
+ # Models which have been flagged by users as being problematic for one reason or another
2
+ # (Model name to forum discussion link)
3
  FLAGGED_MODELS = {
4
  "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
5
  "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
6
  "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
7
+ }
8
+
9
+ # Models which orgs have requested not be submitted to the leaderboard
10
+ DO_NOT_SUBMIT_MODELS = [
11
+ "Voicelab/trurl-2-13b", # trained on MMLU
12
+ ]