clefourrier HF staff commited on
Commit
6b9a0ec
1 Parent(s): 84fc6ef

nathan-flagged-models-vis (#478)

Browse files

- Adds a way to hide flagged models (a69dfa979897081c10a30f1be9937a917d93422b)
- remove unused pprint import (1be35c2d1ffffab552d9d65f826930e4f9f1c273)
- remove unused pprint import (6adc61160db982ce023039472b8842d21584b367)

(cherry picked from commit 460ecf2f9814163d447819d75dd51e4139b4476b)

Files changed (3) hide show
  1. app.py +16 -7
  2. src/display/utils.py +2 -0
  3. src/leaderboard/filter_models.py +18 -2
app.py CHANGED
@@ -33,7 +33,6 @@ from src.display.utils import (
33
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
34
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
35
  from src.submission.submit import add_new_eval
36
- from src.submission.check_validity import already_submitted_models
37
  from src.tools.collections import update_collections
38
  from src.tools.plots import (
39
  create_metric_plot_obj,
@@ -82,14 +81,20 @@ def update_table(
82
  precision_query: str,
83
  size_query: list,
84
  show_deleted: bool,
 
85
  query: str,
86
  ):
87
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
88
  filtered_df = filter_queries(query, filtered_df)
89
  df = select_columns(filtered_df, columns)
90
  return df
91
 
92
 
 
 
 
 
 
93
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
94
  return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
95
 
@@ -127,7 +132,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
127
 
128
 
129
  def filter_models(
130
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
131
  ) -> pd.DataFrame:
132
  # Show all models
133
  if show_deleted:
@@ -135,6 +140,9 @@ def filter_models(
135
  else: # Show only still on the hub models
136
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
137
 
 
 
 
138
  type_emoji = [t[0] for t in type_query]
139
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
140
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
@@ -146,6 +154,7 @@ def filter_models(
146
 
147
  return filtered_df
148
 
 
149
 
150
  import unicodedata
151
 
@@ -175,11 +184,11 @@ hidden_leaderboard_table_for_search = gr.components.Dataframe(
175
 
176
  def display(x, y):
177
  # Assuming df is your DataFrame
178
- for column in original_df.columns:
179
- if original_df[column].dtype == 'object':
180
- original_df[column] = original_df[column].apply(remove_invalid_unicode)
181
 
182
- subset_df = original_df[COLS]
183
  # Ensure the output directory exists
184
  #output_dir = 'output'
185
  #if not os.path.exists(output_dir):
 
33
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
34
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
35
  from src.submission.submit import add_new_eval
 
36
  from src.tools.collections import update_collections
37
  from src.tools.plots import (
38
  create_metric_plot_obj,
 
81
  precision_query: str,
82
  size_query: list,
83
  show_deleted: bool,
84
+ show_flagged: bool,
85
  query: str,
86
  ):
87
+ filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_flagged)
88
  filtered_df = filter_queries(query, filtered_df)
89
  df = select_columns(filtered_df, columns)
90
  return df
91
 
92
 
93
+ def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
94
+ query = request.query_params.get("query") or ""
95
+ return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
96
+
97
+
98
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
99
  return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
100
 
 
132
 
133
 
134
  def filter_models(
135
+ df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_flagged: bool
136
  ) -> pd.DataFrame:
137
  # Show all models
138
  if show_deleted:
 
140
  else: # Show only still on the hub models
141
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
142
 
143
+ if not show_flagged:
144
+ filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
145
+
146
  type_emoji = [t[0] for t in type_query]
147
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
148
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
154
 
155
  return filtered_df
156
 
157
+ leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False)
158
 
159
  import unicodedata
160
 
 
184
 
185
  def display(x, y):
186
  # Assuming df is your DataFrame
187
+ for column in leaderboard_df.columns:
188
+ if leaderboard_df[column].dtype == 'object':
189
+ leaderboard_df[column] = leaderboard_df[column].apply(remove_invalid_unicode)
190
 
191
+ subset_df = leaderboard_df[COLS]
192
  # Ensure the output directory exists
193
  #output_dir = 'output'
194
  #if not os.path.exists(output_dir):
src/display/utils.py CHANGED
@@ -51,6 +51,7 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
51
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
52
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
53
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
54
  # Dummy column for the search bar (hidden by the custom CSS)
55
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
56
 
@@ -80,6 +81,7 @@ baseline_row = {
80
  AutoEvalColumn.gsm8k.name: 0.21,
81
  AutoEvalColumn.dummy.name: "baseline",
82
  AutoEvalColumn.model_type.name: "",
 
83
  }
84
 
85
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 
51
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
52
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
53
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
54
+ auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
55
  # Dummy column for the search bar (hidden by the custom CSS)
56
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
57
 
 
81
  AutoEvalColumn.gsm8k.name: 0.21,
82
  AutoEvalColumn.dummy.name: "baseline",
83
  AutoEvalColumn.model_type.name: "",
84
+ AutoEvalColumn.flagged.name: False,
85
  }
86
 
87
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
src/leaderboard/filter_models.py CHANGED
@@ -13,13 +13,26 @@ FLAGGED_MODELS = {
13
  "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
14
  "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
15
  "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
16
- "TigerResearch/tigerbot-70b-chat-v4-4k": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/438",
17
- "TigerResearch/tigerbot-70b-chat-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/438"
 
 
 
 
 
 
 
 
 
 
18
  }
19
 
20
  # Models which have been requested by orgs to not be submitted on the leaderboard
21
  DO_NOT_SUBMIT_MODELS = [
22
  "Voicelab/trurl-2-13b", # trained on MMLU
 
 
 
23
  ]
24
 
25
 
@@ -34,6 +47,9 @@ def flag_models(leaderboard_data: list[dict]):
34
  model_data[
35
  AutoEvalColumn.model.name
36
  ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
 
 
 
37
 
38
 
39
  def remove_forbidden_models(leaderboard_data: list[dict]):
 
13
  "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
14
  "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
15
  "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
16
+ "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
17
+ "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
18
+ "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
19
+ "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
20
+ "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
21
+ "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
22
+ "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
23
+ "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
24
+ "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
25
+ "janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
26
+ "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
27
+ "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
28
  }
29
 
30
  # Models which have been requested by orgs to not be submitted on the leaderboard
31
  DO_NOT_SUBMIT_MODELS = [
32
  "Voicelab/trurl-2-13b", # trained on MMLU
33
+ "TigerResearch/tigerbot-70b-chat", # per authors request
34
+ "TigerResearch/tigerbot-70b-chat-v2", # per authors request
35
+ "TigerResearch/tigerbot-70b-chat-v4-4k", # per authors request
36
  ]
37
 
38
 
 
47
  model_data[
48
  AutoEvalColumn.model.name
49
  ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
50
+ model_data[AutoEvalColumn.flagged.name] = True
51
+ else:
52
+ model_data[AutoEvalColumn.flagged.name] = False
53
 
54
 
55
  def remove_forbidden_models(leaderboard_data: list[dict]):