alozowski committed
Commit b7d036c
Parent: b202e95

dummy column refactoring (#688)


- collection update only happens on full initialization (d131b6cd68b918bb71c70060116d55dbd1d7e4be)
- removed dummy column (ecacc0f636e01ce3fa984e61cab8f4b0f5670af6)
- enhanced naming of dummy column (bab5ced191e1edf26b96473d5d42f51b0bd19784)
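
The first bullet is the behavioral change: expensive work (dataset downloads and the Hub collection refresh) now runs only when the Space does a full initialization, while the leaderboard and evaluation-queue DataFrames are always rebuilt. A minimal sketch of that gating pattern, with placeholder helpers standing in for the real functions in app.py:

import os

def download_data():
    # Stand-in for the download_dataset calls; only worth doing on a full init
    print("downloading datasets")

def build_leaderboard() -> list[dict]:
    # Stand-in for get_leaderboard_df; always needed
    return [{"fullname": "org/model", "Average": 42.0}]

def update_collections(rows: list[dict]):
    # Stand-in for the Hub collection refresh; only on a full init
    print("refreshing collections")

def init_space(full_init: bool = True):
    if full_init:
        download_data()
    leaderboard = build_leaderboard()
    if full_init:
        update_collections(leaderboard)
    return leaderboard

# Mirrors the LEADERBOARD_FULL_INIT check in app.py: defaults to a full init
do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
init_space(full_init=do_full_init)
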

app.py CHANGED
@@ -82,10 +82,12 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
 def init_space(full_init: bool = True):
     """Initializes the application space, loading only necessary data."""
     if full_init:
+        # These downloads only occur on full initialization
         download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
         download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
         download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
 
+    # Always retrieve the leaderboard DataFrame
     raw_data, original_df = get_leaderboard_df(
         results_path=EVAL_RESULTS_PATH,
         requests_path=EVAL_REQUESTS_PATH,
@@ -93,14 +95,18 @@ def init_space(full_init: bool = True):
         cols=COLS,
         benchmark_cols=BENCHMARK_COLS,
     )
-    update_collections(original_df)
+
+    if full_init:
+        # Collection update only happens on full initialization
+        update_collections(original_df)
+
     leaderboard_df = original_df.copy()
 
+    # Evaluation queue DataFrame retrieval is independent of initialization detail level
     eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
     return leaderboard_df, raw_data, original_df, eval_queue_dfs
 
-
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
 # This controls whether a full initialization should be performed.
 do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
@@ -148,23 +154,17 @@ def load_query(request: gr.Request): # triggered only once at startup => read q
 
 
 def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False, na=False))]
-
+    return df[(df[AutoEvalColumn.fullname.name].str.contains(query, case=False, na=False))]
 
 def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
 
-
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-    dummy_col = [AutoEvalColumn.dummy.name]
-    # AutoEvalColumn.model_type_symbol.name,
-    # AutoEvalColumn.model.name,
-    # We use COLS to maintain sorting
+    dummy_col = [AutoEvalColumn.fullname.name]
     filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
     return filtered_df
 
-
 def filter_queries(query: str, df: pd.DataFrame):
     tmp_result_df = []
 
@@ -323,14 +323,13 @@ with demo:
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                     + shown_columns.value
-                    + [AutoEvalColumn.dummy.name]
+                    + [AutoEvalColumn.fullname.name]
                 ],
                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                # column_widths=["2%", "33%"]
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
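
The renamed hidden column is what the search box matches against. A small usage sketch of that filter with pandas, using a toy frame and a literal "fullname" key in place of AutoEvalColumn.fullname.name:

import pandas as pd

# Toy leaderboard slice; "fullname" holds the raw repo id, hidden in the UI
df = pd.DataFrame({
    "Model": ["llama (link)", "mistral (link)"],
    "fullname": ["meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1"],
})

def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Same pattern as app.py: case-insensitive substring match on the hidden column
    return df[df["fullname"].str.contains(query, case=False, na=False)]

print(search_model(df, "mistral"))  # keeps only the Mistral row
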
src/display/css_html_js.py CHANGED
@@ -1,4 +1,5 @@
 custom_css = """
+
 /* Hides the final AutoEvalColumn */
 #llm-benchmark-tab-table table td:last-child,
 #llm-benchmark-tab-table table th:last-child {
@@ -44,7 +45,7 @@ table th:first-child {
   background: none;
   border: none;
 }
-
+
 #search-bar {
   padding: 0px;
 }
@@ -94,4 +95,4 @@ get_window_url_params = """
     url_params = Object.fromEntries(params);
     return url_params;
     }
-    """
+    """
src/display/utils.py CHANGED
@@ -71,12 +71,13 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
-auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
+
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
@@ -99,7 +100,7 @@ baseline_row = {
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
-    AutoEvalColumn.dummy.name: "baseline",
+    AutoEvalColumn.fullname.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
 }
@@ -124,7 +125,7 @@ human_baseline_row = {
     AutoEvalColumn.truthfulqa.name: 94.0,
     AutoEvalColumn.winogrande.name: 94.0,
     AutoEvalColumn.gsm8k.name: 100,
-    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.fullname.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
 }
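
For reference, AutoEvalColumn is generated at import time with make_dataclass, so the new attribute becomes available as AutoEvalColumn.fullname and its display name as AutoEvalColumn.fullname.name. A self-contained sketch of that pattern, with a trimmed, frozen ColumnContent and only two columns rather than the repo's full list:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # trimmed stand-in for the repo's ColumnContent
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

# Trimmed auto_eval_column_dict: (attribute name, annotation, default instance)
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.fullname.name)  # -> "fullname", the hidden search column
print(AutoEvalColumn.model.name)     # -> "Model"
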
src/leaderboard/filter_models.py CHANGED
@@ -130,14 +130,17 @@ DO_NOT_SUBMIT_MODELS = [
 
 
 def flag_models(leaderboard_data: list[dict]):
+    """Flags models based on external criteria or flagged status."""
     for model_data in leaderboard_data:
         # Merges and moes are flagged automatically
         if model_data[AutoEvalColumn.flagged.name]:
             flag_key = "merged"
         else:
-            flag_key = model_data["model_name_for_query"]
+            flag_key = model_data[AutoEvalColumn.fullname.name]
 
+        print(f"model check: {flag_key}")
         if flag_key in FLAGGED_MODELS:
+            print(f"Flagged model: {flag_key}")
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
             issue_link = model_hyperlink(
                 FLAGGED_MODELS[flag_key],
@@ -152,11 +155,13 @@ def flag_models(leaderboard_data: list[dict]):
 
 
 def remove_forbidden_models(leaderboard_data: list[dict]):
+    """Removes models from the leaderboard based on the DO_NOT_SUBMIT list."""
     indices_to_remove = []
     for ix, model in enumerate(leaderboard_data):
-        if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
+        if model[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
             indices_to_remove.append(ix)
 
+    # Remove the models from the list
     for ix in reversed(indices_to_remove):
         leaderboard_data.pop(ix)
     return leaderboard_data
@@ -165,3 +170,4 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)
     flag_models(leaderboard_data)
+
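
A toy run of the forbidden-model filter, with plain dicts and made-up ids standing in for the real leaderboard rows, AutoEvalColumn.fullname.name, and DO_NOT_SUBMIT_MODELS:

# Hypothetical stand-ins for the repo's constants and column lookup
DO_NOT_SUBMIT_MODELS = ["org/do-not-submit"]
FULLNAME = "fullname"  # plays the role of AutoEvalColumn.fullname.name

def remove_forbidden_models(leaderboard_data: list[dict]) -> list[dict]:
    # Collect indices first, then pop in reverse so earlier indices stay valid
    indices_to_remove = [
        ix for ix, model in enumerate(leaderboard_data)
        if model[FULLNAME] in DO_NOT_SUBMIT_MODELS
    ]
    for ix in reversed(indices_to_remove):
        leaderboard_data.pop(ix)
    return leaderboard_data

rows = [{"fullname": "org/do-not-submit"}, {"fullname": "org/allowed-model"}]
print(remove_forbidden_models(rows))  # -> [{'fullname': 'org/allowed-model'}]
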
src/leaderboard/read_evals.py CHANGED
@@ -133,7 +133,7 @@ class EvalResult:
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.fullname.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
src/tools/collections.py CHANGED
@@ -60,7 +60,7 @@ def update_collections(df: DataFrame):
         for size, interval in intervals.items():
             filtered_df = _filter_by_type_and_size(df, model_type, interval)
             best_models = list(
-                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name][:10]
+                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
             )
             print(model_type.value.symbol, size, best_models)
             _add_models_to_collection(collection, best_models, model_type, size)
@@ -73,4 +73,4 @@ def update_collections(df: DataFrame):
         try:
             delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
         except HfHubHTTPError:
-            continue
+            continue
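
The collections update reads the best-scoring repo ids from the renamed column; the selection itself is ordinary pandas, roughly as in this sketch with toy data and literal column names in place of AutoEvalColumn.average.name and AutoEvalColumn.fullname.name:

import pandas as pd

# Toy leaderboard slice
df = pd.DataFrame({
    "fullname": ["org/a", "org/b", "org/c"],
    "Average": [61.2, 74.9, 68.3],
})

# Same pattern as update_collections: sort by average score, take up to ten ids
best_models = list(df.sort_values("Average", ascending=False)["fullname"][:10])
print(best_models)  # -> ['org/b', 'org/c', 'org/a']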