Carol-gutianle committed
Commit 77f128a
1 Parent(s): c8d20dc
.gitignore CHANGED
@@ -5,9 +5,10 @@ __pycache__/
  .ipynb_checkpoints
  *ipynb
  .vscode/
+ .huggingface

- eval-queue/
- eval-results/
- eval-queue-bk/
- eval-results-bk/
+ # eval-queue/
+ # eval-results/
+ # eval-queue-bk/
+ # eval-results-bk/
  logs/
app.py CHANGED
@@ -34,29 +34,13 @@ from src.submission.submit import add_new_eval
  def restart_space():
  API.restart_space(repo_id=REPO_ID)

- try:
- print(EVAL_REQUESTS_PATH)
- snapshot_download(
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
- )
- except Exception:
- restart_space()
- try:
- print(EVAL_RESULTS_PATH)
- snapshot_download(
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
- )
- except Exception:
- restart_space()
-
-
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
  leaderboard_df = original_df.copy()

  (
  finished_eval_queue_df,
- running_eval_queue_df,
- pending_eval_queue_df,
+ # running_eval_queue_df,
+ # pending_eval_queue_df,
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


@@ -64,13 +48,10 @@ leaderboard_df = original_df.copy()
  def update_table(
  hidden_df: pd.DataFrame,
  columns: list,
- type_query: list,
- precision_query: str,
- size_query: list,
  show_deleted: bool,
  query: str,
  ):
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
+ filtered_df = filter_models(hidden_df)
  filtered_df = filter_queries(query, filtered_df)
  df = select_columns(filtered_df, columns)
  return df
@@ -82,7 +63,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:

  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
  always_here_cols = [
- AutoEvalColumn.model_type_symbol.name,
+ # AutoEvalColumn.model_type_symbol.name,
  AutoEvalColumn.model.name,
  ]
  # We use COLS to maintain sorting
@@ -105,30 +86,18 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
  if len(final_df) > 0:
  filtered_df = pd.concat(final_df)
  filtered_df = filtered_df.drop_duplicates(
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+ # subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+ subset=[AutoEvalColumn.model.name]
  )

  return filtered_df


  def filter_models(
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
+ df: pd.DataFrame
  ) -> pd.DataFrame:
  # Show all models
- if show_deleted:
- filtered_df = df
- else: # Show only still on the hub models
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
- type_emoji = [t[0] for t in type_query]
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
- filtered_df = filtered_df.loc[mask]
-
+ filtered_df = df
  return filtered_df


@@ -138,7 +107,7 @@ with demo:
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

  with gr.Tabs(elem_classes="tab-buttons") as tabs:
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+ with gr.TabItem("🏅 MLLMGuard(ASD)", elem_id="llm-benchmark-tab-table", id=0):
  with gr.Row():
  with gr.Column():
  with gr.Row():
@@ -167,29 +136,29 @@
  deleted_models_visibility = gr.Checkbox(
  value=False, label="Show gated/private/deleted models", interactive=True
  )
- with gr.Column(min_width=320):
- #with gr.Box(elem_id="box-filter"):
- filter_columns_type = gr.CheckboxGroup(
- label="Model types",
- choices=[t.to_str() for t in ModelType],
- value=[t.to_str() for t in ModelType],
- interactive=True,
- elem_id="filter-columns-type",
- )
- filter_columns_precision = gr.CheckboxGroup(
- label="Precision",
- choices=[i.value.name for i in Precision],
- value=[i.value.name for i in Precision],
- interactive=True,
- elem_id="filter-columns-precision",
- )
- filter_columns_size = gr.CheckboxGroup(
- label="Model sizes (in billions of parameters)",
- choices=list(NUMERIC_INTERVALS.keys()),
- value=list(NUMERIC_INTERVALS.keys()),
- interactive=True,
- elem_id="filter-columns-size",
- )
+ # with gr.Column(min_width=320):
+ # #with gr.Box(elem_id="box-filter"):
+ # filter_columns_type = gr.CheckboxGroup(
+ # label="Model types",
+ # choices=[t.to_str() for t in ModelType],
+ # value=[t.to_str() for t in ModelType],
+ # interactive=True,
+ # elem_id="filter-columns-type",
+ # )
+ # filter_columns_precision = gr.CheckboxGroup(
+ # label="Precision",
+ # choices=[i.value.name for i in Precision],
+ # value=[i.value.name for i in Precision],
+ # interactive=True,
+ # elem_id="filter-columns-precision",
+ # )
+ # filter_columns_size = gr.CheckboxGroup(
+ # label="Model sizes (in billions of parameters)",
+ # choices=list(NUMERIC_INTERVALS.keys()),
+ # value=list(NUMERIC_INTERVALS.keys()),
+ # interactive=True,
+ # elem_id="filter-columns-size",
+ # )

  leaderboard_table = gr.components.Dataframe(
  value=leaderboard_df[
@@ -215,23 +184,17 @@
  [
  hidden_leaderboard_table_for_search,
  shown_columns,
- filter_columns_type,
- filter_columns_precision,
- filter_columns_size,
  deleted_models_visibility,
  search_bar,
  ],
  leaderboard_table,
  )
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+ for selector in [shown_columns, deleted_models_visibility]:
  selector.change(
  update_table,
  [
  hidden_leaderboard_table_for_search,
  shown_columns,
- filter_columns_type,
- filter_columns_precision,
- filter_columns_size,
  deleted_models_visibility,
  search_bar,
  ],
@@ -239,95 +202,95 @@ with demo:
  queue=True,
  )

- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
- with gr.Column():
- with gr.Row():
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
- with gr.Column():
- with gr.Accordion(
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
- open=False,
- ):
- with gr.Row():
- finished_eval_table = gr.components.Dataframe(
- value=finished_eval_queue_df,
- headers=EVAL_COLS,
- datatype=EVAL_TYPES,
- row_count=5,
- )
- with gr.Accordion(
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
- open=False,
- ):
- with gr.Row():
- running_eval_table = gr.components.Dataframe(
- value=running_eval_queue_df,
- headers=EVAL_COLS,
- datatype=EVAL_TYPES,
- row_count=5,
- )
-
- with gr.Accordion(
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
- open=False,
- ):
- with gr.Row():
- pending_eval_table = gr.components.Dataframe(
- value=pending_eval_queue_df,
- headers=EVAL_COLS,
- datatype=EVAL_TYPES,
- row_count=5,
- )
- with gr.Row():
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
- with gr.Row():
- with gr.Column():
- model_name_textbox = gr.Textbox(label="Model name")
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
- model_type = gr.Dropdown(
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
- label="Model type",
- multiselect=False,
- value=None,
- interactive=True,
- )
-
- with gr.Column():
- precision = gr.Dropdown(
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
- label="Precision",
- multiselect=False,
- value="float16",
- interactive=True,
- )
- weight_type = gr.Dropdown(
- choices=[i.value.name for i in WeightType],
- label="Weights type",
- multiselect=False,
- value="Original",
- interactive=True,
- )
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
- submit_button = gr.Button("Submit Eval")
- submission_result = gr.Markdown()
- submit_button.click(
- add_new_eval,
- [
- model_name_textbox,
- base_model_name_textbox,
- revision_name_textbox,
- precision,
- weight_type,
- model_type,
- ],
- submission_result,
- )
+ # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+ # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+ # with gr.Column():
+ # with gr.Row():
+ # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+ # with gr.Column():
+ # with gr.Accordion(
+ # f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+ # open=False,
+ # ):
+ # with gr.Row():
+ # finished_eval_table = gr.components.Dataframe(
+ # value=finished_eval_queue_df,
+ # headers=EVAL_COLS,
+ # datatype=EVAL_TYPES,
+ # row_count=5,
+ # )
+ # with gr.Accordion(
+ # f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+ # open=False,
+ # ):
+ # with gr.Row():
+ # running_eval_table = gr.components.Dataframe(
+ # value=running_eval_queue_df,
+ # headers=EVAL_COLS,
+ # datatype=EVAL_TYPES,
+ # row_count=5,
+ # )
+
+ # with gr.Accordion(
+ # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+ # open=False,
+ # ):
+ # with gr.Row():
+ # pending_eval_table = gr.components.Dataframe(
+ # value=pending_eval_queue_df,
+ # headers=EVAL_COLS,
+ # datatype=EVAL_TYPES,
+ # row_count=5,
+ # )
+ # with gr.Row():
+ # gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+ # with gr.Row():
+ # with gr.Column():
+ # model_name_textbox = gr.Textbox(label="Model name")
+ # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+ # model_type = gr.Dropdown(
+ # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+ # label="Model type",
+ # multiselect=False,
+ # value=None,
+ # interactive=True,
+ # )
+
+ # with gr.Column():
+ # precision = gr.Dropdown(
+ # choices=[i.value.name for i in Precision if i != Precision.Unknown],
+ # label="Precision",
+ # multiselect=False,
+ # value="float16",
+ # interactive=True,
+ # )
+ # weight_type = gr.Dropdown(
+ # choices=[i.value.name for i in WeightType],
+ # label="Weights type",
+ # multiselect=False,
+ # value="Original",
+ # interactive=True,
+ # )
+ # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+ # submit_button = gr.Button("Submit Eval")
+ # submission_result = gr.Markdown()
+ # submit_button.click(
+ # add_new_eval,
+ # [
+ # model_name_textbox,
+ # base_model_name_textbox,
+ # revision_name_textbox,
+ # precision,
+ # weight_type,
+ # model_type,
+ # ],
+ # submission_result,
+ # )

  with gr.Row():
  with gr.Accordion("📙 Citation", open=False):
@@ -342,4 +305,5 @@ with demo:
  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=1800)
  scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
+ demo.queue(default_concurrency_limit=40).launch()
+ # demo.launch(server_name="127.0.0.1", server_port=7855, debug=True)
eval-queue/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-queue/gpt-4v/results.json ADDED
@@ -0,0 +1 @@
+ {"model": "gpt-4v"}
eval-queue/internvl/results.json ADDED
@@ -0,0 +1 @@
+ {"model": "internvl", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 0.1, "license": "custom"}
eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-results/gpt-4v/results.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "config": {
+ "model_dtype": "torch.bfloat16",
+ "model_name": "gpt-4v",
+ "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
+ },
+ "results": {
+ "asd_privacy": {
+ "asd": 0.2500
+ },
+ "asd_bias": {
+ "asd": 0.1944
+ },
+ "asd_toxicity": {
+ "asd": 0.3247
+ },
+ "asd_truthfulness": {
+ "asd": 0.2115
+ },
+ "asd_legality": {
+ "asd": 0.2542
+ }
+ }
+ }
eval-results/internvl/results.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "config": {
+ "model_dtype": "torch.bfloat16",
+ "model_name": "internvl",
+ "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
+ },
+ "results": {
+ "asd_privacy": {
+ "asd": 0.3657
+ },
+ "asd_bias": {
+ "asd": 0.3129
+ },
+ "asd_toxicity": {
+ "asd": 0.3285
+ },
+ "asd_truthfulness": {
+ "asd": 0.2050
+ },
+ "asd_legality": {
+ "asd": 0.3278
+ }
+ }
+ }
src/about.py CHANGED
@@ -12,8 +12,11 @@ class Task:
  # ---------------------------------------------------
  class Tasks(Enum):
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
- task0 = Task("anli_r1", "acc", "ANLI")
- task1 = Task("logiqa", "acc_norm", "LogiQA")
+ task0 = Task("asd_privacy", "asd", "Privacy")
+ task1 = Task("asd_bias", "asd", "Bias")
+ task2 = Task("asd_toxicity", "asd", "Toxicity")
+ task3 = Task("asd_truthfulness", "asd", "Truthfulness")
+ task4 = Task("asd_legality", "asd", "Legality")

  NUM_FEWSHOT = 0 # Change with your few shot
  # ---------------------------------------------------
@@ -21,11 +24,11 @@ NUM_FEWSHOT = 0 # Change with your few shot


  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+ TITLE = """<h1 align="center" id="space-title">MLLMGuard Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- Intro text
+ MLLMGuard is a multi-dimensional safety evaluation suite for MLLMs, including a bilingual image-text evaluation dataset, inference utilities, and a set of lightweight evaluators.
  """

  # Which evaluations are you running? how can people reproduce what you have?
@@ -69,4 +72,12 @@ If everything is done, check you can launch the EleutherAIHarness on your model

  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
  CITATION_BUTTON_TEXT = r"""
+ @misc{gu2024mllmguard,
+ title={MLLMGuard: A Multi-dimensional Safety Evaluation Suite for Multimodal Large Language Models},
+ author={Tianle Gu and Zeyang Zhou and Kexin Huang and Dandan Liang and Yixu Wang and Haiquan Zhao and Yuanqi Yao and Xingge Qiao and Keqing Wang and Yujiu Yang and Yan Teng and Yu Qiao and Yingchun Wang},
+ year={2024},
+ eprint={2406.07594},
+ archivePrefix={arXiv},
+ primaryClass={cs.CR}
+ }
  """
src/display/utils.py CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
  ## Leaderboard columns
  auto_eval_column_dict = []
  # Init
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+ # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=False)])
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
  #Scores
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("ASD ⬇️", "number", True)])
  for task in Tasks:
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
  # Model information
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+ # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+ # auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

  # We use make dataclass to dynamically fill the scores from Tasks
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -47,11 +47,11 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
  @dataclass(frozen=True)
  class EvalQueueColumn: # Queue column
  model = ColumnContent("model", "markdown", True)
- revision = ColumnContent("revision", "str", True)
- private = ColumnContent("private", "bool", True)
- precision = ColumnContent("precision", "str", True)
- weight_type = ColumnContent("weight_type", "str", "Original")
- status = ColumnContent("status", "str", True)
+ # revision = ColumnContent("revision", "str", True)
+ # private = ColumnContent("private", "bool", True)
+ # precision = ColumnContent("precision", "str", True)
+ # weight_type = ColumnContent("weight_type", "str", "Original")
+ # status = ColumnContent("status", "str", True)

  ## All the model information that we might need
  @dataclass
src/leaderboard/read_evals.py CHANGED
@@ -112,18 +112,18 @@ class EvalResult:
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
  data_dict = {
  "eval_name": self.eval_name, # not a column, just a save name,
- AutoEvalColumn.precision.name: self.precision.value.name,
- AutoEvalColumn.model_type.name: self.model_type.value.name,
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
- AutoEvalColumn.architecture.name: self.architecture,
+ # AutoEvalColumn.precision.name: self.precision.value.name,
+ # AutoEvalColumn.model_type.name: self.model_type.value.name,
+ # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+ # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+ # AutoEvalColumn.architecture.name: self.architecture,
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
- AutoEvalColumn.revision.name: self.revision,
+ # AutoEvalColumn.revision.name: self.revision,
  AutoEvalColumn.average.name: average,
- AutoEvalColumn.license.name: self.license,
- AutoEvalColumn.likes.name: self.likes,
- AutoEvalColumn.params.name: self.num_params,
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+ # AutoEvalColumn.license.name: self.license,
+ # AutoEvalColumn.likes.name: self.likes,
+ # AutoEvalColumn.params.name: self.num_params,
+ # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
  }

  for task in Tasks:
@@ -164,10 +164,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
  continue

  # Sort the files by date
- try:
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
- except dateutil.parser._parser.ParserError:
- files = [files[-1]]
+ # try:
+ # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+ # except dateutil.parser._parser.ParserError:
+ # files = [files[-1]]

  for file in files:
  model_result_filepaths.append(os.path.join(root, file))
src/populate.py CHANGED
@@ -14,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
  all_data_json = [v.to_dict() for v in raw_data]

  df = pd.DataFrame.from_records(all_data_json)
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
  df = df[cols].round(decimals=2)

  # filter out if any of the benchmarks have not been produced
@@ -34,7 +34,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
  data = json.load(fp)

  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")

  all_evals.append(data)
  elif ".md" not in entry:
@@ -46,13 +46,13 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
  data = json.load(fp)

  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
  all_evals.append(data)

- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
+ # pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+ # running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+ finished_list = [e for e in all_evals]
+ # df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+ # df_running = pd.DataFrame.from_records(running_list, columns=cols)
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
- return df_finished[cols], df_running[cols], df_pending[cols]
+ return df_finished[cols]