Lakoc committed on
Commit b66f230
1 Parent(s): 49d6897
Files changed (6)
  1. app.py +84 -345
  2. compare_significance.py +112 -75
  3. content.py +15 -43
  4. model_compare.py +17 -45
  5. server.py +144 -0
  6. tasks_metadata.json +204 -0
app.py CHANGED
@@ -1,390 +1,129 @@
1
- import glob
2
  import os
3
- import logging
4
 
5
- import pandas as pd
6
  import gradio as gr
 
7
  from gradio.themes.utils.sizes import text_md
8
 
9
- from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN,
10
- )
11
-
12
- import json
13
- from datetime import datetime
14
- from pathlib import Path
15
- from uuid import uuid4
16
- import time
17
- import gradio as gr
18
-
19
- from huggingface_hub import HfApi, snapshot_download
20
-
21
- from compare_significance import check_significance, SUPPORTED_METRICS
22
- from model_compare import ModelCompare
23
-
24
- JSON_DATASET_DIR = Path("../json_dataset")
25
- JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
26
-
27
- JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
28
-
29
- api = HfApi()
30
-
31
- ORG= "CZLC"
32
- REPO = f"{ORG}/LLM_benchmark_data"
33
-
34
- def greet(name: str) -> str:
35
- return "Hello " + name + "!"
36
-
37
-
38
- DATASET_VERSIONS = ['dev-set-1', 'dev-set-2']
39
 
40
- HF_TOKEN = os.environ.get("HF_TOKEN")
41
 
42
- class LeaderboardServer:
43
- def __init__(self, server_address):
44
- self.server_address = server_address
45
- self.repo_type = "dataset"
46
- self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
47
- self.submisssion_id_to_file = {} # Map submission ids to file paths
48
-
49
- def on_submit(self):
50
- self.local_leaderboard = snapshot_download(self.server_address,repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
51
 
52
- def get_leaderboard(self):
53
- results = []
54
-
55
- new_results = []
56
- submission_ids = set()
57
-
58
- # pre-computed ranks
59
- with open(os.path.join(self.local_leaderboard, "metadata", "ranks.json")) as ranks_file:
60
- ranks = json.load(ranks_file)
61
- model_compare = ModelCompare()
62
- ranks = model_compare.get_tasks_ranks(ranks)
63
-
64
- # Models data
65
- for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
66
- data = json.load(open(submission))
67
- submission_id = data["metadata"]["model_description"]
68
-
69
- if submission_id in submission_ids:
70
- continue
71
- submission_ids.add(submission_id)
72
-
73
- self.submisssion_id_to_file[submission_id] = submission
74
-
75
-
76
- local_results = {task: list(task_ranks).index(submission_id)+1 for task, task_ranks in ranks.items()}
77
- local_results["submission_id"] = submission_id
78
- results.append(local_results)
79
- dataframe = pd.DataFrame.from_records(results)
80
- # Reorder to have the id (model description) first
81
- df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
82
- dataframe = dataframe[df_order]
83
- return dataframe
84
-
85
- def compute_ranks(self):
86
- ''' Compute rankings on every submit '''
87
-
88
- self.get_leaderboard()
89
 
90
- ids = list(self.submisssion_id_to_file.keys())
91
- rankings = {id: {} for id in ids}
92
 
93
- for a_idx in range(len(ids)):
94
- for b_idx in range(a_idx+1, len(ids)):
95
- modelA_id = ids[a_idx]
96
- modelB_id = ids[b_idx]
97
- res = self.compare_models(modelA_id, modelB_id)
98
- rankings[modelA_id][modelB_id] = {
99
- task: data["significant"] for task,data in res.items()
100
- }
101
- rankings[modelB_id][modelA_id] = {
102
- task: not data["significant"] for task,data in res.items()
103
- }
104
-
105
- return rankings
106
-
107
-
108
- def compare_models(self, modelA, modelB):
109
- modelA_path = self.submisssion_id_to_file.get(modelA)
110
- modelB_path = self.submisssion_id_to_file.get(modelB)
111
- return check_significance(modelA_path, modelB_path)
112
-
113
-
114
- def get_rankings(self):
115
- # TODO retrieve saved rankings for models on tasks
116
- pass
117
 
118
- def save_json(self,file, submission_name) -> None:
119
- filename = os.path.basename(file)
120
- api.upload_file(
121
- path_or_fileobj=file,
122
- path_in_repo=f"data/{submission_name}_{filename}",
123
- repo_id=self.server_address,
124
- repo_type=self.repo_type,
125
- token=HF_TOKEN,
126
- )
128
 
129
- leaderboard_server = LeaderboardServer(REPO)
130
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
 
 
 
131
 
132
 
133
- LEADERBOARD_TYPES = ['LLM',]
134
- MAX_SUBMISSIONS_PER_24H = 2
135
- # DATASET_VERSIONS = ['dev-set-1', 'dev-set-2']
136
- # CHALLENGE_NAME = 'NOTSOFAR1'
 
 
 
 
137
 
138
 
 
 
 
 
 
139
 
140
 
141
- # if __name__ == '__main__':
142
  with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
143
  app_state = gr.State({})
144
- # with gr.Row():
145
- # greet_name = gr.Textbox(label="Name")
146
- # greet_output = gr.Textbox(label="Greetings")
147
- # greet_btn = gr.Button("Greet")
148
- # greet_btn.click(fn=greet, inputs=greet_name, outputs=greet_output).success(
149
- # fn=save_json,
150
- # inputs=[greet_name, greet_output],
151
- # outputs=None,
152
- # )
153
 
154
  with gr.Row():
155
  with gr.Row():
156
  gr.Markdown(HEADER_MARKDOWN)
157
 
158
  with gr.Row():
159
-
160
- # Leaderboards Tab #
161
- ####################
162
- def populate_leaderboard(leaderboard_type, dataset_version):
163
- gr.Info('Loading leaderboard...')
164
- time.sleep(1)
165
- leaderboard_df = leaderboard_server.get_leaderboard()
166
- # leaderboard_df = lb_server.get_leaderboard(
167
- # submission_type=leaderboard_type, dataset_version=dataset_version)
168
- # if leaderboard_df.empty:
169
- return leaderboard_df
170
- # return leaderboard_df
171
-
172
-
173
- def create_leaderboard_tab(tab_name: str, idx: int, dataset_version_dropdown: gr.Dropdown):
174
- # dataset_version = dataset_version_dropdown.value
175
- print(f'Creating tab for {tab_name}, idx={idx}, dataset_version={dataset_version_dropdown}')
176
- with gr.Tab(id=tab_name, label=tab_name) as leaderboard_tab:
177
- leaderboard_table = gr.DataFrame(populate_leaderboard(tab_name, None)) if idx == 0 \
178
- else gr.DataFrame(pd.DataFrame(columns=['No submissions yet']))
179
- leaderboard_tab.select(fn=populate_leaderboard,
180
- inputs=[gr.Text(tab_name, visible=False)],
181
- outputs=[leaderboard_table])
182
- return leaderboard_table
183
-
184
- def on_dropdown_change():
185
- first_tab_name = LEADERBOARD_TYPES[0]
186
- leaderboard_server.on_submit()
187
-
188
- return gr.Tabs(selected=first_tab_name), populate_leaderboard(first_tab_name, None)
189
-
190
-
191
  with gr.Tab('Leaderboard') as leaderboards_tab:
192
- # with gr.Row():
193
- # gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
194
- # with gr.Row():
195
- # with gr.Column():
196
- # dataset_version_drop = gr.Dropdown(choices=DATASET_VERSIONS, multiselect=False,
197
- # value=DATASET_VERSIONS[-1], label="Dataset",
198
- # interactive=True)
199
- # with gr.Column():
200
- # gr.Markdown('') # Empty column for spacing
201
- # with gr.Column():
202
- # gr.Markdown('') # Empty column for spacing
203
- # with gr.Column():
204
- # gr.Markdown('') # Empty column for spacing
205
- # with gr.Row():
206
- # with gr.Tabs() as leaderboards_tabs:
207
- # leaderboard_tables_list = []
208
- # for leaderboard_idx, leaderboard_type in enumerate(LEADERBOARD_TYPES):
209
- # l_tab = create_leaderboard_tab(leaderboard_type, leaderboard_idx, None)
210
- # leaderboard_tables_list.append(l_tab)
211
-
212
- # change the table based on the selected model
213
- def on_dropdown_change(model_detail):
214
- leaderboard = leaderboard_server.get_leaderboard()
215
- return leaderboard[leaderboard["submission_id"] == model_detail]
216
 
217
- results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None, visible=True)
218
- model_detail = gr.Dropdown(choices=list(leaderboard_server.get_leaderboard()["submission_id"]), label="Select model", interactive=True)
219
- model_detail_button = gr.Button("Show model detail", interactive=True)
220
- model_detail_button.click(
221
- fn=on_dropdown_change,
222
- inputs=[model_detail],
223
- outputs=[results_table]
224
- )
225
-
226
- # results_table.select(fn=on_dropdown_change, inputs=[model_detail], outputs=[results_table])
227
-
228
- # dataset_version_drop.select(fn=on_dropdown_change, inputs=[dataset_version_drop],
229
- # outputs=[leaderboards_tabs, leaderboard_tables_list[0]])
230
-
231
-
232
-
233
- ##################
234
- # Submission Tab #
235
- ##################
236
  with gr.Tab('Submission'):
237
  with gr.Column():
238
- def on_submit_pressed():
239
- return gr.update(value='Processing submission...', interactive=False)
240
-
241
- def validate_submission_inputs(team_name, submission_zip, submission_type, token):
242
- if not team_name or not submission_zip or not submission_type:
243
- raise ValueError('Please fill in all fields')
244
- if not os.path.exists(submission_zip):
245
- raise ValueError('File does not exist')
246
- # if not submission_zip.endswith('.zip'):
247
- # raise ValueError('File must be a zip')
248
- # if not token:
249
- # raise ValueError('Please insert a valid Hugging Face token')
250
-
251
- def process_submission(team_name, submission, submission_type, description,
252
- app_state, request: gr.Request):
253
- logging.info(f'{team_name}: new submission for track: {submission_type}')
254
- try:
255
- token = app_state.get('hf_token')
256
- validate_submission_inputs(team_name, submission, submission_type, token)
257
- except ValueError as err:
258
- gr.Warning(str(err))
259
- return
260
-
261
-
262
- # metadata = {'challenge_name': CHALLENGE_NAME,
263
- # "dataset_version": DATASET_VERSIONS[-1],
264
- # 'team_name': team_name,
265
- # 'submission_type': submission_type,
266
- # 'description': description,
267
- # 'token': token,
268
- # 'file_name': os.path.basename(submission_zip),
269
- # 'file_size_mb': os.path.getsize(submission_zip) / 1024 / 1024,
270
- # 'ip': request.client.host}
271
- leaderboard_server.save_json(submission,team_name)
272
-
273
- try:
274
- gr.Info('Processing submission...')
275
- # response = lb_server.add_submission(token=token, file_path=submission_zip, metadata=metadata)
276
- # if 'error' in response:
277
- # gr.Warning(f'Failed to process submission - {response["error"]}')
278
- # else:
279
- gr.Info('Done processing submission')
280
- except Exception as e:
281
- gr.Warning(f'Submission failed to upload - {e}')
282
-
283
- def on_submit_done():
284
- on_dropdown_change()
285
- leaderboard_server.on_submit()
286
- # leaderboard_tab.children[0] = gr.DataFrame(populate_leaderboard(None, None))
287
- # leaderboard_tab.render()
288
- return gr.update(value='Submit', interactive=True)
289
-
290
- def show_leaderboard():
291
- gr.Info("Loding leaderboard...")
292
- return leaderboard_server.get_leaderboard()
293
 
294
- gr.Markdown(
295
- """
296
- # Model submission
297
- Model can be compared with other models and submitted\n
298
- Click **Compare results** to compare your model with other models in the leaderboard\n
299
- Click **Submit results** to submit your model to the leaderboard
300
- (Comparison by itself is not a submission)
301
- """
302
- )
303
-
304
- submission_team_name_tb = gr.Textbox(label='Team Name')
305
- # submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
306
  with gr.Row():
307
  description_tb = gr.Textbox(label='Description', type='text')
308
  link_to_model_tb = gr.Textbox(label='Link to model', type='text')
309
 
310
- with gr.Row():
311
- hf_token_tb = gr.Textbox(label='Token', type='password')
312
- submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
313
-
314
  submission_file_path = gr.File(label='Upload your results', type='filepath')
315
- compare_results_button = gr.DataFrame(show_leaderboard(), interactive=False, label=None, visible=True)
316
-
317
- # Button that triggers shows the current leaderboard
318
- show_results_button = gr.Button("Compare results", interactive=True)
319
- show_results_button.click(
320
- fn=show_leaderboard,
321
- outputs=[compare_results_button]
322
- )
323
-
324
- submission_btn = gr.Button(value='Submit results', interactive=True)
325
- submission_btn.click(
326
- fn=on_submit_pressed,
327
- outputs=[submission_btn]
328
- ).then(
329
- fn=process_submission,
330
- inputs=[submission_team_name_tb, submission_file_path, description_tb, app_state]
331
- ).then(
332
- fn=on_submit_done,
333
- outputs=[submission_btn]
334
- )
335
-
336
- # .then(
337
- # fn=on_dropdown_change,
338
- # outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
339
- # )
340
-
341
 
342
- # # My Submissions Tab #
343
- # ######################
344
- # with gr.Tab('My Submissions') as my_submissions_tab:
345
- # def on_my_submissions_tab_select(app_state):
346
- # hf_token = app_state.get('hf_token')
347
- # if not hf_token:
348
- # return pd.DataFrame(columns=['Please insert your Hugging Face token'])
349
- # # submissions = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
350
- # # if submissions.empty:
351
- # # submissions = pd.DataFrame(columns=['No submissions yet'])
352
- # # return submissions
353
- #
354
- # gr.Markdown(MY_SUBMISSIONS_TAB_TITLE_MARKDOWN)
355
- # my_submissions_table = gr.DataFrame()
356
- #
357
- # my_submissions_tab.select(fn=on_my_submissions_tab_select, inputs=[app_state],
358
- # outputs=[my_submissions_table])
359
- # my_submissions_token_tb = gr.Textbox(label='Token', type='password')
360
-
361
- def on_token_insert(hf_token, app_state):
362
- gr.Info(f'Verifying token...')
363
 
364
- submission_count = None
365
- # if hf_token:
366
- # submission_count = lb_server.get_submission_count_last_24_hours(hf_token=hf_token)
 
 
 
367
 
368
- if submission_count is None:
369
- # Invalid token
370
- app_state['hf_token'] = None
371
- submissions_24h_str = ''
372
- team_submissions_df = pd.DataFrame(columns=['Invalid Token'])
373
- gr.Warning('Invalid token')
374
 
375
- # else:
376
- # app_state['hf_token'] = hf_token
377
- # submissions_24h_str = f'{submission_count}/{MAX_SUBMISSIONS_PER_24H}'
378
- # team_submissions_df = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
379
- # if team_submissions_df.empty:
380
- # team_submissions_df = pd.DataFrame(columns=['No submissions yet'])
381
- # gr.Info('Token verified!')
382
 
383
- return app_state, team_submissions_df, submissions_24h_str
 
 
 
 
 
 
 
 
 
384
 
385
- hf_token_tb.change(fn=on_token_insert, inputs=[hf_token_tb, app_state],
386
- outputs=[app_state, submissions_24h_txt])
387
- # my_submissions_token_tb.change(fn=on_token_insert, inputs=[my_submissions_token_tb, app_state],
388
- # outputs=[app_state, my_submissions_table, submissions_24h_txt])
 
 
 
 
 
 
389
 
390
  main.launch()
 
 
1
  import os
 
2
 
 
3
  import gradio as gr
4
+ import pandas as pd
5
  from gradio.themes.utils.sizes import text_md
6
 
7
+ from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN)
8
+ from server import LeaderboardServer
 
9
 
10
+ leaderboard_server = LeaderboardServer()
12
 
13
+ def on_submit_pressed():
14
+ return gr.update(value='Processing submission...', interactive=False)
 
15
 
 
 
16
 
17
+ def validate_submission_inputs(team_name, submission_id, link_to_model, submission_file):
18
+ if not team_name or not submission_id or not link_to_model or not submission_file:
19
+ raise ValueError('Please fill in all fields')
20
+ if not os.path.exists(submission_file):
21
+ raise ValueError('File does not exist')
 
23
 
24
+ def process_submission(team_name, submission_id, description, link_to_model, submission_file):
25
+ try:
26
+ validate_submission_inputs(team_name, submission_id, link_to_model, submission_file)
27
+ metadata = {
28
+ "team_name": team_name,
29
+ "submission_id": submission_id,
30
+ "description": description,
31
+ "link_to_model": link_to_model,
32
+ }
33
+ gr.Info('Submission valid, running local tournament...')
34
 
35
+ leaderboard_server.prepare_model_for_submission(submission_file, metadata)
36
+ except ValueError as err:
37
+ gr.Warning(str(err))
38
+ return gr.update(visible=False), gr.update(visible=True), gr.update(interactive=True,
39
+ visible=True), gr.update(
40
+ interactive=True, visible=True), gr.update(visible=True), gr.update(
41
+ value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]), visible=True)
42
 
43
 
44
+ def submit_results():
45
+ leaderboard_server.save_pre_submit()
46
+ leaderboard_server.update_leaderboard()
47
+ gr.Info('Submission successful!')
48
+ return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
49
+ visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
50
+ visible=False), gr.update(visible=False), gr.DataFrame(
51
+ value=leaderboard_server.get_leaderboard(), visible=True)
52
 
53
 
54
+ def erase_presubmit():
55
+ leaderboard_server.pre_submit = None
56
+ return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
57
+ visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
58
+ visible=False), gr.update(visible=False)
59
 
60
 
 
61
  with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
62
  app_state = gr.State({})
 
63
 
64
  with gr.Row():
65
  with gr.Row():
66
  gr.Markdown(HEADER_MARKDOWN)
67
 
68
  with gr.Row():
 
69
  with gr.Tab('Leaderboard') as leaderboards_tab:
70
+ gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
71
+ results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None,
72
+ visible=True)
 
74
  with gr.Tab('Submission'):
75
  with gr.Column():
76
+ gr.Markdown(SUBMISSION_TAB_TITLE_MARKDOWN)
77
+ with gr.Row():
78
+ submission_team_name_tb = gr.Textbox(label='Team Name')
79
+ submission_id_tb = gr.Textbox(label='Submission ID')
 
81
  with gr.Row():
82
  description_tb = gr.Textbox(label='Description', type='text')
83
  link_to_model_tb = gr.Textbox(label='Link to model', type='text')
84
 
 
 
 
 
85
  submission_file_path = gr.File(label='Upload your results', type='filepath')
86
+ pre_submission_btn = gr.Button(value='Pre-submit model', interactive=True)
 
87
 
88
+ submit_prompt = gr.Markdown(
89
+ """
90
+ Do you really want to submit a model? This action is irreversible.
91
+ """,
92
+ visible=False
93
+ )
 
94
 
95
+ pre_submit_info = gr.Markdown(
96
+ """
97
+ This is how the ranking will look after your submission:
98
+ """,
99
+ visible=False
100
+ )
101
 
102
+ pre_submit_table = gr.DataFrame(pd.DataFrame(), interactive=False, label=None, visible=False)
 
 
 
 
 
103
 
104
+ submission_btn_yes = gr.Button(value='Submit model', interactive=False, visible=False)
105
+ submission_btn_no = gr.Button(value='Reverse process', interactive=False, visible=False)
 
 
 
 
 
106
 
107
+ pre_submission_btn.click(
108
+ fn=on_submit_pressed,
109
+ outputs=[pre_submission_btn]
110
+ ).then(
111
+ fn=process_submission,
112
+ inputs=[submission_team_name_tb, submission_id_tb, description_tb, link_to_model_tb,
113
+ submission_file_path],
114
+ outputs=[pre_submission_btn, submit_prompt, submission_btn_yes, submission_btn_no, pre_submit_info,
115
+ pre_submit_table]
116
+ )
117
 
118
+ submission_btn_yes.click(
119
+ fn=submit_results,
120
+ outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
121
+ pre_submit_table, results_table]
122
+ )
123
+ submission_btn_no.click(
124
+ fn=erase_presubmit,
125
+ outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
126
+ pre_submit_table]
127
+ )
128
 
129
  main.launch()
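A note on the Gradio wiring in the new app.py: the pre-submit flow chains .click(...).then(...) so the button is disabled while the tournament runs, and each gr.update(...) in a return tuple maps positionally to one output component. A stripped-down sketch of that pattern, with shortened component names and the leaderboard logic omitted (this is an illustration, not the actual app code):

    import gradio as gr

    def disable_button():
        # Step 1: immediate feedback while the heavy work runs.
        return gr.update(value='Processing submission...', interactive=False)

    def run_pre_submit():
        # Step 2: each returned update maps positionally to one output component.
        return gr.update(value='Pre-submit model', interactive=True), gr.update(visible=True)

    with gr.Blocks() as demo:
        pre_submit = gr.Button('Pre-submit model')
        confirm = gr.Button('Submit model', visible=False)
        pre_submit.click(fn=disable_button, outputs=[pre_submit]).then(
            fn=run_pre_submit, outputs=[pre_submit, confirm])

    demo.launch()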
compare_significance.py CHANGED
@@ -3,20 +3,18 @@ import json
3
  from collections import defaultdict
4
  from typing import Sequence
5
 
6
- import numpy
7
  import numpy as np
8
- from scipy.stats import ttest_ind, ttest_rel
 
9
  from sklearn.metrics import roc_curve, auc
10
  from tqdm import tqdm
11
 
12
- # from leaderboard import SUPPORTED_METRICS
13
-
14
  SUPPORTED_METRICS = [
15
  "avg_mcauroc", # for classification tasks
16
- "em", # for QA tasks
17
  "acc", # for multichoice tasks
18
- "rouge", # for summarization tasks
19
- "ppl", # for language modeling tasks
20
  ]
21
 
22
 
@@ -44,43 +42,70 @@ def _get_CMs(i, probabilities, references, thresholds):
44
  return confusion_matrices
45
 
46
 
47
- def compute_significance_accuracy(predsA, referencesA, predsB, referencesB):
48
- # following https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L164C32-L165C24
49
- # Calculate the T-test on TWO RELATED samples of scores, a and b. for one sided test we multiply p-value by half
50
- scores_A = [1 if pred == ref else 0 for pred, ref in zip(predsA, referencesA)]
51
- scores_B = [1 if pred == ref else 0 for pred, ref in zip(predsB, referencesB)]
52
  t, p = ttest_rel(scores_A, scores_B)
53
  # correct for one-tailed test
54
  p_value = p / 2
55
- delta = np.mean(scores_A) - np.mean(scores_B)
56
  return p_value, delta
57
 
58
- def compute_significance_em(predsA, referencesA, predsB, referencesB):
59
- pass
60
- def compute_significance_rouge(predsA, referencesA, predsB, referencesB):
61
- # TODO: MDocekal
62
- # Use bootstrapping
63
- # https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L89
64
- pass
65
- def compute_significance_ppl(predsA, referencesA, predsB, referencesB):
66
- # TODO: MDocekal
67
- # Use bootstrapping
68
- # https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L89
69
- pass
 
70
  def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
71
  probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
72
  # compute MC-AUC for model A
73
- model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=1_000)
74
- model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=1_000)
 
75
 
76
  # one-tailed test
77
  p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
78
  / (len(model_A_scores) * len(model_B_scores)))
79
 
80
- delta = np.mean(model_A_scores) - np.mean(model_B_scores)
81
  return p_value, delta
82
 
84
  def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
85
  n_classes = list(range(len(probs[0])))
86
  fpr = dict()
@@ -93,23 +118,24 @@ def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
93
  y_score=[prob[i] for prob in probs])
94
 
95
  confusion_matrices = _get_CMs(i, probs, references, thresholds)
 
96
 
97
  λ = 1.0 # <- Flat prior
98
  # λ = 0.5 # <- Jeffrey's prior
99
 
100
  # sample variates for every threshold
101
- tpr_variates_for_each_fpr = []
102
- for k in range(len(thresholds[i])):
103
- tpr_variates_for_each_fpr.append(
104
- numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, Nsamples))
 
105
 
106
  # fprs x tpr_variates
107
- tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
108
 
109
  # now pick 1 variate for each fpr, and compute AUC
110
  auc_scores = []
111
- for tpr_variates in tqdm(tpr_variates_for_each_fpr.T,
112
- desc=f"Computing AUCs for class {i + 1}/{len(n_classes)}"):
113
  auc_score = auc(fpr[i], tpr_variates)
114
  # if numpy.isnan(auc_score):
115
  # auc_score = 0
@@ -141,18 +167,27 @@ def read_json(file_path):
141
  golds = unzipped_list[0]
142
  probs = unzipped_list[1]
143
  data[task] = (golds, probs), metric
144
- return data, fc["metadata"]
145
-
146
-
147
- def check_significance_task(fileA, fileB, task, significance_level=0.05):
148
-
149
- dataA, metadataA = read_json(fileA)
150
- dataB, metadataB = read_json(fileB)
151
-
152
- print("DEBUG",fileA, task, dataA[task])
153
-
154
- decisions = dict()
155
-
 
 
 
 
 
 
 
 
 
156
  metricA = dataA[task][1]
157
  metricB = dataB[task][1]
158
  assert metricA == metricB
@@ -160,32 +195,33 @@ def check_significance_task(fileA, fileB, task, significance_level=0.05):
160
 
161
  if metricA == "avg_mcauroc":
162
  p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
163
- probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
164
-
165
- elif metricA == "acc":
166
- p_value, delta = compute_significance_accuracy(predsA=dataA[task][0][1], referencesA=dataA[task][0][0],
167
- predsB=dataB[task][0][1], referencesB=dataB[task][0][0])
168
- elif metricA == "em":
169
- raise NotImplementedError("Exact match is not supported yet.")
170
- elif metricA == "rouge":
171
- raise NotImplementedError("Rouge is not supported yet.")
172
- elif metricA == "ppl":
173
- raise NotImplementedError("Perplexity is not supported yet.")
174
  else:
175
  raise ValueError(f"Unsupported metric {metricA}")
176
- decisions[task] = {
 
 
 
 
177
  "significant": not (p_value > significance_level),
178
  "p_value": p_value,
179
  "delta": delta,
180
  }
181
- return decisions
182
 
183
  def check_significance(fileA, fileB, significance_level=0.05):
184
- dataA, metadataA = read_json(fileA)
185
- dataB, metadataB = read_json(fileB)
186
-
187
  decisions = dict()
188
- for task in dataA.keys():
 
 
189
  metricA = dataA[task][1]
190
  metricB = dataB[task][1]
191
  assert metricA == metricB
@@ -195,37 +231,38 @@ def check_significance(fileA, fileB, significance_level=0.05):
195
  p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
196
  probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
197
 
198
- elif metricA == "acc":
199
- p_value, delta = compute_significance_accuracy(predsA=dataA[task][0][1], referencesA=dataA[task][0][0],
200
- predsB=dataB[task][0][1], referencesB=dataB[task][0][0])
201
- elif metricA == "em":
202
- raise NotImplementedError("Exact match is not supported yet.")
203
- elif metricA == "rouge":
204
- raise NotImplementedError("Rouge is not supported yet.")
205
- elif metricA == "ppl":
206
- raise NotImplementedError("Perplexity is not supported yet.")
207
  else:
208
  raise ValueError(f"Unsupported metric {metricA}")
 
 
209
  decisions[task] = {
210
  "significant": not (p_value > significance_level),
211
  "p_value": p_value,
212
  "delta": delta,
213
  }
 
214
  return decisions
215
 
216
 
217
  def main():
218
  parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
219
- parser.add_argument("--modelA", help="ModelA JSONL file from lm harness.")
220
- parser.add_argument("--modelB", help="ModelB JSONL file from lm harness.")
221
  parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
222
  args = parser.parse_args()
223
 
224
  result = check_significance(args.modelA, args.modelB, args.significance_level)
225
  print(json.dumps(result, indent=2))
226
 
 
227
  # harness already returns stderr estimate for sampling distribution
228
  # see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
229
 
230
  if __name__ == "__main__":
 
231
  main()
 
3
  from collections import defaultdict
4
  from typing import Sequence
5
 
 
6
  import numpy as np
7
+ from numba import njit, prange
8
+ from scipy.stats import ttest_rel
9
  from sklearn.metrics import roc_curve, auc
10
  from tqdm import tqdm
11
 
 
 
12
  SUPPORTED_METRICS = [
13
  "avg_mcauroc", # for classification tasks
14
+ "exact_match", # for QA tasks
15
  "acc", # for multichoice tasks
16
+ "rouge_raw_r2_mid_f", # for summarization tasks
17
+ "word_perplexity", # for language modeling tasks
18
  ]
19
 
20
 
 
42
  return confusion_matrices
43
 
44
 
45
+ def compute_significance_ttest(scores_A, scores_B):
46
+ delta = np.mean(scores_A) - np.mean(scores_B)
47
+ if delta <= 0:
48
+ return 1.0, delta
 
49
  t, p = ttest_rel(scores_A, scores_B)
50
  # correct for one-tailed test
51
  p_value = p / 2
 
52
  return p_value, delta
53
 
54
+
55
+ @njit(parallel=True)
56
+ def compute_significance_bootstrap(scores_A, scores_B):
57
+ n = len(scores_A)
58
+ R = 1_000
59
+ delta_orig = np.mean(scores_A) - np.mean(scores_B)
60
+
61
+ if delta_orig <= 0:
62
+ return 1.0, delta_orig
63
+ r = 0
64
+ for _ in prange(R):
65
+ samples = np.random.choice(n, n, replace=True)
66
+ temp_A = scores_A[samples]
67
+ temp_B = scores_B[samples]
68
+ delta = np.mean(temp_A) - np.mean(temp_B)
69
+ if delta > 2 * delta_orig:
70
+ r += 1
71
+
72
+ pval = r / R
73
+ return pval, delta_orig
74
+
75
+
76
  def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
77
  probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
78
  # compute MC-AUC for model A
79
+ model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
80
+ model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
81
+ delta = np.mean(model_A_scores) - np.mean(model_B_scores)
82
 
83
  # one-tailed test
84
  p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
85
  / (len(model_A_scores) * len(model_B_scores)))
86
 
 
87
  return p_value, delta
88
 
89
 
90
+ # Helper function to convert confusion matrices to numba-compatible arrays
91
+ def convert_confusion_matrices(confusion_matrices):
92
+ num_thresholds = len(confusion_matrices)
93
+ tp = np.empty(num_thresholds)
94
+ fn = np.empty(num_thresholds)
95
+ for k in range(num_thresholds):
96
+ tp[k] = confusion_matrices[k]["TP"]
97
+ fn[k] = confusion_matrices[k]["FN"]
98
+ return tp, fn
99
+
100
+
101
+ @njit(parallel=True)
102
+ def compute_tpr_variates(tp, fn, λ, Nsamples, num_thresholds):
103
+ tpr_variates_for_each_fpr = np.empty((num_thresholds, Nsamples))
104
+ for k in prange(num_thresholds):
105
+ tpr_variates_for_each_fpr[k, :] = np.random.beta(tp[k] + λ, fn[k] + λ, Nsamples)
106
+ return tpr_variates_for_each_fpr
107
+
108
+
109
  def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
110
  n_classes = list(range(len(probs[0])))
111
  fpr = dict()
 
118
  y_score=[prob[i] for prob in probs])
119
 
120
  confusion_matrices = _get_CMs(i, probs, references, thresholds)
121
+ tp, fn = convert_confusion_matrices(confusion_matrices)
122
 
123
  λ = 1.0 # <- Flat prior
124
  # λ = 0.5 # <- Jeffrey's prior
125
 
126
  # sample variates for every threshold
127
+ # tpr_variates_for_each_fpr = []
128
+ # for k in range(len(thresholds[i])):
129
+ # tpr_variates_for_each_fpr.append(
130
+ # numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, Nsamples))
131
+ tpr_variates_for_each_fpr = compute_tpr_variates(tp, fn, λ, Nsamples, len(thresholds[i]))
132
 
133
  # fprs x tpr_variates
134
+ # tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
135
 
136
  # now pick 1 variate for each fpr, and compute AUC
137
  auc_scores = []
138
+ for tpr_variates in tpr_variates_for_each_fpr.T:
 
139
  auc_score = auc(fpr[i], tpr_variates)
140
  # if numpy.isnan(auc_score):
141
  # auc_score = 0
 
167
  golds = unzipped_list[0]
168
  probs = unzipped_list[1]
169
  data[task] = (golds, probs), metric
170
+ else:
171
+ scores = [line[metric] for line in fc["predictions"][task]]
172
+ data[task] = scores, metric
173
+
174
+ # make sure all tasks are submitted
175
+ METADATA_FILE = "tasks_metadata.json"
176
+ with open(METADATA_FILE, "r") as f:
177
+ metadata = json.load(f)
178
+
179
+ all_tasks = list(metadata["tasks"].keys())
180
+ all_missing_tasks = []
181
+ for task in all_tasks:
182
+ if task not in data:
183
+ all_missing_tasks.append(task)
184
+ if len(all_missing_tasks) > 0:
185
+ EOLN = "\n"
186
+ raise ValueError(f"Missing tasks in {file_path}: {EOLN.join(all_missing_tasks)}")
187
+ return data
188
+
189
+
190
+ def process_task(task, dataA, dataB, significance_level):
191
  metricA = dataA[task][1]
192
  metricB = dataB[task][1]
193
  assert metricA == metricB
 
195
 
196
  if metricA == "avg_mcauroc":
197
  p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
198
+ probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
199
+ elif metricA in ["acc", "exact_match"]:
200
+ p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
201
+ elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
202
+ p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
203
+ scores_B=np.array(dataB[task][0]))
 
 
 
 
 
204
  else:
205
  raise ValueError(f"Unsupported metric {metricA}")
206
+
207
+ if delta <= 0:
208
+ p_value = 1.0
209
+
210
+ return task, {
211
  "significant": not (p_value > significance_level),
212
  "p_value": p_value,
213
  "delta": delta,
214
  }
215
+
216
 
217
  def check_significance(fileA, fileB, significance_level=0.05):
218
+ dataA = read_json(fileA)
219
+ dataB = read_json(fileB)
220
+
221
  decisions = dict()
222
+ _iter = tqdm(list(dataA.keys()))
223
+ for task in _iter:
224
+ _iter.set_description(f"Processing task: {task}")
225
  metricA = dataA[task][1]
226
  metricB = dataB[task][1]
227
  assert metricA == metricB
 
231
  p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
232
  probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
233
 
234
+ elif metricA in ["acc", "exact_match"]:
235
+ p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
236
+ elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
237
+ p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
238
+ scores_B=np.array(dataB[task][0]))
 
 
 
 
239
  else:
240
  raise ValueError(f"Unsupported metric {metricA}")
241
+ if delta <= 0:
242
+ p_value = 1.0
243
  decisions[task] = {
244
  "significant": not (p_value > significance_level),
245
  "p_value": p_value,
246
  "delta": delta,
247
  }
248
+
249
  return decisions
250
 
251
 
252
  def main():
253
  parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
254
+ parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
255
+ parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
256
  parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
257
  args = parser.parse_args()
258
 
259
  result = check_significance(args.modelA, args.modelB, args.significance_level)
260
  print(json.dumps(result, indent=2))
261
 
262
+
263
  # harness already returns stderr estimate for sampling distribution
264
  # see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
265
 
266
  if __name__ == "__main__":
267
+ check_significance("../csmpt.json", "../llama3_instruct.json", 0.05)
268
  main()
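For reference, check_significance can also be driven directly from Python rather than via the CLI; a minimal sketch, assuming two harness result files that cover all tasks in tasks_metadata.json (the file paths below are placeholders):

    from compare_significance import check_significance

    # Returns a dict mapping task name -> {"significant": bool, "p_value": float, "delta": float},
    # where delta is mean(model A) - mean(model B) and the test is one-tailed (A improves over B).
    decisions = check_significance("results/model_a.json", "results/model_b.json", significance_level=0.05)
    for task, result in decisions.items():
        print(task, result["significant"], round(result["p_value"], 4))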
content.py CHANGED
@@ -2,55 +2,27 @@
2
  This file contains the text content for the leaderboard client.
3
  """
4
 
5
-
6
- # HEADER_MARKDOWN = """
7
- # # CHiME-8 Leaderboard
8
- # In collaboration with the CHiME-8 Challenge, the NOTSOFAR team is proud to host the official leaderboard for the three tasks this year.\n
9
- # For details, visit:
10
- # 1. [DASR](https://www.chimechallenge.org/current/task1/index)
11
- # 2. [NOTSOFAR](https://www.chimechallenge.org/current/task2/index)
12
- # 3. [MMCSG](https://www.chimechallenge.org/current/task3/index)
13
- #
14
- #
15
- # ### DASR and NOTSOFAR - the scientific story
16
- # Both tasks focus on distant automatic speech recognition and speaker diarization, offering a fundamental comparison
17
- # among different system designs:
18
- # - Single-channel (SC), 1 device (NOTSOFAR-SC)
19
- # - Multi-channel (MC), known-geometry, 1 device (NOTSOFAR-MC)
20
- # - Multi-channel (MC), geometry-agnostic, multiple devices (DASR-Constrained-LM and DASR-Unconstrained-LM)
21
- #
22
- # Featured in both tasks, the NOTSOFAR recorded meeting dataset is leveraged as a common benchmark:
23
- # each geometry-agnostic MC system submitted to DASR tracks (constrained or not) will also be **automatically submitted**
24
- # to the known-geometry single-device NOTSOFAR-MC track. These entries will be marked with "DASR" to denote their origin.
25
- # """
26
- HEADER_MARKDOWN = """ """
27
 
28
  LEADERBOARD_TAB_TITLE_MARKDOWN = """
29
- ## Leaderboard
30
- """
31
-
 
32
 
33
  SUBMISSION_TAB_TITLE_MARKDOWN = """
34
  ## Submission
 
35
 
36
- To submit your results, please fill in the form below.
37
-
38
- - *Team Name:* The name of your team, as it will appear on the leaderboard'
39
- - *Results:* Results zip file to submit
40
- - *Submission track:* The track to submit results to
41
- - *Token:* Your Hugging Face token
42
  - *Description:* Short description of your submission (optional)
 
43
 
44
- **Hugging Face tokens:** To create a token, go to your profile settings > Access Tokens > New Token.
45
- Name the token and give it a write role, then copy the token and paste it in the field below.
46
-
47
- **Team creation:** Upon the first submission, your team name is associated with your Hugging Face user account.
48
- Any token generated by your account can be used. All team members should use this specific user's token for
49
- future submissions.
50
-
51
- **Submission limit:** 5 submissions per team every 24 hours. Each participant should only belong to one team.
52
- Changing team names is allowed, but it is not intended to bypass the daily submission limit.
53
- """
54
-
55
- SUBMISSION_TAB_TITLE_MARKDOWN = """
56
  """
 
2
  This file contains the text content for the leaderboard client.
3
  """
4
 
5
+ HEADER_MARKDOWN = """
6
+ # BenCzechMark
7
+ Welcome to the leaderboard! Here you can submit your model and compare it with the existing models.
8
+ """
 
9
 
10
  LEADERBOARD_TAB_TITLE_MARKDOWN = """
11
+ ## Leaderboard
12
+ The leaderboard below shows the current ranking of the models...
13
+
14
+ """
15
 
16
  SUBMISSION_TAB_TITLE_MARKDOWN = """
17
  ## Submission
18
+ To submit your model, please fill in the form below.
19
 
20
+ - *Team name:* The name of your team, as it will appear on the leaderboard
21
+ - *Submission ID:* A name for this particular submission; together with the team name it identifies your entry on the leaderboard
 
 
 
 
22
  - *Description:* Short description of your submission (optional)
23
+ - *Link to model:* Link to the model's repository or documentation
24
 
25
+ After filling in the form, click the **Pre-submit model** button.
26
+ This will run a comparison of your model with the existing leaderboard models.
27
+ After the tournament is complete, you will be able to submit your model to the leaderboard.
 
28
  """
model_compare.py CHANGED
@@ -1,62 +1,34 @@
1
-
2
  from functools import cmp_to_key
3
- from compare_significance import check_significance
4
-
5
- class ModelCompare():
6
-
7
- TASKS = ["propaganda_demonizace",
8
- "propaganda_vina",
9
- "propaganda_relativizace",
10
- "propaganda_argumentace",
11
- "propaganda_lokace",
12
- "propaganda_nazor",
13
- "propaganda_emoce",
14
- "propaganda_fabulace",
15
- "propaganda_nalepkovani",
16
- "propaganda_zamereni",
17
- "propaganda_zanr",
18
- "propaganda_rusko",
19
- "propaganda_strach",
20
- "benczechmark_sentiment"]
21
-
22
- def __init__(self, ranks:dict=None):
23
  self.ranks = ranks
 
24
 
25
- def compare_models(self, modelA_id, modelB_id):
26
  if not self.ranks:
27
  raise Exception("Missing model rankings")
28
-
29
- res = self.ranks[modelA_id][modelB_id][self.current_task]
30
- if res == True:
31
  return 1
32
- elif res == False:
33
  return -1
34
  else:
35
  return -1
36
 
 
 
37
 
38
- def get_tasks_ranks(self, ranks:dict) -> dict:
39
- '''Order models based on the significance improvement'''
40
-
41
  self.ranks = ranks
42
-
43
  tasks_ranks = {}
44
-
45
  models = ranks.keys()
46
- for task in self.TASKS:
47
  self.current_task = task
48
  tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
49
  return tasks_ranks
50
-
51
-
52
- # models = {
53
- # model1 : {
54
- # task1 : order_idx
55
- # task2 : order_idx
56
- # task3 : order_idx
57
- # }
58
- # }
59
-
60
-
61
-
62
-
 
 
1
  from functools import cmp_to_key
2
+
3
+
4
+ class ModelCompare:
5
+
6
+ def __init__(self, tasks, ranks: dict = None):
7
+ self.current_task = None
 
8
  self.ranks = ranks
9
+ self.tasks = tasks
10
 
11
+ def compare_models(self, model_a, model_b):
12
  if not self.ranks:
13
  raise Exception("Missing model rankings")
14
+
15
+ res = self.ranks[model_a][model_b][self.current_task]
16
+ if res:
17
  return 1
18
+ elif not res:
19
  return -1
20
  else:
21
  return -1
22
 
23
+ def get_tasks_ranks(self, ranks: dict) -> dict:
24
+ """Order models based on the significance improvement"""
25
 
 
 
 
26
  self.ranks = ranks
27
+
28
  tasks_ranks = {}
29
+
30
  models = ranks.keys()
31
+ for task in self.tasks:
32
  self.current_task = task
33
  tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
34
  return tasks_ranks
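The comparator consumes the nested tournament dict produced by server.py (model -> opponent -> task -> bool) and orders the models per task with cmp_to_key. A tiny illustration with two hypothetical submissions and one hypothetical task name:

    from model_compare import ModelCompare

    # Hypothetical tournament results: True means the row model significantly beats
    # the column model on that task.
    ranks = {
        "team1_run1": {"team1_run1": {"taskA": False}, "team2_run1": {"taskA": True}},
        "team2_run1": {"team2_run1": {"taskA": False}, "team1_run1": {"taskA": False}},
    }

    comparer = ModelCompare(tasks=["taskA"])
    print(comparer.get_tasks_ranks(ranks))  # {'taskA': [...]}, order given by the pairwise results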
 
 
server.py ADDED
@@ -0,0 +1,144 @@
 
 
1
+ import copy
2
+ import glob
3
+ import json
4
+ import os
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from huggingface_hub import HfApi, snapshot_download
9
+
10
+ from compare_significance import check_significance
11
+ from model_compare import ModelCompare
12
+
13
+ api = HfApi()
14
+
15
+ ORG = "CZLC"
16
+ REPO = f"{ORG}/LLM_benchmark_data"
17
+ HF_TOKEN = os.environ.get("HF_TOKEN")
18
+ TASKS_METADATA_PATH = "./tasks_metadata.json"
19
+
20
+
21
+ class LeaderboardServer:
22
+ def __init__(self):
23
+ self.server_address = REPO
24
+ self.repo_type = "dataset"
25
+ self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
26
+ local_dir="./")
27
+ self.submisssion_id_to_file = {} # Map submission ids to file paths
28
+ self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))['tasks']
29
+ self.submission_ids = set()
30
+ self.comparer = ModelCompare(self.tasks_metadata.keys())
31
+ self.fetch_existing_models()
32
+ self.tournament_results = self.load_tournament_results()
33
+ self.pre_submit = None
34
+
35
+ def update_leaderboard(self):
36
+ self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
37
+ local_dir="./")
38
+ self.fetch_existing_models()
39
+ self.tournament_results = self.load_tournament_results()
40
+
41
+ def load_tournament_results(self):
42
+ metadata_rank_paths = os.path.join(self.local_leaderboard, "tournament.json")
43
+ if not os.path.exists(metadata_rank_paths):
44
+ return {}
45
+ with open(metadata_rank_paths) as ranks_file:
46
+ results = json.load(ranks_file)
47
+ return results
48
+
49
+ def fetch_existing_models(self):
50
+ # Models data
51
+ for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
52
+ data = json.load(open(submission))
53
+ metadata = data.get('metadata')
54
+ if metadata is None:
55
+ continue
56
+ submission_id = metadata["team_name"] + "_" + metadata["submission_id"]
57
+ self.submission_ids.add(submission_id)
58
+
59
+ self.submisssion_id_to_file[submission_id] = submission
60
+
61
+ def get_leaderboard(self, tournament_results=None):
62
+ rank_based_on = tournament_results if tournament_results else self.tournament_results
63
+
64
+ if len(rank_based_on) == 0:
65
+ return pd.DataFrame(columns=['No submissions yet'])
66
+ else:
67
+ ranks = self.comparer.get_tasks_ranks(rank_based_on)
68
+ results = []
69
+ for submission in rank_based_on.keys():
70
+ path = self.submisssion_id_to_file.get(submission)
71
+ if path is None:
72
+ if self.pre_submit and submission == self.pre_submit[1]:
73
+ data = json.load(open(self.pre_submit[2]))
74
+ else:
75
+ raise gr.Error(f"Internal error: Submission [{submission}] not found")
76
+ elif path:
77
+ data = json.load(open(path))
78
+ else:
79
+ raise gr.Error(f"Submission [{submission}] not found")
80
+ submission_id = data["metadata"]["team_name"] + "_" + data["metadata"]["submission_id"]
81
+
82
+ local_results = {task: list(task_ranks).index(submission_id) + 1 for task, task_ranks in ranks.items()}
83
+ local_results["submission_id"] = submission_id
84
+ if self.pre_submit and submission == self.pre_submit[1]:
85
+ results.insert(0, local_results)
86
+ else:
87
+ results.append(local_results)
88
+ dataframe = pd.DataFrame.from_records(results)
89
+ df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
90
+ dataframe = dataframe[df_order]
91
+ dataframe = dataframe.rename(columns={key: value["name"] for key, value in self.tasks_metadata.items()})
92
+ return dataframe
93
+
94
+ def start_tournament(self, new_model_id, new_model_file):
95
+ new_tournament = copy.deepcopy(self.tournament_results)
96
+ new_tournament[new_model_id] = {}
97
+ new_tournament[new_model_id][new_model_id] = {task: False for task in self.tasks_metadata.keys()}
98
+
99
+ for model in self.submission_ids:
100
+ res = check_significance(new_model_file, self.submisssion_id_to_file[model])
101
+ res_inverse = check_significance(self.submisssion_id_to_file[model], new_model_file)
102
+ new_tournament[new_model_id][model] = {
103
+ task: data["significant"] for task, data in res.items()
104
+ }
105
+ new_tournament[model][new_model_id] = {
106
+ task: data["significant"] for task, data in res_inverse.items()
107
+ }
108
+ return new_tournament
109
+
110
+ def prepare_model_for_submission(self, file, metadata) -> None:
111
+ with open(file, "r") as f:
112
+ data = json.load(f)
113
+ data["metadata"] = metadata
114
+ with open(file, "w") as f:
115
+ json.dump(data, f)
116
+
117
+ model_id = metadata["team_name"] + "_" + metadata["submission_id"]
118
+ tournament_results = self.start_tournament(model_id, file)
119
+ self.pre_submit = tournament_results, model_id, file
120
+
121
+ def save_pre_submit(self):
122
+ if self.pre_submit:
123
+ tournament_results, model_id, file = self.pre_submit
124
+ filename = os.path.basename(file)
125
+ api.upload_file(
126
+ path_or_fileobj=file,
127
+ path_in_repo=f"data/{model_id}_{filename}",
128
+ repo_id=self.server_address,
129
+ repo_type=self.repo_type,
130
+ token=HF_TOKEN,
131
+ )
132
+
133
+ # Temporary save tournament results
134
+ tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
135
+ with open(tournament_results_path, "w") as f:
136
+ json.dump(tournament_results, f)
137
+
138
+ api.upload_file(
139
+ path_or_fileobj=tournament_results_path,
140
+ path_in_repo="tournament.json",
141
+ repo_id=self.server_address,
142
+ repo_type=self.repo_type,
143
+ token=HF_TOKEN,
144
+ )
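To tie the pieces together, here is a minimal sketch of how app.py drives this class end to end. It assumes HF_TOKEN is set so the snapshot download and uploads can authenticate; the paths and metadata values are placeholders:

    from server import LeaderboardServer

    server = LeaderboardServer()  # downloads the current leaderboard snapshot

    metadata = {
        "team_name": "my_team",                  # placeholder values
        "submission_id": "run_1",
        "description": "baseline run",
        "link_to_model": "https://example.org/model",
    }

    # Runs the local tournament against all existing submissions and stashes
    # (tournament_results, model_id, file) in server.pre_submit; nothing is uploaded yet.
    server.prepare_model_for_submission("results/my_model.json", metadata)

    # Preview the ranking as it would look after this submission...
    preview = server.get_leaderboard(server.pre_submit[0])

    # ...and only on explicit confirmation upload the results file plus tournament.json.
    server.save_pre_submit()
    server.update_leaderboard()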
tasks_metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
1
+ {
2
+ "tasks": {
3
+ "benczechmark_propaganda_argumentace": {
4
+ "task": "benczechmark_propaganda_argumentace",
5
+ "name": "P-Argumentace",
6
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_argumentace"
7
+ },
8
+ "benczechmark_propaganda_fabulace": {
9
+ "task": "benczechmark_propaganda_fabulace",
10
+ "name": "P-Fabulace",
11
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_fabulace"
12
+ },
13
+ "benczechmark_propaganda_nazor": {
14
+ "task": "benczechmark_propaganda_nazor",
15
+ "name": "P-Názor",
16
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_nazor"
17
+ },
18
+ "benczechmark_propaganda_strach": {
19
+ "task": "benczechmark_propaganda_strach",
20
+ "name": "P-Strach",
21
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_strach"
22
+ },
23
+ "benczechmark_propaganda_zamereni": {
24
+ "task": "benczechmark_propaganda_zamereni",
25
+ "name": "P-Zaměření",
26
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_zamereni"
27
+ },
28
+ "benczechmark_propaganda_demonizace": {
29
+ "task": "benczechmark_propaganda_demonizace",
30
+ "name": "P-Demonizace",
31
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_demonizace"
32
+ },
33
+ "benczechmark_propaganda_lokace": {
34
+ "task": "benczechmark_propaganda_lokace",
35
+ "name": "P-Lokace",
36
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_lokace"
37
+ },
38
+ "benczechmark_propaganda_relativizace": {
39
+ "task": "benczechmark_propaganda_relativizace",
40
+ "name": "P-Relativizace",
41
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_relativizace"
42
+ },
43
+ "benczechmark_propaganda_vina": {
44
+ "task": "benczechmark_propaganda_vina",
45
+ "name": "P-Vina",
46
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_vina"
47
+ },
48
+ "benczechmark_propaganda_zanr": {
49
+ "task": "benczechmark_propaganda_zanr",
50
+ "name": "P-Žánr",
51
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_zanr"
52
+ },
53
+ "benczechmark_propaganda_emoce": {
54
+ "task": "benczechmark_propaganda_emoce",
55
+ "name": "P-Emoce",
56
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_emoce"
57
+ },
58
+ "benczechmark_propaganda_nalepkovani": {
59
+ "task": "benczechmark_propaganda_nalepkovani",
60
+ "name": "P-Nalepkování",
61
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_nalepkovani"
62
+ },
63
+ "benczechmark_propaganda_rusko": {
64
+ "task": "benczechmark_propaganda_rusko",
65
+ "name": "P-Rusko",
66
+ "source_url": "https://huggingface.co/datasets/CZLC/propaganda_rusko"
67
+ },
68
+ "benczechmark_sentiment_mall": {
69
+ "task": "benczechmark_sentiment_mall",
70
+ "name": "S-Mall",
71
+ "source_url": "https://huggingface.co/datasets/CZLC/mall_sentiment_balanced"
72
+ },
73
+ "benczechmark_sentiment_fb": {
74
+ "task": "benczechmark_sentiment_fb",
75
+ "name": "S-FB",
76
+ "source_url": "https://huggingface.co/datasets/CZLC/fb_sentiment_balanced"
77
+ },
78
+ "benczechmark_sentiment_csfd": {
79
+ "task": "benczechmark_sentiment_csfd",
80
+ "name": "S-CSFD",
81
+ "source_url": "https://huggingface.co/datasets/CZLC/csfd_sentiment_balanced"
82
+ },
83
+ "benczechmark_summarization": {
84
+ "task": "benczechmark_summarization",
85
+ "name": "Summarization",
86
+ "source_url": "https://huggingface.co/datasets/CZLC/sumeczech_downsampled"
87
+ },
88
+ "benczechmark_grammarerrorcorrection": {
89
+ "task": "benczechmark_grammarerrorcorrection",
90
+ "name": "Grammar Error Correction",
91
+ "source_url": "https://huggingface.co/datasets/CZLC/cs_gec"
92
+ },
93
+ "benczechmark_cs_naturalquestions": {
94
+ "task": "benczechmark_cs_naturalquestions",
95
+ "name": "CS Natural Questions",
96
+ "source_url": "https://huggingface.co/datasets/CZLC/cs_naturalquestions"
97
+ },
98
+ "benczechmark_cs_sqad32": {
99
+ "task": "benczechmark_cs_sqad32",
100
+ "name": "CS SQAD 3.2",
101
+ "source_url": "https://huggingface.co/datasets/CZLC/SQAD_3.2"
102
+ },
103
+ "benczechmark_cs_triviaQA": {
104
+ "task": "benczechmark_cs_triviaQA",
105
+ "name": "CS TriviaQA",
106
+ "source_url": "https://huggingface.co/datasets/CZLC/cs_triviaqa"
107
+ },
108
+ "benczechmark_csfever_nli": {
109
+ "task": "benczechmark_csfever_nli",
110
+ "name": "CSFever NLI",
111
+ "source_url": "https://huggingface.co/datasets/CZLC/ctu-aic/csfever_nli"
112
+ },
113
+ "benczechmark_ctkfacts_nli": {
114
+ "task": "benczechmark_ctkfacts_nli",
115
+ "name": "CTKFacts NLI",
116
+ "source_url": "https://huggingface.co/datasets/CZLC/ctu-aic/ctkfacts_nli"
117
+ },
118
+ "benczechmark_cs_ner": {
119
+ "task": "benczechmark_cs_ner",
120
+ "name": "CS NER",
121
+ "source_url": "https://huggingface.co/datasets/CZLC/fewshot-goes-multilingual/cs_czech-named-entity-corpus_2.0"
122
+ },
123
+ "benczechmark_hellaswag": {
124
+ "task": "benczechmark_hellaswag",
125
+ "name": "HellaSwag",
126
+ "source_url": "https://huggingface.co/datasets/CZLC/cs_hellaswag"
127
+ },
128
+ "benczechmark_histcorpus": {
129
+ "task": "benczechmark_histcorpus",
130
+ "name": "HistCorpus",
131
+ "source_url": "https://huggingface.co/datasets/CZLC/benczechmark_histcorpus"
132
+ },
133
+ "benczechmark_klokan_qa": {
134
+ "task": "benczechmark_klokan_qa",
135
+ "name": "Klokan QA",
136
+ "source_url": "https://huggingface.co/datasets/hynky/klokan-qa"
137
+ },
138
+ "benczechmark_cs_court_decisions_ner": {
139
+ "task": "benczechmark_cs_court_decisions_ner",
140
+ "name": "CS Court Decisions NER",
141
+ "source_url": "https://huggingface.co/datasets/CZLC/fewshot-goes-multilingual/cs_czech-court-decisions-ner"
142
+ },
143
+ "benczechmark_umimeto_biology": {
144
+ "task": "benczechmark_umimeto_biology",
145
+ "name": "Umimeto.cz - Biology",
146
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-biology"
147
+ },
148
+ "benczechmark_umimeto_chemistry": {
149
+ "task": "benczechmark_umimeto_chemistry",
150
+ "name": "Umimeto.cz - Chemistry",
151
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-chemistry"
152
+ },
153
+ "benczechmark_umimeto_czech": {
154
+ "task": "benczechmark_umimeto_czech",
155
+ "name": "Umimeto.cz - Czech",
156
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-czech"
157
+ },
158
+ "benczechmark_umimeto_history": {
159
+ "task": "benczechmark_umimeto_history",
160
+ "name": "Umimeto.cz - History",
161
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-history"
162
+ },
163
+ "benczechmark_umimeto_informatics": {
164
+ "task": "benczechmark_umimeto_informatics",
165
+ "name": "Umimeto.cz - Informatics",
166
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-informatics"
167
+ },
168
+ "benczechmark_umimeto_math": {
169
+ "task": "benczechmark_umimeto_math",
170
+ "name": "Umimeto.cz - Math",
171
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-math"
172
+ },
173
+ "benczechmark_umimeto_physics": {
174
+ "task": "benczechmark_umimeto_physics",
175
+ "name": "Umimeto.cz - Physics",
176
+ "source_url": "https://huggingface.co/datasets/CZLC/umimeto-physics"
177
+ },
178
+ "benczechmark_cermat_czmath_mc": {
179
+ "task": "benczechmark_cermat_czmath_mc",
180
+ "name": "Cermat Czech Math MC",
181
+ "source_url": "https://huggingface.co/datasets/CZLC/cermat_math_mc"
182
+ },
183
+ "benczechmark_cermat_czmath_open": {
184
+ "task": "benczechmark_cermat_czmath_open",
185
+ "name": "Cermat Czech Math Open",
186
+ "source_url": "https://huggingface.co/datasets/CZLC/cermat_math_open"
187
+ },
188
+ "benczechmark_cermat_czech_tf": {
189
+ "task": "benczechmark_cermat_czech_tf",
190
+ "name": "Cermat Czech Language TF",
191
+ "source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_tf"
192
+ },
193
+ "benczechmark_cermat_czech_mc": {
194
+ "task": "benczechmark_cermat_czech_mc",
195
+ "name": "Cermat Czech Language MC",
196
+ "source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_mc"
197
+ },
198
+ "benczechmark_cermat_czech_open": {
199
+ "task": "benczechmark_cermat_czech_open",
200
+ "name": "Cermat Czech Language Open",
201
+ "source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_open"
202
+ }
203
+ }
204
+ }
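This file doubles as the authoritative task list: server.py keys the tournament off tasks.keys(), and read_json in compare_significance.py rejects submissions that do not cover every task. A small sketch of that check; the results path is a placeholder:

    import json

    with open("tasks_metadata.json") as f:
        tasks = json.load(f)["tasks"]

    with open("results/my_model.json") as f:   # placeholder path
        predictions = json.load(f)["predictions"]

    missing = [task for task in tasks if task not in predictions]
    if missing:
        raise ValueError("Missing tasks: " + ", ".join(missing))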