jstetina committed
Commit 49d6897
1 Parent(s): f43bf19
Files changed (4):
  1. app.py +128 -31
  2. compare_significance.py +231 -0
  3. model_compare.py +62 -0
  4. requirements.txt +4 -1
app.py CHANGED
@@ -18,13 +18,14 @@ import gradio as gr
 
 from huggingface_hub import HfApi, snapshot_download
 
+from compare_significance import check_significance, SUPPORTED_METRICS
+from model_compare import ModelCompare
 
 JSON_DATASET_DIR = Path("../json_dataset")
 JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
 
 JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
 
-
 api = HfApi()
 
 ORG= "CZLC"
@@ -38,28 +39,81 @@ DATASET_VERSIONS = ['dev-set-1', 'dev-set-2']
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-
-
 class LeaderboardServer:
     def __init__(self, server_address):
         self.server_address = server_address
         self.repo_type = "dataset"
-        self.local_leaderboard = snapshot_download(self.server_address,repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
-        print(self.local_leaderboard)
+        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
+        self.submisssion_id_to_file = {} # Map submission ids to file paths
+
     def on_submit(self):
         self.local_leaderboard = snapshot_download(self.server_address,repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
 
     def get_leaderboard(self):
         results = []
-        print(os.listdir(self.local_leaderboard))
+
+        new_results = []
+        submission_ids = set()
+
+        # pre-computed ranks
+        with open(os.path.join(self.local_leaderboard, "metadata", "ranks.json")) as ranks_file:
+            ranks = json.load(ranks_file)
+        model_compare = ModelCompare()
+        ranks = model_compare.get_tasks_ranks(ranks)
+
+        # Models data
         for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
             data = json.load(open(submission))
             submission_id = data["metadata"]["model_description"]
-            local_results = {group: data["results"][group]['acc'] for group in data['results']}
+
+            if submission_id in submission_ids:
+                continue
+            submission_ids.add(submission_id)
+
+            self.submisssion_id_to_file[submission_id] = submission
+
+
+            local_results = {task: list(task_ranks).index(submission_id)+1 for task, task_ranks in ranks.items()}
             local_results["submission_id"] = submission_id
             results.append(local_results)
         dataframe = pd.DataFrame.from_records(results)
+        # Reorder to have the id (model description) first
+        df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
+        dataframe = dataframe[df_order]
         return dataframe
+
+    def compute_ranks(self):
+        ''' Compute rankings on every submit '''
+
+        self.get_leaderboard()
+
+        ids = list(self.submisssion_id_to_file.keys())
+        rankings = {id: {} for id in ids}
+
+        for a_idx in range(len(ids)):
+            for b_idx in range(a_idx+1, len(ids)):
+                modelA_id = ids[a_idx]
+                modelB_id = ids[b_idx]
+                res = self.compare_models(modelA_id, modelB_id)
+                rankings[modelA_id][modelB_id] = {
+                    task: data["significant"] for task, data in res.items()
+                }
+                rankings[modelB_id][modelA_id] = {
+                    task: not data["significant"] for task, data in res.items()
+                }
+
+        return rankings
+
+
+    def compare_models(self, modelA, modelB):
+        modelA_path = self.submisssion_id_to_file.get(modelA)
+        modelB_path = self.submisssion_id_to_file.get(modelB)
+        return check_significance(modelA_path, modelB_path)
+
+
+    def get_rankings(self):
+        # TODO retrieve saved rankings for models on tasks
+        pass
 
     def save_json(self,file, submission_name) -> None:
         filename = os.path.basename(file)
@@ -72,11 +126,7 @@ class LeaderboardServer:
         )
 
 
-
-
 leaderboard_server = LeaderboardServer(REPO)
-
-
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
@@ -86,6 +136,8 @@ MAX_SUBMISSIONS_PER_24H = 2
 # CHALLENGE_NAME = 'NOTSOFAR1'
 
 
+
+
 # if __name__ == '__main__':
 with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
     app_state = gr.State({})
@@ -136,9 +188,9 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility
         return gr.Tabs(selected=first_tab_name), populate_leaderboard(first_tab_name, None)
 
 
-    with gr.Tab('Leaderboards') as leaderboards_tab:
-        with gr.Row():
-            gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
+    with gr.Tab('Leaderboard') as leaderboards_tab:
+        # with gr.Row():
+        #     gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
     # with gr.Row():
     #     with gr.Column():
     #         dataset_version_drop = gr.Dropdown(choices=DATASET_VERSIONS, multiselect=False,
@@ -150,17 +202,35 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility
    #             gr.Markdown('') # Empty column for spacing
    #         with gr.Column():
    #             gr.Markdown('') # Empty column for spacing
-        with gr.Row():
-            with gr.Tabs() as leaderboards_tabs:
-                leaderboard_tables_list = []
-                for leaderboard_idx, leaderboard_type in enumerate(LEADERBOARD_TYPES):
-                    l_tab = create_leaderboard_tab(leaderboard_type, leaderboard_idx, None)
-                    leaderboard_tables_list.append(l_tab)
+        # with gr.Row():
+        #     with gr.Tabs() as leaderboards_tabs:
+        #         leaderboard_tables_list = []
+        #         for leaderboard_idx, leaderboard_type in enumerate(LEADERBOARD_TYPES):
+        #             l_tab = create_leaderboard_tab(leaderboard_type, leaderboard_idx, None)
+        #             leaderboard_tables_list.append(l_tab)
+
+        # change the table based on the selected model
+        def on_dropdown_change(model_detail):
+            leaderboard = leaderboard_server.get_leaderboard()
+            return leaderboard[leaderboard["submission_id"] == model_detail]
+
+        results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None, visible=True)
+        model_detail = gr.Dropdown(choices=list(leaderboard_server.get_leaderboard()["submission_id"]), label="Select model", interactive=True)
+        model_detail_button = gr.Button("Show model detail", interactive=True)
+        model_detail_button.click(
+            fn=on_dropdown_change,
+            inputs=[model_detail],
+            outputs=[results_table]
+        )
+
+        # results_table.select(fn=on_dropdown_change, inputs=[model_detail], outputs=[results_table])
 
     # dataset_version_drop.select(fn=on_dropdown_change, inputs=[dataset_version_drop],
     #                             outputs=[leaderboards_tabs, leaderboard_tables_list[0]])
 
+
 
+    ##################
     # Submission Tab #
     ##################
     with gr.Tab('Submission'):
@@ -217,30 +287,57 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility
             # leaderboard_tab.render()
             return gr.update(value='Submit', interactive=True)
 
-        gr.Markdown(SUBMISSION_TAB_TITLE_MARKDOWN)
+        def show_leaderboard():
+            gr.Info("Loding leaderboard...")
+            return leaderboard_server.get_leaderboard()
+
+        gr.Markdown(
+            """
+            # Model submission
+            Model can be compared with other models and submitted\n
+            Click **Compare results** to compare your model with other models in the leaderboard\n
+            Click **Submit results** to submit your model to the leaderboard
+            (Comparison by itself is not a submission)
+            """
+        )
+
         submission_team_name_tb = gr.Textbox(label='Team Name')
-        submission_file_path = gr.File(label='Upload your results', type='filepath')
-        submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
+        # submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
+        with gr.Row():
+            description_tb = gr.Textbox(label='Description', type='text')
+            link_to_model_tb = gr.Textbox(label='Link to model', type='text')
+
        with gr.Row():
            hf_token_tb = gr.Textbox(label='Token', type='password')
            submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
-        description_tb = gr.Textbox(label='Description', type='text')
-        submission_btn = gr.Button(value='Submit', interactive=True)
-
+
+        submission_file_path = gr.File(label='Upload your results', type='filepath')
+        compare_results_button = gr.DataFrame(show_leaderboard(), interactive=False, label=None, visible=True)
+
+        # Button that triggers shows the current leaderboard
+        show_results_button = gr.Button("Compare results", interactive=True)
+        show_results_button.click(
+            fn=show_leaderboard,
+            outputs=[compare_results_button]
+        )
+
+        submission_btn = gr.Button(value='Submit results', interactive=True)
        submission_btn.click(
            fn=on_submit_pressed,
            outputs=[submission_btn]
        ).then(
            fn=process_submission,
-            inputs=[submission_team_name_tb, submission_file_path,
-                    submission_type_radio, description_tb, app_state]
+            inputs=[submission_team_name_tb, submission_file_path, description_tb, app_state]
        ).then(
            fn=on_submit_done,
            outputs=[submission_btn]
-        ).then(
-            fn=on_dropdown_change,
-            outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
        )
+
+        # .then(
+        #     fn=on_dropdown_change,
+        #     outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
+        # )
+
 
         # # My Submissions Tab #
         # ######################
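
For illustration only (not part of the commit; the model and task names below are invented): the reworked get_leaderboard() no longer reports raw accuracy, it reports each submission's 1-based position in the per-task orderings returned by ModelCompare.get_tasks_ranks(). A minimal self-contained sketch of that conversion:

import pandas as pd

# Hypothetical output of ModelCompare.get_tasks_ranks(): task -> ordered model ids
ranks = {
    "propaganda_vina": ["model-B", "model-A"],
    "benczechmark_sentiment": ["model-A", "model-B"],
}

results = []
for submission_id in ["model-A", "model-B"]:
    # Same dict comprehension as in get_leaderboard(): position in the ordering, 1-based
    local_results = {task: list(task_ranks).index(submission_id) + 1
                     for task, task_ranks in ranks.items()}
    local_results["submission_id"] = submission_id
    results.append(local_results)

dataframe = pd.DataFrame.from_records(results)
# Put the submission id first, as the commit does
dataframe = dataframe[["submission_id"] + [c for c in dataframe.columns if c != "submission_id"]]
print(dataframe)
# model-A gets rank 2 on propaganda_vina and 1 on benczechmark_sentiment; model-B the reverse.

In the Space itself the orderings come from metadata/ranks.json rather than a hand-written dict.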
compare_significance.py ADDED
@@ -0,0 +1,231 @@
+import argparse
+import json
+from collections import defaultdict
+from typing import Sequence
+
+import numpy
+import numpy as np
+from scipy.stats import ttest_ind, ttest_rel
+from sklearn.metrics import roc_curve, auc
+from tqdm import tqdm
+
+# from leaderboard import SUPPORTED_METRICS
+
+SUPPORTED_METRICS = [
+    "avg_mcauroc",  # for classification tasks
+    "em",  # for QA tasks
+    "acc",  # for multichoice tasks
+    "rouge",  # for summarization tasks
+    "ppl",  # for language modeling tasks
+]
+
+
+def _get_CMs(i, probabilities, references, thresholds):
+    confusion_matrices = []
+    for threshold in thresholds[i]:
+        TP = 0
+        FP = 0
+        TN = 0
+        FN = 0
+        for j in range(len(probabilities)):
+            if probabilities[j][i] >= threshold:
+                if references[j] == i:
+                    TP += 1
+                else:
+                    FP += 1
+            else:
+                if references[j] == i:
+                    FN += 1
+                else:
+                    TN += 1
+        cm = {"TP": TP, "FP": FP, "TN": TN, "FN": FN, "threshold": threshold, "class": i}
+        confusion_matrices.append(cm)
+
+    return confusion_matrices
+
+
+def compute_significance_accuracy(predsA, referencesA, predsB, referencesB):
+    # following https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L164C32-L165C24
+    # Calculate the T-test on TWO RELATED samples of scores, a and b. for one sided test we multiply p-value by half
+    scores_A = [1 if pred == ref else 0 for pred, ref in zip(predsA, referencesA)]
+    scores_B = [1 if pred == ref else 0 for pred, ref in zip(predsB, referencesB)]
+    t, p = ttest_rel(scores_A, scores_B)
+    # correct for one-tailed test
+    p_value = p / 2
+    delta = np.mean(scores_A) - np.mean(scores_B)
+    return p_value, delta
+
+def compute_significance_em(predsA, referencesA, predsB, referencesB):
+    pass
+def compute_significance_rouge(predsA, referencesA, predsB, referencesB):
+    # TODO: MDocekal
+    # Use bootstrapping
+    # https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L89
+    pass
+def compute_significance_ppl(predsA, referencesA, predsB, referencesB):
+    # TODO: MDocekal
+    # Use bootstrapping
+    # https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L89
+    pass
+def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
+                                     probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
+    # compute MC-AUC for model A
+    model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=1_000)
+    model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=1_000)
+
+    # one-tailed test
+    p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
+               / (len(model_A_scores) * len(model_B_scores)))
+
+    delta = np.mean(model_A_scores) - np.mean(model_B_scores)
+    return p_value, delta
+
+
+def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
+    n_classes = list(range(len(probs[0])))
+    fpr = dict()
+    thresholds = dict()
+    # compute AUC for every class
+    auc_scores_per_class = []
+    for i in range(len(n_classes)):
+        # for i-th class vs all others
+        fpr[i], _, thresholds[i] = roc_curve(y_true=[1 if x == n_classes[i] else 0 for x in references],
+                                             y_score=[prob[i] for prob in probs])
+
+        confusion_matrices = _get_CMs(i, probs, references, thresholds)
+
+        λ = 1.0  # <- Flat prior
+        # λ = 0.5  # <- Jeffrey's prior
+
+        # sample variates for every threshold
+        tpr_variates_for_each_fpr = []
+        for k in range(len(thresholds[i])):
+            tpr_variates_for_each_fpr.append(
+                numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, Nsamples))
+
+        # fprs x tpr_variates
+        tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
+
+        # now pick 1 variate for each fpr, and compute AUC
+        auc_scores = []
+        for tpr_variates in tqdm(tpr_variates_for_each_fpr.T,
+                                 desc=f"Computing AUCs for class {i + 1}/{len(n_classes)}"):
+            auc_score = auc(fpr[i], tpr_variates)
+            # if numpy.isnan(auc_score):
+            #     auc_score = 0
+            auc_scores.append(auc_score)
+        auc_scores_per_class.append(auc_scores)
+
+    auc_scores_per_class = np.array(auc_scores_per_class)
+    mcauc_scores = np.mean(auc_scores_per_class, axis=0)
+    return mcauc_scores
+
+
+def read_json(file_path):
+    data = defaultdict(list)
+    with open(file_path, "r") as f:
+        fc = json.load(f)
+    for task, results in fc["predictions"].items():
+        # determine the metric
+        metric = None
+        for key in SUPPORTED_METRICS:
+            if key in results[0]:
+                metric = key
+                break
+        if metric is None:
+            raise ValueError(f"Unsupported metric in {file_path}")
+
+        if metric == "avg_mcauroc":
+            local_data = [line[metric] for line in fc["predictions"][task]]
+            unzipped_list = list(zip(*local_data))
+            golds = unzipped_list[0]
+            probs = unzipped_list[1]
+            data[task] = (golds, probs), metric
+    return data, fc["metadata"]
+
+
+def check_significance_task(fileA, fileB, task, significance_level=0.05):
+
+    dataA, metadataA = read_json(fileA)
+    dataB, metadataB = read_json(fileB)
+
+    print("DEBUG", fileA, task, dataA[task])
+
+    decisions = dict()
+
+    metricA = dataA[task][1]
+    metricB = dataB[task][1]
+    assert metricA == metricB
+    assert len(dataA[task]) == len(dataB[task])
+
+    if metricA == "avg_mcauroc":
+        p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+                                                          probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
+
+    elif metricA == "acc":
+        p_value, delta = compute_significance_accuracy(predsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+                                                       predsB=dataB[task][0][1], referencesB=dataB[task][0][0])
+    elif metricA == "em":
+        raise NotImplementedError("Exact match is not supported yet.")
+    elif metricA == "rouge":
+        raise NotImplementedError("Rouge is not supported yet.")
+    elif metricA == "ppl":
+        raise NotImplementedError("Perplexity is not supported yet.")
+    else:
+        raise ValueError(f"Unsupported metric {metricA}")
+    decisions[task] = {
+        "significant": not (p_value > significance_level),
+        "p_value": p_value,
+        "delta": delta,
+    }
+    return decisions
+
+def check_significance(fileA, fileB, significance_level=0.05):
+    dataA, metadataA = read_json(fileA)
+    dataB, metadataB = read_json(fileB)
+
+    decisions = dict()
+    for task in dataA.keys():
+        metricA = dataA[task][1]
+        metricB = dataB[task][1]
+        assert metricA == metricB
+        assert len(dataA[task]) == len(dataB[task])
+
+        if metricA == "avg_mcauroc":
+            p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+                                                              probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
+
+        elif metricA == "acc":
+            p_value, delta = compute_significance_accuracy(predsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+                                                           predsB=dataB[task][0][1], referencesB=dataB[task][0][0])
+        elif metricA == "em":
+            raise NotImplementedError("Exact match is not supported yet.")
+        elif metricA == "rouge":
+            raise NotImplementedError("Rouge is not supported yet.")
+        elif metricA == "ppl":
+            raise NotImplementedError("Perplexity is not supported yet.")
+        else:
+            raise ValueError(f"Unsupported metric {metricA}")
+        decisions[task] = {
+            "significant": not (p_value > significance_level),
+            "p_value": p_value,
+            "delta": delta,
+        }
+    return decisions
+
+
+def main():
+    parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
+    parser.add_argument("--modelA", help="ModelA JSONL file from lm harness.")
+    parser.add_argument("--modelB", help="ModelB JSONL file from lm harness.")
+    parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
+    args = parser.parse_args()
+
+    result = check_significance(args.modelA, args.modelB, args.significance_level)
+    print(json.dumps(result, indent=2))
+
+    # harness already returns stderr estimate for sampling distribution
+    # see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
+
+if __name__ == "__main__":
+    main()
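
For illustration only (not part of the commit; the file names and predictions below are invented): the script is meant to be run as python compare_significance.py --modelA resultsA.json --modelB resultsB.json --significance_level 0.05, and the helpers can also be exercised directly. A small sketch of the paired-t-test path for accuracy, assuming compare_significance.py is importable and scipy is installed:

from compare_significance import compute_significance_accuracy

# Fabricated predictions, for illustration only
references = [0, 1, 1, 0, 1, 0, 1, 1]
preds_a = [0, 1, 1, 0, 1, 0, 1, 0]  # 7/8 correct
preds_b = [0, 1, 0, 0, 1, 1, 0, 0]  # 4/8 correct

# One-tailed paired t-test on per-item correctness, exactly what compute_significance_accuracy does
p_value, delta = compute_significance_accuracy(preds_a, references, preds_b, references)
print(f"accuracy delta = {delta:.3f}, one-tailed p = {p_value:.3f}")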
model_compare.py ADDED
@@ -0,0 +1,62 @@
+
+from functools import cmp_to_key
+from compare_significance import check_significance
+
+class ModelCompare():
+
+    TASKS = ["propaganda_demonizace",
+             "propaganda_vina",
+             "propaganda_relativizace",
+             "propaganda_argumentace",
+             "propaganda_lokace",
+             "propaganda_nazor",
+             "propaganda_emoce",
+             "propaganda_fabulace",
+             "propaganda_nalepkovani",
+             "propaganda_zamereni",
+             "propaganda_zanr",
+             "propaganda_rusko",
+             "propaganda_strach",
+             "benczechmark_sentiment"]
+
+    def __init__(self, ranks:dict=None):
+        self.ranks = ranks
+
+    def compare_models(self, modelA_id, modelB_id):
+        if not self.ranks:
+            raise Exception("Missing model rankings")
+
+        res = self.ranks[modelA_id][modelB_id][self.current_task]
+        if res == True:
+            return 1
+        elif res == False:
+            return -1
+        else:
+            return -1
+
+
+    def get_tasks_ranks(self, ranks:dict) -> dict:
+        '''Order models based on the significance improvement'''
+
+        self.ranks = ranks
+
+        tasks_ranks = {}
+
+        models = ranks.keys()
+        for task in self.TASKS:
+            self.current_task = task
+            tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
+        return tasks_ranks
+
+
+    # models = {
+    #     model1 : {
+    #         task1 : order_idx
+    #         task2 : order_idx
+    #         task3 : order_idx
+    #     }
+    # }
+
+
+
+
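
For illustration only (not part of the commit; the model names are invented and nothing is imported from the files above): how the pairwise significance flags produced by LeaderboardServer.compute_ranks() become a per-task ordering through cmp_to_key, mirroring ModelCompare.compare_models:

from functools import cmp_to_key

# Hypothetical compute_ranks() output: rankings[A][B][task] is True when A beats B significantly
rankings = {
    "model-A": {"model-B": {"propaganda_vina": True}},
    "model-B": {"model-A": {"propaganda_vina": False}},
}
task = "propaganda_vina"

def compare(model_a, model_b):
    # Mirrors ModelCompare.compare_models: 1 when model_a significantly improves over model_b, else -1
    return 1 if rankings[model_a][model_b][task] else -1

ordered = sorted(rankings.keys(), key=cmp_to_key(compare))
print(ordered)  # ['model-B', 'model-A']: the significantly better model sorts later

get_tasks_ranks() applies this comparator once per task in ModelCompare.TASKS, so every task key must be present for every model pair.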
requirements.txt CHANGED
@@ -4,4 +4,7 @@ azure-cosmos
 huggingface_hub
 requests
 Pyarrow
-tabulate
+tabulate
+scipy
+numpy
+scikit-learn