abhinav-joshi commited on
Commit
2b8f89d
1 Parent(s): eb68762

add baseline results

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -7,15 +7,14 @@ from uploads import add_new_eval
7
 
8
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
9
  CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
10
- title = "IL-TUR: Benchmark for Indian Legal Text Understanding and Reasoning",
11
- author = "Joshi, Abhinav and Paul, Shaunak Sharma, Akshat and Goyal, Pawan and Ghosh, Saptarshi and Modi, Ashutosh",
12
- booktitle = "Proceedings of the 62st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
13
- month = aug,
14
- year = "2024",
15
- address = "Bangkok, Thailand",
16
- publisher = "Association for Computational Linguistics",
17
- }
18
- }"""
19
 
20
  api = HfApi()
21
  TOKEN = os.environ.get("TOKEN", None)
@@ -27,7 +26,7 @@ def restart_space():
27
 
28
 
29
  # Function to load data from a given CSV file
30
- def baseline_load_data(tasks):
31
  # version = version.replace("%", "p")
32
  file_path = f"submissions/baseline/baseline.csv" # Replace with your file paths
33
  df = pd.read_csv(file_path)
@@ -46,6 +45,20 @@ def baseline_load_data(tasks):
46
  "SUMM",
47
  "Average",
48
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if tasks is None:
50
  breakpoint()
51
  # based on the tasks, remove the columns that are not needed
@@ -65,14 +78,77 @@ def baseline_load_data(tasks):
65
  column_names.remove("SUMM")
66
 
67
  df = df[column_names]
68
- df = df.sort_values(by="Average", ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  df = df.drop_duplicates(subset=["Method"], keep="first")
70
 
71
  return df
72
 
73
 
74
- def load_data(tasks):
75
- baseline_df = baseline_load_data(tasks)
76
 
77
  return baseline_df
78
 
@@ -86,8 +162,29 @@ def search_leaderboard(df, query):
86
 
87
 
88
  # Function to change the version of the leaderboard
89
- def change_version(tasks):
90
- new_df = load_data(tasks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  return new_df
92
 
93
 
@@ -120,6 +217,57 @@ with demo:
120
  label="Select Tasks",
121
  choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
122
  value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  )
124
 
125
  with gr.Row():
@@ -128,10 +276,22 @@ with demo:
128
  show_label=False,
129
  )
130
 
 
 
 
 
 
 
 
 
 
 
 
131
  leaderboard_table = gr.components.Dataframe(
132
  value=load_data(
133
  # "baseline",
134
  ["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
 
135
  ),
136
  interactive=True,
137
  visible=True,
@@ -151,31 +311,156 @@ with demo:
151
 
152
  search_bar.change(
153
  search_leaderboard,
 
 
 
 
 
 
154
  inputs=[
155
- leaderboard_table,
156
- search_bar,
157
- # tasks_checkbox
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  ],
159
  outputs=leaderboard_table,
160
  )
161
-
162
  tasks_checkbox.change(
163
  change_version,
164
- inputs=[tasks_checkbox],
 
 
 
 
 
 
 
 
 
 
165
  outputs=leaderboard_table,
166
  )
167
 
168
- with gr.Accordion("Submit a new model for evaluation"):
169
  with gr.Row():
170
  with gr.Column():
171
- method_name_textbox = gr.Textbox(label="Method name")
172
- url_textbox = gr.Textbox(label="Url to model information")
173
- with gr.Column():
174
  organisation = gr.Textbox(label="Organisation")
175
  mail = gr.Textbox(label="Contact email")
 
176
  file_output = gr.File()
177
-
178
- submit_button = gr.Button("Submit Eval")
179
  submission_result = gr.Markdown()
180
  submit_button.click(
181
  add_new_eval,
@@ -221,5 +506,5 @@ with demo:
221
  scheduler = BackgroundScheduler()
222
  scheduler.add_job(restart_space, "interval", seconds=3600)
223
  scheduler.start()
224
- # demo.launch(debug=True)
225
- demo.launch(share=True)
 
7
 
8
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
9
  CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
10
+ title = "IL-TUR: Benchmark for Indian Legal Text Understanding and Reasoning",
11
+ author = "Joshi, Abhinav and Paul, Shounak and Sharma, Akshat and Goyal, Pawan and Ghosh, Saptarshi and Modi, Ashutosh"
12
+ booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
13
+ month = aug,
14
+ year = "2024",
15
+ address = "Bangkok, Thailand",
16
+ publisher = "Association for Computational Linguistics",
17
+ }"""
 
18
 
19
  api = HfApi()
20
  TOKEN = os.environ.get("TOKEN", None)
 
26
 
27
 
28
  # Function to load data from a given CSV file
29
+ def baseline_load_data(tasks, task_metrics):
30
  # version = version.replace("%", "p")
31
  file_path = f"submissions/baseline/baseline.csv" # Replace with your file paths
32
  df = pd.read_csv(file_path)
 
45
  "SUMM",
46
  "Average",
47
  ]
48
+ # Method,Submitted by,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
49
+ column_names = [
50
+ "Method",
51
+ "Submitted By",
52
+ "L-NER",
53
+ "RR",
54
+ "CJPE",
55
+ "BAIL",
56
+ "LSI",
57
+ "PCR",
58
+ "SUMM",
59
+ # "Average",
60
+ ]
61
+
62
  if tasks is None:
63
  breakpoint()
64
  # based on the tasks, remove the columns that are not needed
 
78
  column_names.remove("SUMM")
79
 
80
  df = df[column_names]
81
+
82
+ import json
83
+
84
+ # load the results json file
85
+ with open("submissions/baseline/results.json") as f:
86
+ results = json.load(f)
87
+ # add the results to the dataframe
88
+ # Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
89
+ # Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
90
+ # create a new df to display the results
91
+ results_df = pd.DataFrame(
92
+ columns=[
93
+ "Method",
94
+ "Submitted By",
95
+ "Github Link",
96
+ "L-NER",
97
+ "RR",
98
+ "CJPE",
99
+ "BAIL",
100
+ "LSI",
101
+ "PCR",
102
+ "SUMM",
103
+ "L-MT",
104
+ # "Average",
105
+ ]
106
+ )
107
+ # breakpoint()
108
+ for entry in results:
109
+ results_df = results_df.append(
110
+ {
111
+ "Method": entry["Method"],
112
+ "Submitted By": entry["Submitted By"],
113
+ "Github Link": entry["Github Link"],
114
+ "L-NER": entry["L-NER"][task_metrics["L-NER"]],
115
+ "RR": entry["RR"][task_metrics["RR"]],
116
+ "CJPE": entry["CJPE"][task_metrics["CJPE"]],
117
+ "BAIL": entry["BAIL"][task_metrics["BAIL"]],
118
+ "LSI": entry["LSI"][task_metrics["LSI"]],
119
+ "PCR": entry["PCR"][task_metrics["PCR"]],
120
+ "SUMM": entry["SUMM"][task_metrics["SUMM"]],
121
+ "L-MT": entry["L-MT"][task_metrics["L-MT"]],
122
+ # "Average": ,
123
+ },
124
+ ignore_index=True,
125
+ )
126
+
127
+ # breakpoint()
128
+ # add the average column
129
+ # results_df["Average"] = results_df.mean(axis=1)
130
+
131
+ df = results_df
132
+ # df = df.sort_values(by="Average", ascending=False)
133
+ # remove the columns that are not in tasks
134
+ selected_columns = (
135
+ [
136
+ "Method",
137
+ "Submitted By",
138
+ ]
139
+ + tasks
140
+ + ["Github Link"]
141
+ )
142
+ print(tasks)
143
+ df = df[selected_columns]
144
+
145
  df = df.drop_duplicates(subset=["Method"], keep="first")
146
 
147
  return df
148
 
149
 
150
+ def load_data(tasks, task_metrics):
151
+ baseline_df = baseline_load_data(tasks, task_metrics)
152
 
153
  return baseline_df
154
 
 
162
 
163
 
164
  # Function to change the version of the leaderboard
165
+ def change_version(
166
+ tasks,
167
+ l_ner_metric,
168
+ rr_metric,
169
+ cjpe_metric,
170
+ bail_metric,
171
+ lsi_metric,
172
+ pcr_metric,
173
+ summ_metric,
174
+ lmt_metric,
175
+ ):
176
+ task_metrics = {
177
+ "L-NER": l_ner_metric,
178
+ "RR": rr_metric,
179
+ "CJPE": cjpe_metric,
180
+ "BAIL": bail_metric,
181
+ "LSI": lsi_metric,
182
+ "PCR": pcr_metric,
183
+ "SUMM": summ_metric,
184
+ "L-MT": lmt_metric,
185
+ }
186
+
187
+ new_df = load_data(tasks, task_metrics)
188
  return new_df
189
 
190
 
 
217
  label="Select Tasks",
218
  choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
219
  value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
220
+ interactive=True,
221
+ )
222
+
223
+ with gr.Row():
224
+ l_ner_metric = gr.Radio(
225
+ label="L-NER",
226
+ choices=["strict mF1"],
227
+ value="strict mF1",
228
+ interactive=True,
229
+ )
230
+ rr_metric = gr.Radio(
231
+ label="RR",
232
+ choices=["mF1"],
233
+ value="mF1",
234
+ interactive=True,
235
+ )
236
+ cjpe_metric = gr.Radio(
237
+ label="CJPE",
238
+ choices=["mF1", "ROUGE-L", "BLEU"],
239
+ value="mF1",
240
+ interactive=True,
241
+ )
242
+ bail_metric = gr.Radio(
243
+ label="BAIL",
244
+ choices=["mF1"],
245
+ value="mF1",
246
+ interactive=True,
247
+ )
248
+ lsi_metric = gr.Radio(
249
+ label="LSI",
250
+ choices=["mF1"],
251
+ value="mF1",
252
+ interactive=True,
253
+ )
254
+ pcr_metric = gr.Radio(
255
+ label="PCR",
256
+ choices=["muF1@K"],
257
+ value="muF1@K",
258
+ interactive=True,
259
+ )
260
+ summ_metric = gr.Radio(
261
+ label="SUMM",
262
+ choices=["ROUGE-L", "BERTSCORE"],
263
+ value="ROUGE-L",
264
+ interactive=True,
265
+ )
266
+ lmt_metric = gr.Radio(
267
+ label="L-MT",
268
+ choices=["BLEU", "GLEU", "chrF++"],
269
+ value="BLEU",
270
+ interactive=True,
271
  )
272
 
273
  with gr.Row():
 
276
  show_label=False,
277
  )
278
 
279
+ task_metrics = {
280
+ "L-NER": l_ner_metric.value,
281
+ "RR": rr_metric.value,
282
+ "CJPE": cjpe_metric.value,
283
+ "BAIL": bail_metric.value,
284
+ "LSI": lsi_metric.value,
285
+ "PCR": pcr_metric.value,
286
+ "SUMM": summ_metric.value,
287
+ "L-MT": lmt_metric.value,
288
+ }
289
+
290
  leaderboard_table = gr.components.Dataframe(
291
  value=load_data(
292
  # "baseline",
293
  ["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
294
+ task_metrics=task_metrics,
295
  ),
296
  interactive=True,
297
  visible=True,
 
311
 
312
  search_bar.change(
313
  search_leaderboard,
314
+ inputs=[leaderboard_table, search_bar],
315
+ outputs=leaderboard_table,
316
+ )
317
+ # breakpoint()
318
+ l_ner_metric.change(
319
+ change_version,
320
  inputs=[
321
+ tasks_checkbox,
322
+ l_ner_metric,
323
+ rr_metric,
324
+ cjpe_metric,
325
+ bail_metric,
326
+ lsi_metric,
327
+ pcr_metric,
328
+ summ_metric,
329
+ lmt_metric,
330
+ ],
331
+ outputs=leaderboard_table,
332
+ )
333
+ rr_metric.change(
334
+ change_version,
335
+ inputs=[
336
+ tasks_checkbox,
337
+ l_ner_metric,
338
+ rr_metric,
339
+ cjpe_metric,
340
+ bail_metric,
341
+ lsi_metric,
342
+ pcr_metric,
343
+ summ_metric,
344
+ lmt_metric,
345
+ ],
346
+ outputs=leaderboard_table,
347
+ )
348
+ cjpe_metric.change(
349
+ change_version,
350
+ inputs=[
351
+ tasks_checkbox,
352
+ l_ner_metric,
353
+ rr_metric,
354
+ cjpe_metric,
355
+ bail_metric,
356
+ lsi_metric,
357
+ pcr_metric,
358
+ summ_metric,
359
+ lmt_metric,
360
+ ],
361
+ outputs=leaderboard_table,
362
+ )
363
+ bail_metric.change(
364
+ change_version,
365
+ inputs=[
366
+ tasks_checkbox,
367
+ l_ner_metric,
368
+ rr_metric,
369
+ cjpe_metric,
370
+ bail_metric,
371
+ lsi_metric,
372
+ pcr_metric,
373
+ summ_metric,
374
+ lmt_metric,
375
+ ],
376
+ outputs=leaderboard_table,
377
+ )
378
+ lsi_metric.change(
379
+ change_version,
380
+ inputs=[
381
+ tasks_checkbox,
382
+ l_ner_metric,
383
+ rr_metric,
384
+ cjpe_metric,
385
+ bail_metric,
386
+ lsi_metric,
387
+ pcr_metric,
388
+ summ_metric,
389
+ lmt_metric,
390
+ ],
391
+ outputs=leaderboard_table,
392
+ )
393
+ pcr_metric.change(
394
+ change_version,
395
+ inputs=[
396
+ tasks_checkbox,
397
+ l_ner_metric,
398
+ rr_metric,
399
+ cjpe_metric,
400
+ bail_metric,
401
+ lsi_metric,
402
+ pcr_metric,
403
+ summ_metric,
404
+ lmt_metric,
405
+ ],
406
+ outputs=leaderboard_table,
407
+ )
408
+ summ_metric.change(
409
+ change_version,
410
+ inputs=[
411
+ tasks_checkbox,
412
+ l_ner_metric,
413
+ rr_metric,
414
+ cjpe_metric,
415
+ bail_metric,
416
+ lsi_metric,
417
+ pcr_metric,
418
+ summ_metric,
419
+ lmt_metric,
420
+ ],
421
+ outputs=leaderboard_table,
422
+ )
423
+ lmt_metric.change(
424
+ change_version,
425
+ inputs=[
426
+ tasks_checkbox,
427
+ l_ner_metric,
428
+ rr_metric,
429
+ cjpe_metric,
430
+ bail_metric,
431
+ lsi_metric,
432
+ pcr_metric,
433
+ summ_metric,
434
+ lmt_metric,
435
  ],
436
  outputs=leaderboard_table,
437
  )
 
438
  tasks_checkbox.change(
439
  change_version,
440
+ inputs=[
441
+ tasks_checkbox,
442
+ l_ner_metric,
443
+ rr_metric,
444
+ cjpe_metric,
445
+ bail_metric,
446
+ lsi_metric,
447
+ pcr_metric,
448
+ summ_metric,
449
+ lmt_metric,
450
+ ],
451
  outputs=leaderboard_table,
452
  )
453
 
454
+ with gr.Accordion("Submit the results of your Method"):
455
  with gr.Row():
456
  with gr.Column():
457
+ method_name_textbox = gr.Textbox(label="Method")
458
+ url_textbox = gr.Textbox(label="Github Link")
 
459
  organisation = gr.Textbox(label="Organisation")
460
  mail = gr.Textbox(label="Contact email")
461
+ with gr.Column():
462
  file_output = gr.File()
463
+ submit_button = gr.Button("Submit Eval")
 
464
  submission_result = gr.Markdown()
465
  submit_button.click(
466
  add_new_eval,
 
506
  scheduler = BackgroundScheduler()
507
  scheduler.add_job(restart_space, "interval", seconds=3600)
508
  scheduler.start()
509
+ demo.launch(debug=True)
510
+ # demo.launch(share=True)
dummy.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ # load the results json file
4
+ with open("submissions/baseline/results.json") as f:
5
+ results = json.load(f)
6
+
7
+
8
+ # update the results
9
+ with open("submissions/baseline/submission.json") as f:
10
+ submission = json.load(f)
11
+
12
+
13
+ breakpoint()
14
+ # update the results
15
+ results.append(submission[0])
submissions/.DS_Store CHANGED
Binary files a/submissions/.DS_Store and b/submissions/.DS_Store differ
 
submissions/baseline/baseline -pre2.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
2
+ Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
3
+ SOTA,various,48.58,69.01,81.31|56.00|32.00,81,28.08,39.15,33.00|86.00,28.00|32.00|57.00
4
+ BERT,various,39.59,58,71.14|-|-,-,18.44,9.24,-|-,-|-|-
5
+ LegalBERT,various,45.58,54,78.21|-|-,-,21.74,8.67,-|-,-|-|-
6
+ InLegalBERT,various,48.58,58,81.31|-|-,-,26.23,7.57,-|-,-|-|-
7
+ GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17|30.00|8.00,51.04,21.55,-,21.00|85.00,23.00|28.00|42.00
8
+ GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46|29.00|15.00,46.35,22.61,-,20.00|84.00,25.00|28.00|43.00
9
+ GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74|30.00|11.00,61,21.4,-,22.00|84.00,26.00|29.00|43.00
10
+ GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29|40.00|14.00,51.46,23.99,-,23.00|85.00,33.00|36.00|50.00
11
+ GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26|39.00|16.00,56.9,22.26,-,16.00|81.00,35.00|38.00|52.00
12
+ GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44|43.00|18.00,66.67,20.53,-,17.00|81.00,36.00|39.00|53.00
submissions/baseline/baseline-pre.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Unnamed: 0,index,Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,Average
2
+ ,0,baseline,baseline,0,0,0,0,0,0,0,0
3
+ ,0,baseline2,baseline2,0,0,0,0,0,0,0,0
4
+ ,0,baseline,baseline,0,0,0,0,0,0,0,0
5
+ ,0,random,random,0,0,0,0,0,0,0,0
6
+ ,0,random2,random22,0,0,0,0,0,0,0,0
7
+ ,0,random5,random55,0,0,0,0,0,0,0,0
submissions/baseline/baseline.csv CHANGED
@@ -1,7 +1,11 @@
1
- Unnamed: 0,index,Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,Average
2
- ,0,baseline,baseline,0,0,0,0,0,0,0,0
3
- ,0,baseline2,baseline2,0,0,0,0,0,0,0,0
4
- ,0,baseline,baseline,0,0,0,0,0,0,0,0
5
- ,0,random,random,0,0,0,0,0,0,0,0
6
- ,0,random2,random22,0,0,0,0,0,0,0,0
7
- ,0,random5,random55,0,0,0,0,0,0,0,0
 
 
 
 
 
1
+ Method,Submitted By,L-NER strict mF1,RR mF1,CJPE mF1,CJPE ROUGE-L,CJPE BLEU,BAIL mF1,LSI mF1,PCR muF1@K,SUMM ROUGE-L,SUMM BERTSCORE,L-MT BLEU,L-MT GLEU,L-MT chrF++
2
+ SOTA,various,48.58,69.01,81.31,56.00,32.00,81,28.08,39.15,33.00,86.00,28.00,32.00,57.00
3
+ BERT,various,39.59,58,71.14,-,-,-,18.44,9.24,-,-,-,-,-
4
+ LegalBERT,various,45.58,54,78.21,-,-,-,21.74,8.67,-,-,-,-,-
5
+ InLegalBERT,various,48.58,58,81.31,-,-,-,26.23,7.57,-,-,-,-,-
6
+ GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17,30.00,8.00,51.04,21.55,-,21.00,85.00,23.00,28.00,42.00
7
+ GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46,29.00,15.00,46.35,22.61,-,20.00,84.00,25.00,28.00,43.00
8
+ GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74,30.00,11.00,61,21.4,-,22.00,84.00,26.00,29.00,43.00
9
+ GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29,40.00,14.00,51.46,23.99,-,23.00,85.00,33.00,36.00,50.00
10
+ GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26,39.00,16.00,56.9,22.26,-,16.00,81.00,35.00,38.00,52.00
11
+ GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44,43.00,18.00,66.67,20.53,-,17.00,81.00,36.00,39.00,53.00
submissions/baseline/results-bacup.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "Method": "SOTA",
4
+ "Submitted By": "multiple",
5
+ "Github Link": "exploration-lab.github.io/IL-TUR/",
6
+ "L-NER": {"strict mF1": "48.58"},
7
+ "RR": {"mF1": "69.01"},
8
+ "CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
9
+ "BAIL": {"mF1": "81"},
10
+ "LSI": {"mF1": "28.08"},
11
+ "PCR": {"muF1@K": "39.15"},
12
+ "SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
13
+ "L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
14
+ },
15
+ {
16
+ "Method": "BERT",
17
+ "Submitted By": "multiple",
18
+ "Github Link": "",
19
+ "L-NER": {"strict mF1": "39.59"},
20
+ "RR": {"mF1": "58"},
21
+ "CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
22
+ "BAIL": {"mF1": "-"},
23
+ "LSI": {"mF1": "18.44"},
24
+ "PCR": {"muF1@K": "9.24"},
25
+ "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
26
+ "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
27
+ },
28
+ {
29
+ "Method": "LegalBERT",
30
+ "Submitted By": "multiple",
31
+ "Github Link": "",
32
+ "L-NER": {"strict mF1": "45.58"},
33
+ "RR": {"mF1": "54"},
34
+ "CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
35
+ "BAIL": {"mF1": "-"},
36
+ "LSI": {"mF1": "21.74"},
37
+ "PCR": {"muF1@K": "8.67"},
38
+ "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
39
+ "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
40
+ },
41
+ {
42
+ "Method": "InLegalBERT",
43
+ "Submitted By": "multiple",
44
+ "Github Link": "",
45
+ "L-NER": {"strict mF1": "48.58"},
46
+ "RR": {"mF1": "58"},
47
+ "CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
48
+ "BAIL": {"mF1": "-"},
49
+ "LSI": {"mF1": "26.23"},
50
+ "PCR": {"muF1@K": "7.57"},
51
+ "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
52
+ "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
53
+ },
54
+ {
55
+ "Method": "GPT-3.5 (0-shot)",
56
+ "Submitted By": "IL-TUR",
57
+ "Github Link": "",
58
+ "L-NER": {"strict mF1": "30.59"},
59
+ "RR": {"mF1": "30.95"},
60
+ "CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
61
+ "BAIL": {"mF1": "51.04"},
62
+ "LSI": {"mF1": "21.55"},
63
+ "PCR": {"muF1@K": "-"},
64
+ "SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
65
+ "L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
66
+ },
67
+ {
68
+ "Method": "GPT-3.5 (1-shot)",
69
+ "Submitted By": "IL-TUR",
70
+ "Github Link": "",
71
+ "L-NER": {"strict mF1": "23.68"},
72
+ "RR": {"mF1": "30.05"},
73
+ "CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
74
+ "BAIL": {"mF1": "46.35"},
75
+ "LSI": {"mF1": "22.61"},
76
+ "PCR": {"muF1@K": "-"},
77
+ "SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
78
+ "L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
79
+ },
80
+ {
81
+ "Method": "GPT-3.5 (2-shot)",
82
+ "Submitted By": "IL-TUR",
83
+ "Github Link": "",
84
+ "L-NER": {"strict mF1": "32.84"},
85
+ "RR": {"mF1": "30.31"},
86
+ "CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
87
+ "BAIL": {"mF1": "61"},
88
+ "LSI": {"mF1": "21.4"},
89
+ "PCR": {"muF1@K": "-"},
90
+ "SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
91
+ "L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
92
+ },
93
+ {
94
+ "Method": "GPT-4 (0-shot)",
95
+ "Submitted By": "IL-TUR",
96
+ "Github Link": "",
97
+ "L-NER": {"strict mF1": "13.65"},
98
+ "RR": {"mF1": "37.37"},
99
+ "CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
100
+ "BAIL": {"mF1": "51.46"},
101
+ "LSI": {"mF1": "23.99"},
102
+ "PCR": {"muF1@K": "-"},
103
+ "SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
104
+ "L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
105
+ },
106
+ {
107
+ "Method": "GPT-4 (1-shot)",
108
+ "Submitted By": "IL-TUR",
109
+ "Github Link": "",
110
+ "L-NER": {"strict mF1": "10.51"},
111
+ "RR": {"mF1": "37.43"},
112
+ "CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
113
+ "BAIL": {"mF1": "56.9"},
114
+ "LSI": {"mF1": "22.26"},
115
+ "PCR": {"muF1@K": "-"},
116
+ "SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
117
+ "L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
118
+ },
119
+ {
120
+ "Method": "GPT-4 (2-shot)",
121
+ "Submitted By": "IL-TUR",
122
+ "Github Link": "",
123
+ "L-NER": {"strict mF1": "24.03"},
124
+ "RR": {"mF1": "38.18"},
125
+ "CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
126
+ "BAIL": {"mF1": "66.67"},
127
+ "LSI": {"mF1": "20.53"},
128
+ "PCR": {"muF1@K": "-"},
129
+ "SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
130
+ "L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
131
+ }
132
+ ]
133
+
submissions/baseline/results.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "Method": "SOTA",
4
+ "Submitted By": "multiple",
5
+ "Github Link": "exploration-lab.github.io/IL-TUR/",
6
+ "L-NER": {"strict mF1": "48.58"},
7
+ "RR": {"mF1": "69.01"},
8
+ "CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
9
+ "BAIL": {"mF1": "81"},
10
+ "LSI": {"mF1": "28.08"},
11
+ "PCR": {"muF1@K": "39.15"},
12
+ "SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
13
+ "L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
14
+ },
15
+ {
16
+ "Method": "BERT",
17
+ "Submitted By": "multiple",
18
+ "Github Link": "",
19
+ "L-NER": {"strict mF1": "39.59"},
20
+ "RR": {"mF1": "58"},
21
+ "CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
22
+ "BAIL": {"mF1": "-"},
23
+ "LSI": {"mF1": "18.44"},
24
+ "PCR": {"muF1@K": "9.24"},
25
+ "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
26
+ "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
27
+ },
28
+ {
29
+ "Method": "LegalBERT",
30
+ "Submitted By": "multiple",
31
+ "Github Link": "",
32
+ "L-NER": {"strict mF1": "45.58"},
33
+ "RR": {"mF1": "54"},
34
+ "CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
35
+ "BAIL": {"mF1": "-"},
36
+ "LSI": {"mF1": "21.74"},
37
+ "PCR": {"muF1@K": "8.67"},
38
+ "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
39
+ "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
40
+ },
41
+ {
42
+ "Method": "InLegalBERT",
43
+ "Submitted By": "multiple",
44
+ "Github Link": "",
45
+ "L-NER": {"strict mF1": "48.58"},
46
+ "RR": {"mF1": "58"},
47
+ "CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
48
+ "BAIL": {"mF1": "-"},
49
+ "LSI": {"mF1": "26.23"},
50
+ "PCR": {"muF1@K": "7.57"},
51
+ "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
52
+ "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
53
+ },
54
+ {
55
+ "Method": "GPT-3.5 (0-shot)",
56
+ "Submitted By": "IL-TUR",
57
+ "Github Link": "",
58
+ "L-NER": {"strict mF1": "30.59"},
59
+ "RR": {"mF1": "30.95"},
60
+ "CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
61
+ "BAIL": {"mF1": "51.04"},
62
+ "LSI": {"mF1": "21.55"},
63
+ "PCR": {"muF1@K": "-"},
64
+ "SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
65
+ "L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
66
+ },
67
+ {
68
+ "Method": "GPT-3.5 (1-shot)",
69
+ "Submitted By": "IL-TUR",
70
+ "Github Link": "",
71
+ "L-NER": {"strict mF1": "23.68"},
72
+ "RR": {"mF1": "30.05"},
73
+ "CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
74
+ "BAIL": {"mF1": "46.35"},
75
+ "LSI": {"mF1": "22.61"},
76
+ "PCR": {"muF1@K": "-"},
77
+ "SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
78
+ "L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
79
+ },
80
+ {
81
+ "Method": "GPT-3.5 (2-shot)",
82
+ "Submitted By": "IL-TUR",
83
+ "Github Link": "",
84
+ "L-NER": {"strict mF1": "32.84"},
85
+ "RR": {"mF1": "30.31"},
86
+ "CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
87
+ "BAIL": {"mF1": "61"},
88
+ "LSI": {"mF1": "21.4"},
89
+ "PCR": {"muF1@K": "-"},
90
+ "SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
91
+ "L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
92
+ },
93
+ {
94
+ "Method": "GPT-4 (0-shot)",
95
+ "Submitted By": "IL-TUR",
96
+ "Github Link": "",
97
+ "L-NER": {"strict mF1": "13.65"},
98
+ "RR": {"mF1": "37.37"},
99
+ "CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
100
+ "BAIL": {"mF1": "51.46"},
101
+ "LSI": {"mF1": "23.99"},
102
+ "PCR": {"muF1@K": "-"},
103
+ "SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
104
+ "L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
105
+ },
106
+ {
107
+ "Method": "GPT-4 (1-shot)",
108
+ "Submitted By": "IL-TUR",
109
+ "Github Link": "",
110
+ "L-NER": {"strict mF1": "10.51"},
111
+ "RR": {"mF1": "37.43"},
112
+ "CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
113
+ "BAIL": {"mF1": "56.9"},
114
+ "LSI": {"mF1": "22.26"},
115
+ "PCR": {"muF1@K": "-"},
116
+ "SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
117
+ "L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
118
+ },
119
+ {
120
+ "Method": "GPT-4 (2-shot)",
121
+ "Submitted By": "IL-TUR",
122
+ "Github Link": "",
123
+ "L-NER": {"strict mF1": "24.03"},
124
+ "RR": {"mF1": "38.18"},
125
+ "CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
126
+ "BAIL": {"mF1": "66.67"},
127
+ "LSI": {"mF1": "20.53"},
128
+ "PCR": {"muF1@K": "-"},
129
+ "SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
130
+ "L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
131
+ }
132
+ ]
133
+
submissions/baseline/submission.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "Method": "GPT-5 (2-shot)",
4
+ "Submitted By": "IL-TUR",
5
+ "Github Link": "dummy submission",
6
+ "L-NER": {"strict mF1": "24.03"},
7
+ "RR": {"mF1": "38.18"},
8
+ "CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
9
+ "BAIL": {"mF1": "66.67"},
10
+ "LSI": {"mF1": "20.53"},
11
+ "PCR": {"muF1@K": "-"},
12
+ "SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
13
+ "L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
14
+ }
15
+ ]
16
+
uploads.py CHANGED
@@ -2,6 +2,7 @@ from email.utils import parseaddr
2
  from huggingface_hub import HfApi
3
  import os
4
  import datetime
 
5
  import pandas as pd
6
 
7
  LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
@@ -59,54 +60,72 @@ def add_new_eval(
59
  mail,
60
  )
61
 
62
- # load the file
63
- df = pd.read_csv(path_to_file)
64
- submission_df = pd.read_csv(path_to_file)
65
 
66
- # modify the df to include metadata
67
- df["Method"] = method_name
68
- df["url"] = url
69
- df["organisation"] = organisation
70
- df["mail"] = parsed_mail
71
- df["timestamp"] = datetime.datetime.now()
72
 
73
- submission_df = pd.read_csv(path_to_file)
74
- submission_df["Method"] = method_name
75
- submission_df["Submitted By"] = organisation
76
- # upload to spaces using the hf api at
77
 
78
- path_in_repo = f"submissions/{method_name}"
79
- file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
80
 
81
  # upload the df to spaces
82
  import io
83
 
84
- buffer = io.BytesIO()
85
- df.to_csv(buffer, index=False) # Write the DataFrame to a buffer in CSV format
86
- buffer.seek(0) # Rewind the buffer to the beginning
87
 
88
- api.upload_file(
89
- repo_id=RESULTS_PATH,
90
- path_in_repo=f"{path_in_repo}/{file_name}",
91
- path_or_fileobj=buffer,
92
- token=TOKEN,
93
- repo_type="dataset",
94
- )
95
- # read the leaderboard
96
- leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
97
 
98
- # append the new submission_df csv to the leaderboard
99
- # leaderboard_df = leaderboard_df._append(submission_df)
100
- leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
101
 
102
- # save the new leaderboard
103
- # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
104
  leaderboard_buffer = io.BytesIO()
105
- leaderboard_df.to_csv(leaderboard_buffer, index=False)
 
 
 
 
106
  leaderboard_buffer.seek(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  api.upload_file(
108
  repo_id=LEADERBOARD_PATH,
109
- path_in_repo=f"submissions/baseline/baseline.csv",
 
110
  path_or_fileobj=leaderboard_buffer,
111
  token=TOKEN,
112
  repo_type="space",
 
2
  from huggingface_hub import HfApi
3
  import os
4
  import datetime
5
+ import json
6
  import pandas as pd
7
 
8
  LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
 
60
  mail,
61
  )
62
 
63
+ # # load the file
64
+ # df = pd.read_csv(path_to_file)
65
+ # submission_df = pd.read_csv(path_to_file)
66
 
67
+ # # modify the df to include metadata
68
+ # df["Method"] = method_name
69
+ # df["url"] = url
70
+ # df["organisation"] = organisation
71
+ # df["mail"] = parsed_mail
72
+ # df["timestamp"] = datetime.datetime.now()
73
 
74
+ # submission_df = pd.read_csv(path_to_file)
75
+ # submission_df["Method"] = method_name
76
+ # submission_df["Submitted By"] = organisation
77
+ # # upload to spaces using the hf api at
78
 
79
+ # path_in_repo = f"submissions/{method_name}"
80
+ # file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
81
 
82
  # upload the df to spaces
83
  import io
84
 
85
+ # read the submission json file
86
+ with open(path_to_file, "r") as f:
87
+ submission = json.load(f)
88
 
89
+ with open("submissions/baseline/results.json", "r") as f:
90
+ results = json.load(f)
 
 
 
 
 
 
 
91
 
92
+ # update the results
93
+ results.append(submission[0])
 
94
 
 
 
95
  leaderboard_buffer = io.BytesIO()
96
+ # df.to_csv(buffer, index=False) # Write the DataFrame to a buffer in CSV format
97
+ # buffer.seek(0) # Rewind the buffer to the beginning
98
+
99
+ # save the results to buffer
100
+ leaderboard_buffer.write(json.dumps(results).encode())
101
  leaderboard_buffer.seek(0)
102
+
103
+ # api.upload_file(
104
+ # repo_id=RESULTS_PATH,
105
+ # path_in_repo=f"{path_in_repo}/{file_name}",
106
+ # path_or_fileobj=buffer,
107
+ # token=TOKEN,
108
+ # repo_type="dataset",
109
+ # )
110
+ # # read the leaderboard
111
+ # leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
112
+
113
+ # # append the new submission_df csv to the leaderboard
114
+ # # leaderboard_df = leaderboard_df._append(submission_df)
115
+ # # leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
116
+
117
+ # # save the new leaderboard
118
+ # # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
119
+ # leaderboard_buffer = io.BytesIO()
120
+ # leaderboard_df.to_csv(leaderboard_buffer, index=False)
121
+ # leaderboard_buffer.seek(0)
122
+ # with open("submissions/baseline/results.json", "w") as f:
123
+ # json.dump(results, f)
124
+
125
  api.upload_file(
126
  repo_id=LEADERBOARD_PATH,
127
+ # path_in_repo=f"submissions/baseline/baseline.csv",
128
+ path_in_repo=f"submissions/results.json",
129
  path_or_fileobj=leaderboard_buffer,
130
  token=TOKEN,
131
  repo_type="space",