Muennighoff committed on
Commit
817663f
1 Parent(s): 96fcd80

Add Polish Overall

Browse files
Files changed (1) hide show
  1. app.py +232 -17
app.py CHANGED
@@ -57,6 +57,16 @@ TASK_LIST_CLASSIFICATION_NB = [
57
  "ScalaNbClassification",
58
  ]
59
 
 
 
 
 
 
 
 
 
 
 
60
  TASK_LIST_CLASSIFICATION_SV = [
61
  "DalajClassification",
62
  "MassiveIntentClassification (sv)",
@@ -102,6 +112,10 @@ TASK_LIST_CLUSTERING_DE = [
102
  "TenKGnadClusteringS2S",
103
  ]
104
 
 
 
 
 
105
  TASK_LIST_CLUSTERING_ZH = [
106
  "CLSClusteringP2P",
107
  "CLSClusteringS2S",
@@ -115,6 +129,13 @@ TASK_LIST_PAIR_CLASSIFICATION = [
115
  "TwitterURLCorpus",
116
  ]
117
 
 
 
 
 
 
 
 
118
  TASK_LIST_PAIR_CLASSIFICATION_ZH = [
119
  "Cmnli",
120
  "Ocnli",
@@ -205,6 +226,12 @@ TASK_LIST_STS = [
205
  "STSBenchmark",
206
  ]
207
 
 
 
 
 
 
 
208
  TASK_LIST_STS_ZH = [
209
  "AFQMC",
210
  "ATEC",
@@ -222,6 +249,7 @@ TASK_LIST_STS_NORM = [x.replace(" (en)", "").replace(" (en-en)", "") for x in TA
222
  TASK_LIST_SUMMARIZATION = ["SummEval",]
223
 
224
  TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
 
225
  TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH
226
 
227
  TASK_TO_METRIC = {
@@ -298,6 +326,8 @@ EXTERNAL_MODELS = [
298
  "sentence-t5-xl",
299
  "sentence-t5-xxl",
300
  "sup-simcse-bert-base-uncased",
 
 
301
  "text2vec-base-chinese",
302
  "text2vec-large-chinese",
303
  "text-embedding-ada-002",
@@ -371,6 +401,8 @@ EXTERNAL_MODEL_TO_LINK = {
371
  "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
372
  "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
373
  "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
 
 
374
  "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
375
  "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
376
  "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
@@ -444,6 +476,8 @@ EXTERNAL_MODEL_TO_DIM = {
444
  "sentence-t5-xl": 768,
445
  "sentence-t5-xxl": 768,
446
  "sup-simcse-bert-base-uncased": 768,
 
 
447
  "text2vec-base-chinese": 768,
448
  "text2vec-large-chinese": 1024,
449
  "text-embedding-ada-002": 1536,
@@ -517,6 +551,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
517
  "sentence-t5-xl": 512,
518
  "sentence-t5-xxl": 512,
519
  "sup-simcse-bert-base-uncased": 512,
 
 
520
  "text2vec-base-chinese": 512,
521
  "text2vec-large-chinese": 512,
522
  "text-embedding-ada-002": 8191,
@@ -590,6 +626,8 @@ EXTERNAL_MODEL_TO_SIZE = {
590
  "sentence-t5-xl": 2.48,
591
  "sentence-t5-xxl": 9.73,
592
  "sup-simcse-bert-base-uncased": 0.44,
 
 
593
  "text2vec-base-chinese": 0.41,
594
  "text2vec-large-chinese": 1.30,
595
  "unsup-simcse-bert-base-uncased": 0.44,
@@ -621,6 +659,7 @@ MODELS_TO_SKIP = {
621
  "dmlls/all-mpnet-base-v2",
622
  "cgldo/semanticClone",
623
  "Malmuk1/e5-large-v2_Sharded",
 
624
  }
625
 
626
  EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
@@ -634,17 +673,17 @@ def add_lang(examples):
634
 
635
  def add_task(examples):
636
  # Could be added to the dataset loading script instead
637
- if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_NB + TASK_LIST_CLASSIFICATION_ZH:
638
  examples["mteb_task"] = "Classification"
639
- elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_ZH:
640
  examples["mteb_task"] = "Clustering"
641
- elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_ZH:
642
  examples["mteb_task"] = "PairClassification"
643
  elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING + TASK_LIST_RERANKING_ZH:
644
  examples["mteb_task"] = "Reranking"
645
  elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH:
646
  examples["mteb_task"] = "Retrieval"
647
- elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM + TASK_LIST_STS_ZH:
648
  examples["mteb_task"] = "STS"
649
  elif examples["mteb_dataset_name"] in TASK_LIST_SUMMARIZATION:
650
  examples["mteb_task"] = "Summarization"
@@ -915,7 +954,62 @@ def get_mteb_average_zh():
915
 
916
  return DATA_OVERALL_ZH
917
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
918
  get_mteb_average()
 
919
  get_mteb_average_zh()
920
  DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
921
  DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
@@ -924,7 +1018,6 @@ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIF
924
  DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
925
  DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
926
  DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
927
- DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
928
  DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
929
 
930
  # Exact, add all non-nan integer values for every dataset
@@ -938,19 +1031,24 @@ for d in [
938
  DATA_CLASSIFICATION_EN,
939
  DATA_CLASSIFICATION_DA,
940
  DATA_CLASSIFICATION_NB,
 
941
  DATA_CLASSIFICATION_SV,
942
  DATA_CLASSIFICATION_ZH,
943
  DATA_CLASSIFICATION_OTHER,
944
  DATA_CLUSTERING,
945
  DATA_CLUSTERING_DE,
 
946
  DATA_CLUSTERING_ZH,
947
  DATA_PAIR_CLASSIFICATION,
 
948
  DATA_PAIR_CLASSIFICATION_ZH,
949
  DATA_RERANKING,
950
  DATA_RERANKING_ZH,
951
  DATA_RETRIEVAL,
 
952
  DATA_RETRIEVAL_ZH,
953
  DATA_STS_EN,
 
954
  DATA_STS_ZH,
955
  DATA_STS_OTHER,
956
  DATA_SUMMARIZATION,
@@ -1017,6 +1115,25 @@ with block:
1017
  with gr.Row():
1018
  data_run_overall_zh = gr.Button("Refresh")
1019
  data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1020
  with gr.TabItem("Bitext Mining"):
1021
  with gr.TabItem("English-X"):
1022
  with gr.Row():
@@ -1184,7 +1301,36 @@ with block:
1184
  datasets_classification_nb,
1185
  ],
1186
  outputs=data_classification_nb,
1187
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
  with gr.TabItem("Swedish"):
1189
  with gr.Row():
1190
  gr.Markdown("""
@@ -1316,7 +1462,32 @@ with block:
1316
  get_mteb_data,
1317
  inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
1318
  outputs=data_clustering_de,
1319
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1320
  with gr.TabItem("Pair Classification"):
1321
  with gr.TabItem("English"):
1322
  with gr.Row():
@@ -1375,6 +1546,35 @@ with block:
1375
  ],
1376
  outputs=data_pair_classification_zh,
1377
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1378
  with gr.TabItem("Reranking"):
1379
  with gr.TabItem("English"):
1380
  with gr.Row():
@@ -1561,6 +1761,31 @@ with block:
1561
  inputs=[task_sts_zh, lang_sts_zh, datasets_sts_zh],
1562
  outputs=data_sts_zh,
1563
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1564
  with gr.TabItem("Other"):
1565
  with gr.Row():
1566
  gr.Markdown("""
@@ -1627,16 +1852,6 @@ with block:
1627
  # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
1628
  """
1629
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
1630
- block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
1631
- block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
1632
- block.load(get_mteb_data, inputs=[task_clustering, empty, datasets_clustering], outputs=data_clustering)
1633
- block.load(get_mteb_data, inputs=[task_clustering_de, empty_de, datasets_clustering_de], outputs=data_clustering_de)
1634
- block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
1635
- block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
1636
- block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
1637
- block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
1638
- block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
1639
- block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
1640
  """
1641
 
1642
  block.queue(concurrency_count=40, max_size=10)
 
57
  "ScalaNbClassification",
58
  ]
59
 
60
+ TASK_LIST_CLASSIFICATION_PL = [
61
+ "AbusiveClauses",
62
+ "AllegroReviews",
63
+ "CBD",
64
+ "MassiveIntentClassification (pl)",
65
+ "MassiveScenarioClassification (pl)",
66
+ "PolEmo2.0-IN",
67
+ "PolEmo2.0-OUT",
68
+ ]
69
+
70
  TASK_LIST_CLASSIFICATION_SV = [
71
  "DalajClassification",
72
  "MassiveIntentClassification (sv)",
 
112
  "TenKGnadClusteringS2S",
113
  ]
114
 
115
+ TASK_LIST_CLUSTERING_PL = [
116
+ "8TagsClustering",
117
+ ]
118
+
119
  TASK_LIST_CLUSTERING_ZH = [
120
  "CLSClusteringP2P",
121
  "CLSClusteringS2S",
 
129
  "TwitterURLCorpus",
130
  ]
131
 
132
+ TASK_LIST_PAIR_CLASSIFICATION_PL = [
133
+ "CDSC-E",
134
+ "PPC",
135
+ "PSC",
136
+ "SICK-E-PL",
137
+ ]
138
+
139
  TASK_LIST_PAIR_CLASSIFICATION_ZH = [
140
  "Cmnli",
141
  "Ocnli",
 
226
  "STSBenchmark",
227
  ]
228
 
229
+ TASK_LIST_STS_PL = [
230
+ "CDSC-R",
231
+ "SICK-R-PL",
232
+ "STS22 (pl)",
233
+ ]
234
+
235
  TASK_LIST_STS_ZH = [
236
  "AFQMC",
237
  "ATEC",
 
249
  TASK_LIST_SUMMARIZATION = ["SummEval",]
250
 
251
  TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
252
+ TASK_LIST_PL = TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL
253
  TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH
254
 
255
  TASK_TO_METRIC = {
 
326
  "sentence-t5-xl",
327
  "sentence-t5-xxl",
328
  "sup-simcse-bert-base-uncased",
329
+ "st-polish-paraphrase-from-distilroberta",
330
+ "st-polish-paraphrase-from-mpnet",
331
  "text2vec-base-chinese",
332
  "text2vec-large-chinese",
333
  "text-embedding-ada-002",
 
401
  "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
402
  "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
403
  "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
404
+ "st-polish-paraphrase-from-distilroberta": "https://huggingface.co/sdadas/st-polish-paraphrase-from-distilroberta",
405
+ "st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
406
  "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
407
  "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
408
  "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
 
476
  "sentence-t5-xl": 768,
477
  "sentence-t5-xxl": 768,
478
  "sup-simcse-bert-base-uncased": 768,
479
+ "st-polish-paraphrase-from-distilroberta": 768,
480
+ "st-polish-paraphrase-from-mpnet": 768,
481
  "text2vec-base-chinese": 768,
482
  "text2vec-large-chinese": 1024,
483
  "text-embedding-ada-002": 1536,
 
551
  "sentence-t5-xl": 512,
552
  "sentence-t5-xxl": 512,
553
  "sup-simcse-bert-base-uncased": 512,
554
+ "st-polish-paraphrase-from-distilroberta": 514,
555
+ "st-polish-paraphrase-from-mpnet": 514,
556
  "text2vec-base-chinese": 512,
557
  "text2vec-large-chinese": 512,
558
  "text-embedding-ada-002": 8191,
 
626
  "sentence-t5-xl": 2.48,
627
  "sentence-t5-xxl": 9.73,
628
  "sup-simcse-bert-base-uncased": 0.44,
629
+ "st-polish-paraphrase-from-distilroberta": 0.50,
630
+ "st-polish-paraphrase-from-mpnet": 0.50,
631
  "text2vec-base-chinese": 0.41,
632
  "text2vec-large-chinese": 1.30,
633
  "unsup-simcse-bert-base-uncased": 0.44,
 
659
  "dmlls/all-mpnet-base-v2",
660
  "cgldo/semanticClone",
661
  "Malmuk1/e5-large-v2_Sharded",
662
+ "jncraton/gte-small-ct2-int8",
663
  }
664
 
665
  EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
 
673
 
674
  def add_task(examples):
675
  # Could be added to the dataset loading script instead
676
+ if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_NB + TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_ZH:
677
  examples["mteb_task"] = "Classification"
678
+ elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_PL + TASK_LIST_CLUSTERING_ZH:
679
  examples["mteb_task"] = "Clustering"
680
+ elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_PAIR_CLASSIFICATION_ZH:
681
  examples["mteb_task"] = "PairClassification"
682
  elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING + TASK_LIST_RERANKING_ZH:
683
  examples["mteb_task"] = "Reranking"
684
  elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH:
685
  examples["mteb_task"] = "Retrieval"
686
+ elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM + TASK_LIST_STS_PL + TASK_LIST_STS_ZH:
687
  examples["mteb_task"] = "STS"
688
  elif examples["mteb_dataset_name"] in TASK_LIST_SUMMARIZATION:
689
  examples["mteb_task"] = "Summarization"
 
954
 
955
  return DATA_OVERALL_ZH
956
 
957
+ def get_mteb_average_pl():
958
+ global DATA_OVERALL_PL, DATA_CLASSIFICATION_PL, DATA_CLUSTERING_PL, DATA_PAIR_CLASSIFICATION_PL, DATA_RETRIEVAL_PL, DATA_STS_PL
959
+ DATA_OVERALL_PL = get_mteb_data(
960
+ tasks=[
961
+ "Classification",
962
+ "Clustering",
963
+ "PairClassification",
964
+ "Retrieval",
965
+ "STS",
966
+ ],
967
+ datasets=TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL,
968
+ fillna=False,
969
+ add_emb_dim=True,
970
+ rank=False,
971
+ )
972
+ # Debugging:
973
+ # DATA_OVERALL_PL.to_csv("overall.csv")
974
+
975
+ DATA_OVERALL_PL.insert(1, f"Average ({len(TASK_LIST_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PL].mean(axis=1, skipna=False))
976
+ DATA_OVERALL_PL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLASSIFICATION_PL].mean(axis=1, skipna=False))
977
+ DATA_OVERALL_PL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLUSTERING_PL].mean(axis=1, skipna=False))
978
+ DATA_OVERALL_PL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PAIR_CLASSIFICATION_PL].mean(axis=1, skipna=False))
979
+ DATA_OVERALL_PL.insert(5, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_RETRIEVAL_PL].mean(axis=1, skipna=False))
980
+ DATA_OVERALL_PL.insert(6, f"STS Average ({len(TASK_LIST_STS_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_STS_PL].mean(axis=1, skipna=False))
981
+ DATA_OVERALL_PL.sort_values(f"Average ({len(TASK_LIST_PL)} datasets)", ascending=False, inplace=True)
982
+ # Start ranking from 1
983
+ DATA_OVERALL_PL.insert(0, "Rank", list(range(1, len(DATA_OVERALL_PL) + 1)))
984
+
985
+ DATA_OVERALL_PL = DATA_OVERALL_PL.round(2)
986
+
987
+ DATA_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_CLASSIFICATION_PL])
988
+ # Only keep rows with at least one score in addition to the "Model" & rank column
989
+ DATA_CLASSIFICATION_PL = DATA_CLASSIFICATION_PL[DATA_CLASSIFICATION_PL.iloc[:, 2:].ne("").any(axis=1)]
990
+
991
+ DATA_CLUSTERING_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_CLUSTERING_PL])
992
+ DATA_CLUSTERING_PL = DATA_CLUSTERING_PL[DATA_CLUSTERING_PL.iloc[:, 2:].ne("").any(axis=1)]
993
+
994
+ DATA_PAIR_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION_PL])
995
+ DATA_PAIR_CLASSIFICATION_PL = DATA_PAIR_CLASSIFICATION_PL[DATA_PAIR_CLASSIFICATION_PL.iloc[:, 2:].ne("").any(axis=1)]
996
+
997
+ DATA_RETRIEVAL_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_RETRIEVAL_PL])
998
+ DATA_RETRIEVAL_PL = DATA_RETRIEVAL_PL[DATA_RETRIEVAL_PL.iloc[:, 2:].ne("").any(axis=1)]
999
+
1000
+ DATA_STS_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_STS_PL])
1001
+ DATA_STS_PL = DATA_STS_PL[DATA_STS_PL.iloc[:, 2:].ne("").any(axis=1)]
1002
+
1003
+ # Fill NaN after averaging
1004
+ DATA_OVERALL_PL.fillna("", inplace=True)
1005
+
1006
+ DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
1007
+ DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)]
1008
+
1009
+ return DATA_OVERALL_PL
1010
+
1011
  get_mteb_average()
1012
+ get_mteb_average_pl()
1013
  get_mteb_average_zh()
1014
  DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
1015
  DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
 
1018
  DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
1019
  DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
1020
  DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
 
1021
  DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
1022
 
1023
  # Exact, add all non-nan integer values for every dataset
 
1031
  DATA_CLASSIFICATION_EN,
1032
  DATA_CLASSIFICATION_DA,
1033
  DATA_CLASSIFICATION_NB,
1034
+ DATA_CLASSIFICATION_PL,
1035
  DATA_CLASSIFICATION_SV,
1036
  DATA_CLASSIFICATION_ZH,
1037
  DATA_CLASSIFICATION_OTHER,
1038
  DATA_CLUSTERING,
1039
  DATA_CLUSTERING_DE,
1040
+ DATA_CLUSTERING_PL,
1041
  DATA_CLUSTERING_ZH,
1042
  DATA_PAIR_CLASSIFICATION,
1043
+ DATA_PAIR_CLASSIFICATION_PL,
1044
  DATA_PAIR_CLASSIFICATION_ZH,
1045
  DATA_RERANKING,
1046
  DATA_RERANKING_ZH,
1047
  DATA_RETRIEVAL,
1048
+ DATA_RETRIEVAL_PL,
1049
  DATA_RETRIEVAL_ZH,
1050
  DATA_STS_EN,
1051
+ DATA_STS_PL,
1052
  DATA_STS_ZH,
1053
  DATA_STS_OTHER,
1054
  DATA_SUMMARIZATION,
 
1115
  with gr.Row():
1116
  data_run_overall_zh = gr.Button("Refresh")
1117
  data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
1118
+ with gr.TabItem("Polish"):
1119
+ with gr.Row():
1120
+ gr.Markdown("""
1121
+ **Overall MTEB Polish leaderboard (PL-MTEB) 🔮🇵🇱**
1122
+
1123
+ - **Metric:** Various, refer to task tabs
1124
+ - **Languages:** Polish
1125
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata), [Konrad Wojtasik](https://github.com/kwojtasi) & [BEIR-PL](https://arxiv.org/abs/2305.19840)
1126
+ """)
1127
+ with gr.Row():
1128
+ data_overall_pl = gr.components.Dataframe(
1129
+ DATA_OVERALL_PL,
1130
+ datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_PL.columns),
1131
+ type="pandas",
1132
+ wrap=True,
1133
+ )
1134
+ with gr.Row():
1135
+ data_run_overall_pl = gr.Button("Refresh")
1136
+ data_run_overall_pl.click(get_mteb_average_pl, inputs=None, outputs=data_overall_pl)
1137
  with gr.TabItem("Bitext Mining"):
1138
  with gr.TabItem("English-X"):
1139
  with gr.Row():
 
1301
  datasets_classification_nb,
1302
  ],
1303
  outputs=data_classification_nb,
1304
+ )
1305
+ with gr.TabItem("Polish"):
1306
+ with gr.Row():
1307
+ gr.Markdown("""
1308
+ **Classification Polish Leaderboard 🤍🇵🇱**
1309
+
1310
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1311
+ - **Languages:** Polish
1312
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1313
+ """)
1314
+ with gr.Row():
1315
+ data_classification_pl = gr.components.Dataframe(
1316
+ DATA_CLASSIFICATION_PL,
1317
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_PL.columns),
1318
+ type="pandas",
1319
+ )
1320
+ with gr.Row():
1321
+ data_run_classification_pl = gr.Button("Refresh")
1322
+ task_classification_pl = gr.Variable(value=["Classification"])
1323
+ lang_classification_pl = gr.Variable(value=[])
1324
+ datasets_classification_pl = gr.Variable(value=TASK_LIST_CLASSIFICATION_PL)
1325
+ data_run_classification_pl.click(
1326
+ get_mteb_data,
1327
+ inputs=[
1328
+ task_classification_pl,
1329
+ lang_classification_pl,
1330
+ datasets_classification_pl,
1331
+ ],
1332
+ outputs=data_classification_pl,
1333
+ )
1334
  with gr.TabItem("Swedish"):
1335
  with gr.Row():
1336
  gr.Markdown("""
 
1462
  get_mteb_data,
1463
  inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
1464
  outputs=data_clustering_de,
1465
+ )
1466
+ with gr.TabItem("Polish"):
1467
+ with gr.Row():
1468
+ gr.Markdown("""
1469
+ **Clustering Polish Leaderboard ✨🇵🇱**
1470
+
1471
+ - **Metric:** Validity Measure (v_measure)
1472
+ - **Languages:** Polish
1473
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1474
+ """)
1475
+ with gr.Row():
1476
+ data_clustering_pl = gr.components.Dataframe(
1477
+ DATA_CLUSTERING_PL,
1478
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_PL.columns) * 2,
1479
+ type="pandas",
1480
+ )
1481
+ with gr.Row():
1482
+ data_run_clustering_pl = gr.Button("Refresh")
1483
+ task_clustering_pl = gr.Variable(value=["Clustering"])
1484
+ lang_clustering_pl = gr.Variable(value=[])
1485
+ datasets_clustering_pl = gr.Variable(value=TASK_LIST_CLUSTERING_PL)
1486
+ data_run_clustering_pl.click(
1487
+ get_mteb_data,
1488
+ inputs=[task_clustering_pl, lang_clustering_pl, datasets_clustering_pl],
1489
+ outputs=data_clustering_pl,
1490
+ )
1491
  with gr.TabItem("Pair Classification"):
1492
  with gr.TabItem("English"):
1493
  with gr.Row():
 
1546
  ],
1547
  outputs=data_pair_classification_zh,
1548
  )
1549
+ with gr.TabItem("Polish"):
1550
+ with gr.Row():
1551
+ gr.Markdown("""
1552
+ **Pair Classification Polish Leaderboard 🎭🇵🇱**
1553
+
1554
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1555
+ - **Languages:** Polish
1556
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1557
+ """)
1558
+ with gr.Row():
1559
+ data_pair_classification_pl = gr.components.Dataframe(
1560
+ DATA_PAIR_CLASSIFICATION_PL,
1561
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_PL.columns),
1562
+ type="pandas",
1563
+ )
1564
+ with gr.Row():
1565
+ data_run = gr.Button("Refresh")
1566
+ task_pair_classification_pl = gr.Variable(value=["PairClassification"])
1567
+ lang_pair_classification_pl = gr.Variable(value=[])
1568
+ datasets_pair_classification_pl = gr.Variable(value=TASK_LIST_PAIR_CLASSIFICATION_PL)
1569
+ data_run.click(
1570
+ get_mteb_data,
1571
+ inputs=[
1572
+ task_pair_classification_pl,
1573
+ lang_pair_classification_pl,
1574
+ datasets_pair_classification_pl,
1575
+ ],
1576
+ outputs=data_pair_classification_pl,
1577
+ )
1578
  with gr.TabItem("Reranking"):
1579
  with gr.TabItem("English"):
1580
  with gr.Row():
 
1761
  inputs=[task_sts_zh, lang_sts_zh, datasets_sts_zh],
1762
  outputs=data_sts_zh,
1763
  )
1764
+ with gr.TabItem("Polish"):
1765
+ with gr.Row():
1766
+ gr.Markdown("""
1767
+ **STS Polish Leaderboard 🤖🇵🇱**
1768
+
1769
+ - **Metric:** Spearman correlation based on cosine similarity
1770
+ - **Languages:** Polish
1771
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1772
+ """)
1773
+ with gr.Row():
1774
+ data_sts_pl = gr.components.Dataframe(
1775
+ DATA_STS_PL,
1776
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_PL.columns),
1777
+ type="pandas",
1778
+ )
1779
+ with gr.Row():
1780
+ data_run_sts_pl = gr.Button("Refresh")
1781
+ task_sts_pl = gr.Variable(value=["STS"])
1782
+ lang_sts_pl = gr.Variable(value=[])
1783
+ datasets_sts_pl = gr.Variable(value=TASK_LIST_STS_PL)
1784
+ data_run_sts_pl.click(
1785
+ get_mteb_data,
1786
+ inputs=[task_sts_pl, lang_sts_pl, datasets_sts_pl],
1787
+ outputs=data_sts_pl,
1788
+ )
1789
  with gr.TabItem("Other"):
1790
  with gr.Row():
1791
  gr.Markdown("""
 
1852
  # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
1853
  """
1854
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
 
 
 
 
 
 
 
 
 
 
1855
  """
1856
 
1857
  block.queue(concurrency_count=40, max_size=10)