KennethEnevoldsen committed
Merge pull request #33 from Samoed/fix_leaderboard2
Files changed:
- EXTERNAL_MODEL_RESULTS.json (+0 -0)
- config.yaml (+42 -42)
- refresh.py (+11 -7)
EXTERNAL_MODEL_RESULTS.json
CHANGED
The diff for this file is too large to render; see the raw diff.
config.yaml
CHANGED
@@ -23,7 +23,7 @@ tasks:
     metric: max_ap
     metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
-  Reranking:
+  Reranking:
     icon: "🥈"
     metric: map
     metric_description: "Mean Average Precision (MAP)"
@@ -345,35 +345,35 @@ boards:
     credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
     tasks:
       Classification:
-        - GeoreviewClassification
-        - HeadlineClassification
-        - InappropriatenessClassification
-        - KinopoiskClassification
-        - RuReviewsClassification
-        - RuSciBenchGRNTIClassification
-        - RuSciBenchOECDClassification
-        - MassiveIntentClassification (
-        - MassiveScenarioClassification (
+        - GeoreviewClassification
+        - HeadlineClassification
+        - InappropriatenessClassification
+        - KinopoiskClassification
+        - RuReviewsClassification
+        - RuSciBenchGRNTIClassification
+        - RuSciBenchOECDClassification
+        - MassiveIntentClassification (ru)
+        - MassiveScenarioClassification (ru)
       Clustering:
-        - GeoreviewClusteringP2P
-        - RuSciBenchGRNTIClusteringP2P
-        - RuSciBenchOECDClusteringP2P
+        - GeoreviewClusteringP2P
+        - RuSciBenchGRNTIClusteringP2P
+        - RuSciBenchOECDClusteringP2P
       PairClassification:
-        - TERRa
+        - TERRa
       Reranking:
-        - RuBQReranking
-        - MIRACLReranking (
+        - RuBQReranking
+        - MIRACLReranking (ru)
       Retrieval:
-        - RiaNewsRetrieval
-        - RuBQRetrieval
-        - MIRACLRetrieval (
+        - RiaNewsRetrieval
+        - RuBQRetrieval
+        - MIRACLRetrieval (ru)
       STS:
-        - RUParaPhraserSTS
-        - RuSTSBenchmarkSTS
-        - STS22 (
+        - RUParaPhraserSTS
+        - RuSTSBenchmarkSTS
+        - STS22 (ru)
       MultilabelClassification:
-        - CEDRClassification
-        - SensitiveTopicsClassification
+        - CEDRClassification
+        - SensitiveTopicsClassification
   se:
     title: Swedish
     language_long: Swedish
@@ -530,23 +530,23 @@ boards:
     metric: nDCG@10
     tasks:
       Retrieval:
-        - AppsRetrieval
-        - CodeFeedbackMT
-        - CodeFeedbackST
-        - CodeSearchNetCCRetrieval (python
-        - CodeSearchNetCCRetrieval (javascript
-        - CodeSearchNetCCRetrieval (go
-        - CodeSearchNetCCRetrieval (ruby
-        - CodeSearchNetCCRetrieval (java
-        - CodeSearchNetCCRetrieval (php
-        - CodeSearchNetRetrieval (python
-        - CodeSearchNetRetrieval (javascript
-        - CodeSearchNetRetrieval (go
-        - CodeSearchNetRetrieval (ruby
-        - CodeSearchNetRetrieval (java
-        - CodeSearchNetRetrieval (php
-        - CodeTransOceanContest
+        - AppsRetrieval
+        - CodeFeedbackMT
+        - CodeFeedbackST
+        - CodeSearchNetCCRetrieval (python)
+        - CodeSearchNetCCRetrieval (javascript)
+        - CodeSearchNetCCRetrieval (go)
+        - CodeSearchNetCCRetrieval (ruby)
+        - CodeSearchNetCCRetrieval (java)
+        - CodeSearchNetCCRetrieval (php)
+        - CodeSearchNetRetrieval (python)
+        - CodeSearchNetRetrieval (javascript)
+        - CodeSearchNetRetrieval (go)
+        - CodeSearchNetRetrieval (ruby)
+        - CodeSearchNetRetrieval (java)
+        - CodeSearchNetRetrieval (php)
+        - CodeTransOceanContest
         - CodeTransOceanDL
-        - CosQA
+        - CosQA
         - StackOverflowQA
-        - SyntheticText2SQL
+        - SyntheticText2SQL
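Substantively, the config.yaml change is a renaming pass: each touched task entry now carries its Hugging Face subset in parentheses (ru for the Russian board; python, javascript, go, ruby, java, php for the code board), matching the "<dataset> (<hf_subset>)" labels that the reworked refresh.py below attaches to result columns.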
refresh.py
CHANGED
@@ -131,12 +131,12 @@ def make_clickable_model(model_name: str, link: None | str = None) -> str:
     return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'


-def 
-    if not (examples["
+def add_subset(examples):
+    if not (examples["hf_subset"]) or (examples["hf_subset"] == "default"):
         examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
     else:
         examples["mteb_dataset_name_with_lang"] = (
-            examples["mteb_dataset_name"] + f' ({examples["
+            examples["mteb_dataset_name"] + f' ({examples["hf_subset"]})'
         )
     return examples

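A minimal sketch (not part of the commit) of what the new add_subset helper does; the sample rows are invented for illustration:

    row = {"mteb_dataset_name": "MIRACLReranking", "hf_subset": "ru"}
    add_subset(row)["mteb_dataset_name_with_lang"]   # 'MIRACLReranking (ru)'

    row = {"mteb_dataset_name": "TERRa", "hf_subset": "default"}
    add_subset(row)["mteb_dataset_name_with_lang"]   # 'TERRa'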
@@ -265,7 +265,7 @@ def get_external_model_results():
                 print(f"Can't fined model {model} in results repository. Exception: {e}")
                 continue

-            ds = ds.map(
+            ds = ds.map(add_subset)
             ds = ds.map(add_task)
             base_dict = {
                 "Model": make_clickable_model(
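For context, both map calls run example-by-example over a datasets.Dataset; a hedged sketch of the first one (sample rows invented, add_subset as defined above):

    from datasets import Dataset

    ds = Dataset.from_list([
        {"mteb_dataset_name": "STS22", "hf_subset": "ru"},
        {"mteb_dataset_name": "RuBQRetrieval", "hf_subset": "default"},
    ])
    ds = ds.map(add_subset)
    ds["mteb_dataset_name_with_lang"]   # ['STS22 (ru)', 'RuBQRetrieval']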
@@ -313,7 +313,7 @@ def get_external_model_results():

     # Save & cache EXTERNAL_MODEL_RESULTS
     with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
-        json.dump(EXTERNAL_MODEL_RESULTS, f, indent=4)
+        json.dump(dict(sorted(EXTERNAL_MODEL_RESULTS.items())), f, indent=4)

     return EXTERNAL_MODEL_RESULTS

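json.dump writes keys in dict insertion order, so sorting the items first makes the cached file deterministic between refreshes; a minimal illustration (sample data invented):

    import json

    results = {"model-b": {"Task": 1.0}, "model-a": {"Task": 2.0}}
    json.dumps(dict(sorted(results.items())))
    # '{"model-a": {"Task": 2.0}, "model-b": {"Task": 1.0}}'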
@@ -332,6 +332,10 @@ def download_or_use_cache(modelId: str):
     return meta


+def simplify_dataset_name(name):
+    return name.replace("MTEB ", "").replace(" (default)", "")
+
+
 def get_mteb_data(
     tasks: list = ["Clustering"],
     langs: list = [],
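The new helper strips the "MTEB " prefix and a trailing " (default)" subset marker; its behavior on two illustrative names (not pulled from the results repository):

    simplify_dataset_name("MTEB AppsRetrieval (default)")   # 'AppsRetrieval'
    simplify_dataset_name("MTEB STS22 (ru)")                # 'STS22 (ru)'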
@@ -450,11 +454,11 @@ def get_mteb_data(
         try:
             out = [
                 {
-                    res["dataset"]["name"]
+                    simplify_dataset_name(res["dataset"]["name"]): [
                        round(score["value"], 2)
                        for score in res["metrics"]
                        if filter_metric_fetched(
-                            res["dataset"]["name"]
+                            simplify_dataset_name(res["dataset"]["name"]),
                            score["type"],
                            task_to_metric.get(res["task"]["type"]),
                            res["dataset"]["split"],
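Net effect of the last two hunks: every dataset key flowing into the leaderboard table passes through simplify_dataset_name, so a metadata entry such as "MTEB STS22 (ru)" is keyed as "STS22 (ru)", which is the form the updated config.yaml task lists now use.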