Spaces:
Running
Running
Upload from GitHub Actions: Evaluate on autotranslated GSM dataset
Browse files
- datasets.json +16 -1
- evals/datasets_/mgsm.py +56 -1
- evals/main.py +1 -2
- evals/models.py +11 -11
- evals/tasks.py +17 -2
- evals/translate.py +5 -0
- results.json +0 -0
datasets.json
CHANGED
|
@@ -300,7 +300,22 @@
|
|
| 300 |
"parallel": true,
|
| 301 |
"translation": "machine",
|
| 302 |
"base": "MGSM",
|
| 303 |
-
"implemented":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
"group": "Grade School Math"
|
| 305 |
},
|
| 306 |
{
|
|
|
|
| 300 |
"parallel": true,
|
| 301 |
"translation": "machine",
|
| 302 |
"base": "MGSM",
|
| 303 |
+
"implemented": false,
|
| 304 |
+
"group": "Grade School Math"
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"name": "GSM Auto-Translated",
|
| 308 |
+
"author": null,
|
| 309 |
+
"author_url": null,
|
| 310 |
+
"url": null,
|
| 311 |
+
"n_languages": 52,
|
| 312 |
+
"tasks": [
|
| 313 |
+
"math"
|
| 314 |
+
],
|
| 315 |
+
"parallel": true,
|
| 316 |
+
"translation": "machine",
|
| 317 |
+
"base": "MGSM",
|
| 318 |
+
"implemented": false,
|
| 319 |
"group": "Grade School Math"
|
| 320 |
},
|
| 321 |
{
|
evals/datasets_/mgsm.py
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from datasets_.util import _get_dataset_config_names, _load_dataset
|
| 2 |
-
from langcodes import
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
slug_mgsm = "juletxara/mgsm"
|
| 5 |
tags_mgsm = {
|
|
@@ -14,6 +21,12 @@ tags_gsm8kx = {
|
|
| 14 |
standardize_tag(a, macro=True): a
|
| 15 |
for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
|
| 16 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def parse_number(i):
|
| 19 |
if isinstance(i, int):
|
|
@@ -23,6 +36,7 @@ def parse_number(i):
|
|
| 23 |
except ValueError:
|
| 24 |
return None
|
| 25 |
|
|
|
|
| 26 |
def load_mgsm(language_bcp_47, nr):
|
| 27 |
if language_bcp_47 in tags_mgsm.keys():
|
| 28 |
ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
|
|
@@ -32,6 +46,11 @@ def load_mgsm(language_bcp_47, nr):
|
|
| 32 |
slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
|
| 33 |
)
|
| 34 |
return slug_afrimgsm, ds[nr]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
elif language_bcp_47 in tags_gsm8kx.keys():
|
| 36 |
row = _load_dataset(
|
| 37 |
slug_gsm8kx,
|
|
@@ -43,3 +62,39 @@ def load_mgsm(language_bcp_47, nr):
|
|
| 43 |
return slug_gsm8kx, row
|
| 44 |
else:
|
| 45 |
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
from datasets import Dataset, load_dataset
|
| 5 |
from datasets_.util import _get_dataset_config_names, _load_dataset
|
| 6 |
+
from langcodes import standardize_tag
|
| 7 |
+
from models import google_supported_languages, translate_google
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from tqdm.asyncio import tqdm_asyncio
|
| 10 |
|
| 11 |
slug_mgsm = "juletxara/mgsm"
|
| 12 |
tags_mgsm = {
|
|
|
|
| 21 |
standardize_tag(a, macro=True): a
|
| 22 |
for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
|
| 23 |
}
|
| 24 |
+
slug_gsm_autotranslated = "fair-forward/gsm-autotranslated"
|
| 25 |
+
tags_gsm_autotranslated = {
|
| 26 |
+
standardize_tag(a, macro=True): a
|
| 27 |
+
for a in _get_dataset_config_names(slug_gsm_autotranslated)
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
|
| 31 |
def parse_number(i):
|
| 32 |
if isinstance(i, int):
|
|
|
|
| 36 |
except ValueError:
|
| 37 |
return None
|
| 38 |
|
| 39 |
+
|
| 40 |
def load_mgsm(language_bcp_47, nr):
|
| 41 |
if language_bcp_47 in tags_mgsm.keys():
|
| 42 |
ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
|
|
|
|
| 46 |
slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
|
| 47 |
)
|
| 48 |
return slug_afrimgsm, ds[nr]
|
| 49 |
+
elif language_bcp_47 in tags_gsm_autotranslated.keys():
|
| 50 |
+
ds = _load_dataset(
|
| 51 |
+
slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
|
| 52 |
+
)
|
| 53 |
+
return slug_gsm_autotranslated, ds[nr]
|
| 54 |
elif language_bcp_47 in tags_gsm8kx.keys():
|
| 55 |
row = _load_dataset(
|
| 56 |
slug_gsm8kx,
|
|
|
|
| 62 |
return slug_gsm8kx, row
|
| 63 |
else:
|
| 64 |
return None, None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def translate_mgsm(languages):
|
| 68 |
+
human_translated = [*tags_mgsm.keys(), *tags_afrimgsm.keys()]
|
| 69 |
+
untranslated = [
|
| 70 |
+
lang
|
| 71 |
+
for lang in languages["bcp_47"].values[:100]
|
| 72 |
+
if lang not in human_translated and lang in google_supported_languages
|
| 73 |
+
]
|
| 74 |
+
en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
|
| 75 |
+
slug = "fair-forward/gsm-autotranslated"
|
| 76 |
+
for lang in tqdm(untranslated):
|
| 77 |
+
# check if already exists on hub
|
| 78 |
+
try:
|
| 79 |
+
ds_lang = load_dataset(slug, lang, split="test")
|
| 80 |
+
except ValueError:
|
| 81 |
+
print(f"Translating {lang}...")
|
| 82 |
+
questions_tr = [translate_google(q, "en", lang) for q in en["question"]]
|
| 83 |
+
questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
|
| 84 |
+
ds_lang = Dataset.from_dict(
|
| 85 |
+
{
|
| 86 |
+
"question": questions_tr,
|
| 87 |
+
"answer": en["answer"],
|
| 88 |
+
"answer_number": en["answer_number"],
|
| 89 |
+
"equation_solution": en["equation_solution"],
|
| 90 |
+
}
|
| 91 |
+
)
|
| 92 |
+
ds_lang.push_to_hub(
|
| 93 |
+
slug,
|
| 94 |
+
split="test",
|
| 95 |
+
config_name=lang,
|
| 96 |
+
token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
|
| 97 |
+
)
|
| 98 |
+
ds_lang.to_json(
|
| 99 |
+
f"data/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
|
| 100 |
+
)
|
evals/main.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
from time import time
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
from languages import languages
|
|
@@ -16,7 +15,7 @@ n_sentences = 10
|
|
| 16 |
|
| 17 |
async def evaluate():
|
| 18 |
# FIXME we should not need this for-loop, but it helps
|
| 19 |
-
for n_languages in range(
|
| 20 |
print(f"running evaluations for {n_languages} languages")
|
| 21 |
old_results = pd.read_json("results.json")
|
| 22 |
old_models = pd.read_json("models.json")
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
|
| 3 |
import pandas as pd
|
| 4 |
from languages import languages
|
|
|
|
| 15 |
|
| 16 |
async def evaluate():
|
| 17 |
# FIXME we should not need this for-loop, but it helps
|
| 18 |
+
for n_languages in range(90, 101, 3):
|
| 19 |
print(f"running evaluations for {n_languages} languages")
|
| 20 |
old_results = pd.read_json("results.json")
|
| 21 |
old_models = pd.read_json("models.json")
|
evals/models.py
CHANGED
|
@@ -34,7 +34,7 @@ important_models = [
|
|
| 34 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
| 35 |
"mistralai/mistral-saba", # 0.6$
|
| 36 |
"mistralai/mistral-nemo", # 0.08$
|
| 37 |
-
"google/gemini-2.5-flash
|
| 38 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
| 39 |
"google/gemma-3-27b-it", # 0.2$
|
| 40 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
|
@@ -50,8 +50,14 @@ important_models = [
|
|
| 50 |
|
| 51 |
blocklist = [
|
| 52 |
"microsoft/wizardlm-2-8x22b", # temporarily rate-limited
|
| 53 |
-
"google/gemini-2.5-pro",
|
| 54 |
-
"google/gemini-2.5-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
]
|
| 56 |
|
| 57 |
transcription_models = [
|
|
@@ -153,17 +159,11 @@ async def complete(**kwargs) -> str | None:
|
|
| 153 |
|
| 154 |
|
| 155 |
translate_client = translate.Client()
|
| 156 |
-
|
| 157 |
|
| 158 |
|
| 159 |
@cache
|
| 160 |
async def translate_google(text, source_language, target_language):
|
| 161 |
-
source_language = closest_supported_match(source_language, supported_languages)
|
| 162 |
-
target_language = closest_supported_match(target_language, supported_languages)
|
| 163 |
-
if source_language == target_language:
|
| 164 |
-
return text
|
| 165 |
-
if source_language is None or target_language is None:
|
| 166 |
-
return None
|
| 167 |
async with google_rate_limit:
|
| 168 |
response = translate_client.translate(
|
| 169 |
text, source_language=source_language, target_language=target_language
|
|
@@ -284,7 +284,7 @@ def load_models(date: date):
|
|
| 284 |
["translation_from", "translation_to", "classification", "mmlu", "mgsm"]
|
| 285 |
] * len(models)
|
| 286 |
models = pd.concat([models, get_translation_models()])
|
| 287 |
-
models = models[
|
| 288 |
(models["id"] != "google/gemini-2.5-pro")
|
| 289 |
& (models["id"] != "google/gemini-2.5-pro-preview")
|
| 290 |
]
|
|
|
|
| 34 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
| 35 |
"mistralai/mistral-saba", # 0.6$
|
| 36 |
"mistralai/mistral-nemo", # 0.08$
|
| 37 |
+
"google/gemini-2.5-flash", # 0.6$
|
| 38 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
| 39 |
"google/gemma-3-27b-it", # 0.2$
|
| 40 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
|
|
|
| 50 |
|
| 51 |
blocklist = [
|
| 52 |
"microsoft/wizardlm-2-8x22b", # temporarily rate-limited
|
| 53 |
+
"google/gemini-2.5-pro-preview",
|
| 54 |
+
"google/gemini-2.5-flash-preview",
|
| 55 |
+
"google/gemini-2.5-flash-lite-preview",
|
| 56 |
+
"google/gemini-2.5-flash-preview-04-17",
|
| 57 |
+
"google/gemini-2.5-flash-preview-05-20",
|
| 58 |
+
"google/gemini-2.5-flash-lite-preview-06-17",
|
| 59 |
+
"google/gemini-2.5-pro-preview-06-05",
|
| 60 |
+
"google/gemini-2.5-pro-preview-05-06",
|
| 61 |
]
|
| 62 |
|
| 63 |
transcription_models = [
|
|
|
|
| 159 |
|
| 160 |
|
| 161 |
translate_client = translate.Client()
|
| 162 |
+
google_supported_languages = [l["language"] for l in translate_client.get_languages()]
|
| 163 |
|
| 164 |
|
| 165 |
@cache
|
| 166 |
async def translate_google(text, source_language, target_language):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
async with google_rate_limit:
|
| 168 |
response = translate_client.translate(
|
| 169 |
text, source_language=source_language, target_language=target_language
|
|
|
|
| 284 |
["translation_from", "translation_to", "classification", "mmlu", "mgsm"]
|
| 285 |
] * len(models)
|
| 286 |
models = pd.concat([models, get_translation_models()])
|
| 287 |
+
models = models[ # temporary fix FIXME
|
| 288 |
(models["id"] != "google/gemini-2.5-pro")
|
| 289 |
& (models["id"] != "google/gemini-2.5-pro-preview")
|
| 290 |
]
|
evals/tasks.py
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
import random
|
| 2 |
from functools import partial
|
| 3 |
from textwrap import dedent
|
|
|
|
| 4 |
import evaluate
|
| 5 |
import pandas as pd
|
| 6 |
import sentencepiece as spm
|
| 7 |
from datasets_.flores import flores_sentences
|
| 8 |
from datasets_.mgsm import load_mgsm, parse_number
|
| 9 |
from datasets_.mmlu import load_mmlu
|
|
|
|
|
|
|
| 10 |
from languages import languages, script_name
|
| 11 |
from models import complete, transcribe, translate_google
|
| 12 |
|
|
@@ -22,6 +25,9 @@ target_languages = languages[languages["in_benchmark"]].sample(
|
|
| 22 |
frac=1, weights="speakers", replace=True, random_state=42
|
| 23 |
)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
| 27 |
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
|
@@ -40,9 +46,18 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
| 40 |
target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
|
| 41 |
script = script_name(target_language.flores_path.split("_")[1])
|
| 42 |
if model == "google/translate-v2":
|
| 43 |
-
|
| 44 |
-
|
| 45 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
else:
|
| 47 |
prediction = await complete(
|
| 48 |
model=model,
|
|
|
|
| 1 |
import random
|
| 2 |
from functools import partial
|
| 3 |
from textwrap import dedent
|
| 4 |
+
|
| 5 |
import evaluate
|
| 6 |
import pandas as pd
|
| 7 |
import sentencepiece as spm
|
| 8 |
from datasets_.flores import flores_sentences
|
| 9 |
from datasets_.mgsm import load_mgsm, parse_number
|
| 10 |
from datasets_.mmlu import load_mmlu
|
| 11 |
+
from google.cloud import translate_v2 as translate
|
| 12 |
+
from langcodes import closest_supported_match
|
| 13 |
from languages import languages, script_name
|
| 14 |
from models import complete, transcribe, translate_google
|
| 15 |
|
|
|
|
| 25 |
frac=1, weights="speakers", replace=True, random_state=42
|
| 26 |
)
|
| 27 |
|
| 28 |
+
translate_client = translate.Client()
|
| 29 |
+
supported_languages = [l["language"] for l in translate_client.get_languages()]
|
| 30 |
+
|
| 31 |
|
| 32 |
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
| 33 |
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
|
|
|
| 46 |
target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
|
| 47 |
script = script_name(target_language.flores_path.split("_")[1])
|
| 48 |
if model == "google/translate-v2":
|
| 49 |
+
original_language = closest_supported_match(
|
| 50 |
+
original_language, supported_languages
|
| 51 |
)
|
| 52 |
+
target_language = closest_supported_match(target_language, supported_languages)
|
| 53 |
+
if original_language == target_language:
|
| 54 |
+
prediction = original_sentence
|
| 55 |
+
elif original_language is None or target_language is None:
|
| 56 |
+
prediction = None
|
| 57 |
+
else:
|
| 58 |
+
prediction = await translate_google(
|
| 59 |
+
original_sentence, original_language.bcp_47, target_language.bcp_47
|
| 60 |
+
)
|
| 61 |
else:
|
| 62 |
prediction = await complete(
|
| 63 |
model=model,
|
evals/translate.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from languages import languages
|
| 2 |
+
from datasets_.mgsm import translate_mgsm
|
| 3 |
+
|
| 4 |
+
if __name__ == "__main__":
|
| 5 |
+
translate_mgsm(languages)
|
results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|