Spaces:
Running
Running
David Pomerenke
committed on
Commit
·
8beab26
1
Parent(s):
df383f6
Add CommonVoice stats
Browse files
But matching of language codes does not yet work well, e.g. zh-CN vs. cmn
README.md
CHANGED
|
@@ -8,6 +8,7 @@ license: cc-by-sa-4.0
|
|
| 8 |
short_description: Evaluating LLM performance across all human languages.
|
| 9 |
datasets:
|
| 10 |
- openlanguagedata/flores_plus
|
|
|
|
| 11 |
models:
|
| 12 |
- meta-llama/Llama-3.3-70B-Instruct
|
| 13 |
- mistralai/Mistral-Small-24B-Instruct-2501
|
|
|
|
| 8 |
short_description: Evaluating LLM performance across all human languages.
|
| 9 |
datasets:
|
| 10 |
- openlanguagedata/flores_plus
|
| 11 |
+
- mozilla-foundation/common_voice_1_0
|
| 12 |
models:
|
| 13 |
- meta-llama/Llama-3.3-70B-Instruct
|
| 14 |
- mistralai/Mistral-Small-24B-Instruct-2501
|
app.py
CHANGED
|
@@ -189,6 +189,7 @@ def create_language_stats_df(results):
|
|
| 189 |
"Best Model BLEU": round(best_score["bleu"], 3)
|
| 190 |
if best_score["bleu"] is not None
|
| 191 |
else "N/A",
|
|
|
|
| 192 |
}
|
| 193 |
flat_data.append(row)
|
| 194 |
|
|
|
|
| 189 |
"Best Model BLEU": round(best_score["bleu"], 3)
|
| 190 |
if best_score["bleu"] is not None
|
| 191 |
else "N/A",
|
| 192 |
+
"CommonVoice Hours": lang["commonvoice_hours"],
|
| 193 |
}
|
| 194 |
flat_data.append(row)
|
| 195 |
|
evals.py
CHANGED
|
@@ -12,6 +12,8 @@ from joblib.memory import Memory
|
|
| 12 |
from openai import AsyncOpenAI
|
| 13 |
from tqdm.asyncio import tqdm_asyncio
|
| 14 |
from transformers import NllbTokenizer
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# config
|
| 17 |
models = [
|
|
@@ -68,7 +70,7 @@ language_names = (
|
|
| 68 |
language_stats = (
|
| 69 |
pd.read_csv("data/languages.tsv", sep="\t")
|
| 70 |
.rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
|
| 71 |
-
["language_code", "speakers"]
|
| 72 |
]
|
| 73 |
.dropna(subset=["language_code"])
|
| 74 |
)
|
|
@@ -97,8 +99,15 @@ languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
|
|
| 97 |
languages = languages.sort_values(by="speakers", ascending=False)
|
| 98 |
languages = languages.iloc[:30]
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
# sample languages to translate to
|
| 101 |
-
|
| 102 |
n=n_sentences, weights="speakers", replace=True, random_state=42
|
| 103 |
)
|
| 104 |
# sample languages to analyze with all models
|
|
@@ -170,7 +179,7 @@ async def main():
|
|
| 170 |
print(name)
|
| 171 |
scores = []
|
| 172 |
if language.in_benchmark:
|
| 173 |
-
|
| 174 |
for model in models:
|
| 175 |
if (
|
| 176 |
model != fast_model
|
|
@@ -184,16 +193,16 @@ async def main():
|
|
| 184 |
translate(
|
| 185 |
model, language.language_name, language.script_name, sentence
|
| 186 |
)
|
| 187 |
-
for sentence, language in zip(
|
| 188 |
]
|
| 189 |
predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
|
| 190 |
-
|
| 191 |
load_sentences(lang)[i]
|
| 192 |
-
for i, lang in enumerate(
|
| 193 |
]
|
| 194 |
metrics_bleu = bleu.compute(
|
| 195 |
predictions=predictions,
|
| 196 |
-
references=
|
| 197 |
tokenizer=tokenizer.tokenize,
|
| 198 |
)
|
| 199 |
# metrics_bert = bertscore.compute(
|
|
@@ -208,6 +217,8 @@ async def main():
|
|
| 208 |
# "bert_score": mean(metrics_bert["f1"]),
|
| 209 |
}
|
| 210 |
)
|
|
|
|
|
|
|
| 211 |
results.append(
|
| 212 |
{
|
| 213 |
"language_name": name,
|
|
@@ -216,6 +227,7 @@ async def main():
|
|
| 216 |
"scores": scores,
|
| 217 |
"bleu": mean([s["bleu"] for s in scores]) if scores else None,
|
| 218 |
# "bert_score": mean([s["bert_score"] for s in scores]),
|
|
|
|
| 219 |
}
|
| 220 |
)
|
| 221 |
with open("results.json", "w") as f:
|
|
|
|
| 12 |
from openai import AsyncOpenAI
|
| 13 |
from tqdm.asyncio import tqdm_asyncio
|
| 14 |
from transformers import NllbTokenizer
|
| 15 |
+
from datetime import date
|
| 16 |
+
from requests import get
|
| 17 |
|
| 18 |
# config
|
| 19 |
models = [
|
|
|
|
| 70 |
language_stats = (
|
| 71 |
pd.read_csv("data/languages.tsv", sep="\t")
|
| 72 |
.rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
|
| 73 |
+
["language_code", "speakers", "iso639_1"]
|
| 74 |
]
|
| 75 |
.dropna(subset=["language_code"])
|
| 76 |
)
|
|
|
|
| 99 |
languages = languages.sort_values(by="speakers", ascending=False)
|
| 100 |
languages = languages.iloc[:30]
|
| 101 |
|
| 102 |
+
# retrieve CommonVoice stats
|
| 103 |
+
@cache # cache for 1 day
|
| 104 |
+
def get_commonvoice_stats(date: date):
|
| 105 |
+
return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
|
| 106 |
+
|
| 107 |
+
commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today()))
|
| 108 |
+
|
| 109 |
# sample languages to translate to
|
| 110 |
+
target_languages = languages[languages["in_benchmark"]].sample(
|
| 111 |
n=n_sentences, weights="speakers", replace=True, random_state=42
|
| 112 |
)
|
| 113 |
# sample languages to analyze with all models
|
|
|
|
| 179 |
print(name)
|
| 180 |
scores = []
|
| 181 |
if language.in_benchmark:
|
| 182 |
+
original_sentences = load_sentences(language)[:n_sentences]
|
| 183 |
for model in models:
|
| 184 |
if (
|
| 185 |
model != fast_model
|
|
|
|
| 193 |
translate(
|
| 194 |
model, language.language_name, language.script_name, sentence
|
| 195 |
)
|
| 196 |
+
for sentence, language in zip(original_sentences, target_languages.itertuples())
|
| 197 |
]
|
| 198 |
predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
|
| 199 |
+
target_sentences = [
|
| 200 |
load_sentences(lang)[i]
|
| 201 |
+
for i, lang in enumerate(target_languages.itertuples())
|
| 202 |
]
|
| 203 |
metrics_bleu = bleu.compute(
|
| 204 |
predictions=predictions,
|
| 205 |
+
references=target_sentences,
|
| 206 |
tokenizer=tokenizer.tokenize,
|
| 207 |
)
|
| 208 |
# metrics_bert = bertscore.compute(
|
|
|
|
| 217 |
# "bert_score": mean(metrics_bert["f1"]),
|
| 218 |
}
|
| 219 |
)
|
| 220 |
+
commonvoice_hours = commonvoice_stats[commonvoice_stats["locale"] == language.iso639_1]["validatedHours"].values
|
| 221 |
+
commonvoice_hours = commonvoice_hours[0] if commonvoice_hours.size > 0 else "N/A"
|
| 222 |
results.append(
|
| 223 |
{
|
| 224 |
"language_name": name,
|
|
|
|
| 227 |
"scores": scores,
|
| 228 |
"bleu": mean([s["bleu"] for s in scores]) if scores else None,
|
| 229 |
# "bert_score": mean([s["bert_score"] for s in scores]),
|
| 230 |
+
"commonvoice_hours": commonvoice_hours,
|
| 231 |
}
|
| 232 |
)
|
| 233 |
with open("results.json", "w") as f:
|
results.json
CHANGED
|
@@ -29,7 +29,8 @@
|
|
| 29 |
"bleu": 0.44668905281921456
|
| 30 |
}
|
| 31 |
],
|
| 32 |
-
"bleu": 0.47384102687918905
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"language_name": "Mandarin Chinese",
|
|
@@ -41,7 +42,8 @@
|
|
| 41 |
"bleu": 0.48254866511762295
|
| 42 |
}
|
| 43 |
],
|
| 44 |
-
"bleu": 0.48254866511762295
|
|
|
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"language_name": "Spanish",
|
|
@@ -53,7 +55,8 @@
|
|
| 53 |
"bleu": 0.31606621368361204
|
| 54 |
}
|
| 55 |
],
|
| 56 |
-
"bleu": 0.31606621368361204
|
|
|
|
| 57 |
},
|
| 58 |
{
|
| 59 |
"language_name": "Hindi",
|
|
@@ -65,7 +68,8 @@
|
|
| 65 |
"bleu": 0.3273225856613046
|
| 66 |
}
|
| 67 |
],
|
| 68 |
-
"bleu": 0.3273225856613046
|
|
|
|
| 69 |
},
|
| 70 |
{
|
| 71 |
"language_name": "Bengali",
|
|
@@ -77,7 +81,8 @@
|
|
| 77 |
"bleu": 0.23110496173302814
|
| 78 |
}
|
| 79 |
],
|
| 80 |
-
"bleu": 0.23110496173302814
|
|
|
|
| 81 |
},
|
| 82 |
{
|
| 83 |
"language_name": "Portuguese",
|
|
@@ -89,7 +94,8 @@
|
|
| 89 |
"bleu": 0.35032125995743685
|
| 90 |
}
|
| 91 |
],
|
| 92 |
-
"bleu": 0.35032125995743685
|
|
|
|
| 93 |
},
|
| 94 |
{
|
| 95 |
"language_name": "French",
|
|
@@ -101,7 +107,8 @@
|
|
| 101 |
"bleu": 0.31625053573185663
|
| 102 |
}
|
| 103 |
],
|
| 104 |
-
"bleu": 0.31625053573185663
|
|
|
|
| 105 |
},
|
| 106 |
{
|
| 107 |
"language_name": "Indonesian",
|
|
@@ -113,7 +120,8 @@
|
|
| 113 |
"bleu": 0.3112185444311794
|
| 114 |
}
|
| 115 |
],
|
| 116 |
-
"bleu": 0.3112185444311794
|
|
|
|
| 117 |
},
|
| 118 |
{
|
| 119 |
"language_name": "Russian",
|
|
@@ -145,7 +153,8 @@
|
|
| 145 |
"bleu": 0.31289371159965956
|
| 146 |
}
|
| 147 |
],
|
| 148 |
-
"bleu": 0.3346024224541269
|
|
|
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"language_name": "Japanese",
|
|
@@ -177,7 +186,8 @@
|
|
| 177 |
"bleu": 0.2585222780278109
|
| 178 |
}
|
| 179 |
],
|
| 180 |
-
"bleu": 0.2790237571605942
|
|
|
|
| 181 |
},
|
| 182 |
{
|
| 183 |
"language_name": "Eastern Punjabi",
|
|
@@ -189,7 +199,8 @@
|
|
| 189 |
"bleu": 0.27325501919134315
|
| 190 |
}
|
| 191 |
],
|
| 192 |
-
"bleu": 0.27325501919134315
|
|
|
|
| 193 |
},
|
| 194 |
{
|
| 195 |
"language_name": "Standard German",
|
|
@@ -221,7 +232,8 @@
|
|
| 221 |
"bleu": 0.36047992103881465
|
| 222 |
}
|
| 223 |
],
|
| 224 |
-
"bleu": 0.3898869846770727
|
|
|
|
| 225 |
},
|
| 226 |
{
|
| 227 |
"language_name": "Egyptian Arabic",
|
|
@@ -253,7 +265,8 @@
|
|
| 253 |
"bleu": 0.19969813973959594
|
| 254 |
}
|
| 255 |
],
|
| 256 |
-
"bleu": 0.23482952277259375
|
|
|
|
| 257 |
},
|
| 258 |
{
|
| 259 |
"language_name": "Urdu",
|
|
@@ -285,7 +298,8 @@
|
|
| 285 |
"bleu": 0.2285337340113323
|
| 286 |
}
|
| 287 |
],
|
| 288 |
-
"bleu": 0.2690020545084802
|
|
|
|
| 289 |
},
|
| 290 |
{
|
| 291 |
"language_name": "Filipino",
|
|
@@ -297,7 +311,8 @@
|
|
| 297 |
"bleu": 0.33268969497468076
|
| 298 |
}
|
| 299 |
],
|
| 300 |
-
"bleu": 0.33268969497468076
|
|
|
|
| 301 |
},
|
| 302 |
{
|
| 303 |
"language_name": "Javanese",
|
|
@@ -309,7 +324,8 @@
|
|
| 309 |
"bleu": 0.2528746866064681
|
| 310 |
}
|
| 311 |
],
|
| 312 |
-
"bleu": 0.2528746866064681
|
|
|
|
| 313 |
},
|
| 314 |
{
|
| 315 |
"language_name": "Marathi",
|
|
@@ -321,7 +337,8 @@
|
|
| 321 |
"bleu": 0.24876051941895777
|
| 322 |
}
|
| 323 |
],
|
| 324 |
-
"bleu": 0.24876051941895777
|
|
|
|
| 325 |
},
|
| 326 |
{
|
| 327 |
"language_name": "Swahili",
|
|
@@ -353,7 +370,8 @@
|
|
| 353 |
"bleu": 0.21803176063271826
|
| 354 |
}
|
| 355 |
],
|
| 356 |
-
"bleu": 0.3070798470243923
|
|
|
|
| 357 |
},
|
| 358 |
{
|
| 359 |
"language_name": "Turkish",
|
|
@@ -365,7 +383,8 @@
|
|
| 365 |
"bleu": 0.29874140544434125
|
| 366 |
}
|
| 367 |
],
|
| 368 |
-
"bleu": 0.29874140544434125
|
|
|
|
| 369 |
},
|
| 370 |
{
|
| 371 |
"language_name": "Telugu",
|
|
@@ -377,14 +396,16 @@
|
|
| 377 |
"bleu": 0.28869836899054496
|
| 378 |
}
|
| 379 |
],
|
| 380 |
-
"bleu": 0.28869836899054496
|
|
|
|
| 381 |
},
|
| 382 |
{
|
| 383 |
"language_name": "Wu Chinese",
|
| 384 |
"language_code": "wuu",
|
| 385 |
"speakers": 81400000.0,
|
| 386 |
"scores": [],
|
| 387 |
-
"bleu": null
|
|
|
|
| 388 |
},
|
| 389 |
{
|
| 390 |
"language_name": "Korean",
|
|
@@ -396,7 +417,8 @@
|
|
| 396 |
"bleu": 0.2566453806044083
|
| 397 |
}
|
| 398 |
],
|
| 399 |
-
"bleu": 0.2566453806044083
|
|
|
|
| 400 |
},
|
| 401 |
{
|
| 402 |
"language_name": "Vietnamese",
|
|
@@ -428,7 +450,8 @@
|
|
| 428 |
"bleu": 0.18355331419148843
|
| 429 |
}
|
| 430 |
],
|
| 431 |
-
"bleu": 0.3011065238905742
|
|
|
|
| 432 |
},
|
| 433 |
{
|
| 434 |
"language_name": "Tamil",
|
|
@@ -460,7 +483,8 @@
|
|
| 460 |
"bleu": 0.12646276530642359
|
| 461 |
}
|
| 462 |
],
|
| 463 |
-
"bleu": 0.23483954884287706
|
|
|
|
| 464 |
},
|
| 465 |
{
|
| 466 |
"language_name": "Yue Chinese",
|
|
@@ -472,7 +496,8 @@
|
|
| 472 |
"bleu": 0.2663995648378034
|
| 473 |
}
|
| 474 |
],
|
| 475 |
-
"bleu": 0.2663995648378034
|
|
|
|
| 476 |
},
|
| 477 |
{
|
| 478 |
"language_name": "Italian",
|
|
@@ -484,7 +509,8 @@
|
|
| 484 |
"bleu": 0.3190660116366235
|
| 485 |
}
|
| 486 |
],
|
| 487 |
-
"bleu": 0.3190660116366235
|
|
|
|
| 488 |
},
|
| 489 |
{
|
| 490 |
"language_name": "Gujarati",
|
|
@@ -516,7 +542,8 @@
|
|
| 516 |
"bleu": 0.19669824113063106
|
| 517 |
}
|
| 518 |
],
|
| 519 |
-
"bleu": 0.2589873172783296
|
|
|
|
| 520 |
},
|
| 521 |
{
|
| 522 |
"language_name": "Iranian Persian",
|
|
@@ -528,7 +555,8 @@
|
|
| 528 |
"bleu": 0.28359916806993934
|
| 529 |
}
|
| 530 |
],
|
| 531 |
-
"bleu": 0.28359916806993934
|
|
|
|
| 532 |
},
|
| 533 |
{
|
| 534 |
"language_name": "Bhojpuri",
|
|
@@ -540,13 +568,15 @@
|
|
| 540 |
"bleu": 0.24311504988281543
|
| 541 |
}
|
| 542 |
],
|
| 543 |
-
"bleu": 0.24311504988281543
|
|
|
|
| 544 |
},
|
| 545 |
{
|
| 546 |
"language_name": "Hakka Chinese",
|
| 547 |
"language_code": "hak",
|
| 548 |
"speakers": 48200000.0,
|
| 549 |
"scores": [],
|
| 550 |
-
"bleu": null
|
|
|
|
| 551 |
}
|
| 552 |
]
|
|
|
|
| 29 |
"bleu": 0.44668905281921456
|
| 30 |
}
|
| 31 |
],
|
| 32 |
+
"bleu": 0.47384102687918905,
|
| 33 |
+
"commonvoice_hours": 2649.0
|
| 34 |
},
|
| 35 |
{
|
| 36 |
"language_name": "Mandarin Chinese",
|
|
|
|
| 42 |
"bleu": 0.48254866511762295
|
| 43 |
}
|
| 44 |
],
|
| 45 |
+
"bleu": 0.48254866511762295,
|
| 46 |
+
"commonvoice_hours": "N/A"
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"language_name": "Spanish",
|
|
|
|
| 55 |
"bleu": 0.31606621368361204
|
| 56 |
}
|
| 57 |
],
|
| 58 |
+
"bleu": 0.31606621368361204,
|
| 59 |
+
"commonvoice_hours": 446.0
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"language_name": "Hindi",
|
|
|
|
| 68 |
"bleu": 0.3273225856613046
|
| 69 |
}
|
| 70 |
],
|
| 71 |
+
"bleu": 0.3273225856613046,
|
| 72 |
+
"commonvoice_hours": 16.0
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"language_name": "Bengali",
|
|
|
|
| 81 |
"bleu": 0.23110496173302814
|
| 82 |
}
|
| 83 |
],
|
| 84 |
+
"bleu": 0.23110496173302814,
|
| 85 |
+
"commonvoice_hours": 49.0
|
| 86 |
},
|
| 87 |
{
|
| 88 |
"language_name": "Portuguese",
|
|
|
|
| 94 |
"bleu": 0.35032125995743685
|
| 95 |
}
|
| 96 |
],
|
| 97 |
+
"bleu": 0.35032125995743685,
|
| 98 |
+
"commonvoice_hours": 176.0
|
| 99 |
},
|
| 100 |
{
|
| 101 |
"language_name": "French",
|
|
|
|
| 107 |
"bleu": 0.31625053573185663
|
| 108 |
}
|
| 109 |
],
|
| 110 |
+
"bleu": 0.31625053573185663,
|
| 111 |
+
"commonvoice_hours": 1051.0
|
| 112 |
},
|
| 113 |
{
|
| 114 |
"language_name": "Indonesian",
|
|
|
|
| 120 |
"bleu": 0.3112185444311794
|
| 121 |
}
|
| 122 |
],
|
| 123 |
+
"bleu": 0.3112185444311794,
|
| 124 |
+
"commonvoice_hours": 33.0
|
| 125 |
},
|
| 126 |
{
|
| 127 |
"language_name": "Russian",
|
|
|
|
| 153 |
"bleu": 0.31289371159965956
|
| 154 |
}
|
| 155 |
],
|
| 156 |
+
"bleu": 0.3346024224541269,
|
| 157 |
+
"commonvoice_hours": 241.0
|
| 158 |
},
|
| 159 |
{
|
| 160 |
"language_name": "Japanese",
|
|
|
|
| 186 |
"bleu": 0.2585222780278109
|
| 187 |
}
|
| 188 |
],
|
| 189 |
+
"bleu": 0.2790237571605942,
|
| 190 |
+
"commonvoice_hours": 222.0
|
| 191 |
},
|
| 192 |
{
|
| 193 |
"language_name": "Eastern Punjabi",
|
|
|
|
| 199 |
"bleu": 0.27325501919134315
|
| 200 |
}
|
| 201 |
],
|
| 202 |
+
"bleu": 0.27325501919134315,
|
| 203 |
+
"commonvoice_hours": "N/A"
|
| 204 |
},
|
| 205 |
{
|
| 206 |
"language_name": "Standard German",
|
|
|
|
| 232 |
"bleu": 0.36047992103881465
|
| 233 |
}
|
| 234 |
],
|
| 235 |
+
"bleu": 0.3898869846770727,
|
| 236 |
+
"commonvoice_hours": 1357.0
|
| 237 |
},
|
| 238 |
{
|
| 239 |
"language_name": "Egyptian Arabic",
|
|
|
|
| 265 |
"bleu": 0.19969813973959594
|
| 266 |
}
|
| 267 |
],
|
| 268 |
+
"bleu": 0.23482952277259375,
|
| 269 |
+
"commonvoice_hours": "N/A"
|
| 270 |
},
|
| 271 |
{
|
| 272 |
"language_name": "Urdu",
|
|
|
|
| 298 |
"bleu": 0.2285337340113323
|
| 299 |
}
|
| 300 |
],
|
| 301 |
+
"bleu": 0.2690020545084802,
|
| 302 |
+
"commonvoice_hours": 76.0
|
| 303 |
},
|
| 304 |
{
|
| 305 |
"language_name": "Filipino",
|
|
|
|
| 311 |
"bleu": 0.33268969497468076
|
| 312 |
}
|
| 313 |
],
|
| 314 |
+
"bleu": 0.33268969497468076,
|
| 315 |
+
"commonvoice_hours": "N/A"
|
| 316 |
},
|
| 317 |
{
|
| 318 |
"language_name": "Javanese",
|
|
|
|
| 324 |
"bleu": 0.2528746866064681
|
| 325 |
}
|
| 326 |
],
|
| 327 |
+
"bleu": 0.2528746866064681,
|
| 328 |
+
"commonvoice_hours": 0.0
|
| 329 |
},
|
| 330 |
{
|
| 331 |
"language_name": "Marathi",
|
|
|
|
| 337 |
"bleu": 0.24876051941895777
|
| 338 |
}
|
| 339 |
],
|
| 340 |
+
"bleu": 0.24876051941895777,
|
| 341 |
+
"commonvoice_hours": 20.0
|
| 342 |
},
|
| 343 |
{
|
| 344 |
"language_name": "Swahili",
|
|
|
|
| 370 |
"bleu": 0.21803176063271826
|
| 371 |
}
|
| 372 |
],
|
| 373 |
+
"bleu": 0.3070798470243923,
|
| 374 |
+
"commonvoice_hours": "N/A"
|
| 375 |
},
|
| 376 |
{
|
| 377 |
"language_name": "Turkish",
|
|
|
|
| 383 |
"bleu": 0.29874140544434125
|
| 384 |
}
|
| 385 |
],
|
| 386 |
+
"bleu": 0.29874140544434125,
|
| 387 |
+
"commonvoice_hours": 127.0
|
| 388 |
},
|
| 389 |
{
|
| 390 |
"language_name": "Telugu",
|
|
|
|
| 396 |
"bleu": 0.28869836899054496
|
| 397 |
}
|
| 398 |
],
|
| 399 |
+
"bleu": 0.28869836899054496,
|
| 400 |
+
"commonvoice_hours": 0.3
|
| 401 |
},
|
| 402 |
{
|
| 403 |
"language_name": "Wu Chinese",
|
| 404 |
"language_code": "wuu",
|
| 405 |
"speakers": 81400000.0,
|
| 406 |
"scores": [],
|
| 407 |
+
"bleu": null,
|
| 408 |
+
"commonvoice_hours": "N/A"
|
| 409 |
},
|
| 410 |
{
|
| 411 |
"language_name": "Korean",
|
|
|
|
| 417 |
"bleu": 0.2566453806044083
|
| 418 |
}
|
| 419 |
],
|
| 420 |
+
"bleu": 0.2566453806044083,
|
| 421 |
+
"commonvoice_hours": 1.7
|
| 422 |
},
|
| 423 |
{
|
| 424 |
"language_name": "Vietnamese",
|
|
|
|
| 450 |
"bleu": 0.18355331419148843
|
| 451 |
}
|
| 452 |
],
|
| 453 |
+
"bleu": 0.3011065238905742,
|
| 454 |
+
"commonvoice_hours": 5.9
|
| 455 |
},
|
| 456 |
{
|
| 457 |
"language_name": "Tamil",
|
|
|
|
| 483 |
"bleu": 0.12646276530642359
|
| 484 |
}
|
| 485 |
],
|
| 486 |
+
"bleu": 0.23483954884287706,
|
| 487 |
+
"commonvoice_hours": 234.0
|
| 488 |
},
|
| 489 |
{
|
| 490 |
"language_name": "Yue Chinese",
|
|
|
|
| 496 |
"bleu": 0.2663995648378034
|
| 497 |
}
|
| 498 |
],
|
| 499 |
+
"bleu": 0.2663995648378034,
|
| 500 |
+
"commonvoice_hours": "N/A"
|
| 501 |
},
|
| 502 |
{
|
| 503 |
"language_name": "Italian",
|
|
|
|
| 509 |
"bleu": 0.3190660116366235
|
| 510 |
}
|
| 511 |
],
|
| 512 |
+
"bleu": 0.3190660116366235,
|
| 513 |
+
"commonvoice_hours": 362.0
|
| 514 |
},
|
| 515 |
{
|
| 516 |
"language_name": "Gujarati",
|
|
|
|
| 542 |
"bleu": 0.19669824113063106
|
| 543 |
}
|
| 544 |
],
|
| 545 |
+
"bleu": 0.2589873172783296,
|
| 546 |
+
"commonvoice_hours": "N/A"
|
| 547 |
},
|
| 548 |
{
|
| 549 |
"language_name": "Iranian Persian",
|
|
|
|
| 555 |
"bleu": 0.28359916806993934
|
| 556 |
}
|
| 557 |
],
|
| 558 |
+
"bleu": 0.28359916806993934,
|
| 559 |
+
"commonvoice_hours": "N/A"
|
| 560 |
},
|
| 561 |
{
|
| 562 |
"language_name": "Bhojpuri",
|
|
|
|
| 568 |
"bleu": 0.24311504988281543
|
| 569 |
}
|
| 570 |
],
|
| 571 |
+
"bleu": 0.24311504988281543,
|
| 572 |
+
"commonvoice_hours": "N/A"
|
| 573 |
},
|
| 574 |
{
|
| 575 |
"language_name": "Hakka Chinese",
|
| 576 |
"language_code": "hak",
|
| 577 |
"speakers": 48200000.0,
|
| 578 |
"scores": [],
|
| 579 |
+
"bleu": null,
|
| 580 |
+
"commonvoice_hours": "N/A"
|
| 581 |
}
|
| 582 |
]
|