Spaces:
Running
Running
Upload from GitHub Actions: flores filter for available dev split
Browse files- README.md +4 -4
- evals/backend.py +2 -2
- evals/datasets_/flores.py +9 -0
- evals/datasets_/util.py +21 -0
- evals/main.py +8 -5
- frontend/public/index.html +2 -2
- frontend/public/manifest.json +2 -2
- frontend/src/App.js +5 -7
- frontend/src/components/CostPlot.js +1 -1
- frontend/src/components/HistoryPlot.js +1 -1
- frontend/src/components/LanguagePlot.js +2 -2
- frontend/src/components/LanguageTierHistoryPlot.js +1 -1
- frontend/src/components/LicenseHistoryPlot.js +1 -1
- frontend/src/components/ScoreColumns.js +2 -2
- frontend/src/components/WorldMap.js +1 -1
- notes/system-architecture-diagram.md +2 -2
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: 🌍
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: pink
|
| 6 |
sdk: docker
|
| 7 |
app_port: 8000
|
| 8 |
license: cc-by-sa-4.0
|
| 9 |
-
short_description:
|
| 10 |
datasets:
|
| 11 |
- openlanguagedata/flores_plus
|
| 12 |
- google/fleurs
|
|
@@ -39,9 +39,9 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
|
|
| 39 |
|
| 40 |
[](https://huggingface.co/spaces/datenlabor-bmz/ai-language-monitor)
|
| 41 |
|
| 42 |
-
#
|
| 43 |
|
| 44 |
-
|
| 45 |
|
| 46 |
## Evaluate
|
| 47 |
|
|
|
|
| 1 |
---
|
| 2 |
+
title: languagebench
|
| 3 |
emoji: 🌍
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: pink
|
| 6 |
sdk: docker
|
| 7 |
app_port: 8000
|
| 8 |
license: cc-by-sa-4.0
|
| 9 |
+
short_description: AI model evaluations for every language in the world.
|
| 10 |
datasets:
|
| 11 |
- openlanguagedata/flores_plus
|
| 12 |
- google/fleurs
|
|
|
|
| 39 |
|
| 40 |
[](https://huggingface.co/spaces/datenlabor-bmz/ai-language-monitor)
|
| 41 |
|
| 42 |
+
# languagebench 🌍
|
| 43 |
|
| 44 |
+
_AI model evaluations for every language in the world_
|
| 45 |
|
| 46 |
## Evaluate
|
| 47 |
|
evals/backend.py
CHANGED
|
@@ -267,7 +267,7 @@ def make_language_tier_history(scores_df, languages, models):
|
|
| 267 |
)
|
| 268 |
tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 500)}
|
| 269 |
|
| 270 |
-
# Calculate model-language
|
| 271 |
scores_df = scores_df.copy()
|
| 272 |
scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
|
| 273 |
pivot = scores_df.pivot_table(
|
|
@@ -315,7 +315,7 @@ def make_license_history(scores_df, models):
|
|
| 315 |
scores_df = scores_df.copy()
|
| 316 |
scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
|
| 317 |
|
| 318 |
-
# Pivot and compute
|
| 319 |
pivot = scores_df.pivot_table(
|
| 320 |
index="model", columns="task_metric", values="score", aggfunc="mean"
|
| 321 |
)
|
|
|
|
| 267 |
)
|
| 268 |
tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 500)}
|
| 269 |
|
| 270 |
+
# Calculate model-language overall scores
|
| 271 |
scores_df = scores_df.copy()
|
| 272 |
scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
|
| 273 |
pivot = scores_df.pivot_table(
|
|
|
|
| 315 |
scores_df = scores_df.copy()
|
| 316 |
scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
|
| 317 |
|
| 318 |
+
# Pivot and compute overall score
|
| 319 |
pivot = scores_df.pivot_table(
|
| 320 |
index="model", columns="task_metric", values="score", aggfunc="mean"
|
| 321 |
)
|
evals/datasets_/flores.py
CHANGED
|
@@ -27,7 +27,16 @@ def aggregate_flores_paths(flores_paths):
|
|
| 27 |
return flores_paths.values[populations.index(max(populations))]
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
flores = pd.DataFrame(splits, columns=["flores_path"])
|
|
|
|
|
|
|
| 31 |
flores["bcp_47"] = flores["flores_path"].apply(
|
| 32 |
lambda x: standardize_bcp47(x, macro=True),
|
| 33 |
)
|
|
|
|
| 27 |
return flores_paths.values[populations.index(max(populations))]
|
| 28 |
|
| 29 |
|
| 30 |
+
def has_dev_split(flores_path):
    """Report whether the ``dev`` split of a FLORES subset can be loaded.

    Attempts to load the ``dev`` split of subset ``flores_path`` from the
    dataset identified by the module-level ``slug``.

    Returns:
        ``True`` when the load call completes, ``False`` when it raises
        ``ValueError`` or ``FileNotFoundError``. Any other exception
        propagates to the caller.
    """
    try:
        # Only the success/failure of the load matters; the data is discarded.
        _load_dataset(slug, subset=flores_path, split="dev")
    except (ValueError, FileNotFoundError):
        # NOTE(review): presumably what the loader raises for a missing
        # split/subset -- confirm against the loader's behaviour.
        return False
    return True
|
| 36 |
+
|
| 37 |
# One row per FLORES subset path.
flores = pd.DataFrame(splits, columns=["flores_path"])
# Filter to only languages with 'dev' split.
# The explicit .copy() detaches the filtered rows from the original frame so
# that adding the "bcp_47" column below does not emit pandas'
# SettingWithCopyWarning (assignment on a filtered selection).
flores = flores[flores["flores_path"].apply(has_dev_split)].copy()
# Derive the standardized BCP-47 code (macro=True) for every remaining subset.
flores["bcp_47"] = flores["flores_path"].apply(
    lambda x: standardize_bcp47(x, macro=True),
)
|
evals/datasets_/util.py
CHANGED
|
@@ -63,3 +63,24 @@ def save(df: pd.DataFrame, fname: str):
|
|
| 63 |
ds.push_to_hub(f"fair-forward/evals-for-every-language-{fname}", token=TOKEN)
|
| 64 |
Path("results").mkdir(exist_ok=True)
|
| 65 |
df.to_json(f"results/{fname}.json", orient="records", force_ascii=False, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
ds.push_to_hub(f"fair-forward/evals-for-every-language-{fname}", token=TOKEN)
|
| 64 |
Path("results").mkdir(exist_ok=True)
|
| 65 |
df.to_json(f"results/{fname}.json", orient="records", force_ascii=False, indent=2)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_valid_task_languages(task_name: str) -> set:
    """Return the set of bcp_47 codes that have data available for the given task.

    Args:
        task_name: Name of the evaluation task. Recognised values are
            ``"translation_from"``, ``"translation_to"``, ``"classification"``,
            ``"mmlu"``, ``"arc"``, ``"truthfulqa"`` and ``"mgsm"``.

    Returns:
        For the translation and classification tasks, the distinct values of
        the ``bcp_47`` column of the ``flores`` frame. For the other
        recognised tasks, the union of the keys of that task's tag mappings.
        For any unrecognised task name, an empty set -- so a caller that
        filters languages by membership in the result will keep no language
        for such a task.
    """
    # Imports are kept inside the function, as in the original.
    # NOTE(review): presumably to avoid import cycles or to defer dataset
    # loading until first use -- confirm before hoisting to module level.
    from datasets_.flores import flores
    from datasets_.mmlu import tags_afrimmlu, tags_global_mmlu, tags_mmlu_autotranslated
    from datasets_.arc import tags_uhura_arc_easy, tags_uhura_arc_easy_translated
    from datasets_.truthfulqa import tags_uhura_truthfulqa
    from datasets_.mgsm import tags_mgsm, tags_afrimgsm, tags_gsm8kx, tags_gsm_autotranslated

    if task_name in ("translation_from", "translation_to", "classification"):
        return set(flores["bcp_47"])
    if task_name == "mmlu":
        return {
            *tags_afrimmlu.keys(),
            *tags_global_mmlu.keys(),
            *tags_mmlu_autotranslated.keys(),
        }
    if task_name == "arc":
        return {
            *tags_uhura_arc_easy.keys(),
            *tags_uhura_arc_easy_translated.keys(),
        }
    if task_name == "truthfulqa":
        return set(tags_uhura_truthfulqa.keys())
    if task_name == "mgsm":
        return {
            *tags_mgsm.keys(),
            *tags_afrimgsm.keys(),
            *tags_gsm8kx.keys(),
            *tags_gsm_autotranslated.keys(),
        }
    # Unknown task: no language is considered valid.
    return set()
|
evals/main.py
CHANGED
|
@@ -9,12 +9,12 @@ from models import models
|
|
| 9 |
from rich import print
|
| 10 |
from tasks import tasks
|
| 11 |
from tqdm.asyncio import tqdm_asyncio
|
| 12 |
-
from datasets_.util import load, save
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
n_sentences = int(environ.get("N_SENTENCES", 10))
|
| 16 |
-
n_languages = int(environ.get("N_LANGUAGES",
|
| 17 |
-
n_models = int(environ.get("N_MODELS",
|
| 18 |
|
| 19 |
async def evaluate():
|
| 20 |
start_time = time.time()
|
|
@@ -22,14 +22,17 @@ async def evaluate():
|
|
| 22 |
# Pre-compute model tasks to avoid O(n²) lookups
|
| 23 |
model_tasks = models.set_index("id")["tasks"].to_dict()
|
| 24 |
|
| 25 |
-
#
|
|
|
|
|
|
|
|
|
|
| 26 |
combis = [
|
| 27 |
(task_name, model, lang.bcp_47, i)
|
| 28 |
for i in range(n_sentences)
|
| 29 |
for lang in languages.head(n_languages).itertuples()
|
| 30 |
for task_name, task in tasks.items()
|
| 31 |
for model in models.iloc[:n_models]["id"]
|
| 32 |
-
if task_name in model_tasks[model]
|
| 33 |
]
|
| 34 |
combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
|
| 35 |
|
|
|
|
| 9 |
from rich import print
|
| 10 |
from tasks import tasks
|
| 11 |
from tqdm.asyncio import tqdm_asyncio
|
| 12 |
+
from datasets_.util import load, save, get_valid_task_languages
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
n_sentences = int(environ.get("N_SENTENCES", 10))
|
| 16 |
+
n_languages = int(environ.get("N_LANGUAGES", 1000))
|
| 17 |
+
n_models = int(environ.get("N_MODELS", 40))
|
| 18 |
|
| 19 |
async def evaluate():
|
| 20 |
start_time = time.time()
|
|
|
|
| 22 |
# Pre-compute model tasks to avoid O(n²) lookups
|
| 23 |
model_tasks = models.set_index("id")["tasks"].to_dict()
|
| 24 |
|
| 25 |
+
# Pre-compute valid languages for each task
|
| 26 |
+
valid_task_langs = {task_name: get_valid_task_languages(task_name) for task_name in tasks}
|
| 27 |
+
|
| 28 |
+
# get all combinations that need evaluation (filtering invalid lang×task combos)
|
| 29 |
combis = [
|
| 30 |
(task_name, model, lang.bcp_47, i)
|
| 31 |
for i in range(n_sentences)
|
| 32 |
for lang in languages.head(n_languages).itertuples()
|
| 33 |
for task_name, task in tasks.items()
|
| 34 |
for model in models.iloc[:n_models]["id"]
|
| 35 |
+
if task_name in model_tasks[model] and lang.bcp_47 in valid_task_langs[task_name]
|
| 36 |
]
|
| 37 |
combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
|
| 38 |
|
frontend/public/index.html
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
<meta name="theme-color" content="#000000" />
|
| 8 |
<meta
|
| 9 |
name="description"
|
| 10 |
-
content="AI Language
|
| 11 |
/>
|
| 12 |
<link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
|
| 13 |
<!--
|
|
@@ -24,7 +24,7 @@
|
|
| 24 |
work correctly both with client-side routing and a non-root public URL.
|
| 25 |
Learn how to configure a non-root public URL by running `npm run build`.
|
| 26 |
-->
|
| 27 |
-
<title>AI Language
|
| 28 |
</head>
|
| 29 |
<body>
|
| 30 |
<noscript>You need to enable JavaScript to run this app.</noscript>
|
|
|
|
| 7 |
<meta name="theme-color" content="#000000" />
|
| 8 |
<meta
|
| 9 |
name="description"
|
| 10 |
+
content="AI Language Benchmarks – model evaluations for every language in the world"
|
| 11 |
/>
|
| 12 |
<link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
|
| 13 |
<!--
|
|
|
|
| 24 |
work correctly both with client-side routing and a non-root public URL.
|
| 25 |
Learn how to configure a non-root public URL by running `npm run build`.
|
| 26 |
-->
|
| 27 |
+
<title>AI Language Benchmarks</title>
|
| 28 |
</head>
|
| 29 |
<body>
|
| 30 |
<noscript>You need to enable JavaScript to run this app.</noscript>
|
frontend/public/manifest.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
-
"short_name": "
|
| 3 |
-
"name": "
|
| 4 |
"icons": [
|
| 5 |
{
|
| 6 |
"src": "favicon.ico",
|
|
|
|
| 1 |
{
|
| 2 |
+
"short_name": "languagebench",
|
| 3 |
+
"name": "AI Language Benchmarks",
|
| 4 |
"icons": [
|
| 5 |
{
|
| 6 |
"src": "favicon.ico",
|
frontend/src/App.js
CHANGED
|
@@ -237,7 +237,7 @@ function App () {
|
|
| 237 |
letterSpacing: '-0.01em'
|
| 238 |
}}
|
| 239 |
>
|
| 240 |
-
AI Language
|
| 241 |
</h1>
|
| 242 |
<p
|
| 243 |
style={{
|
|
@@ -249,7 +249,7 @@ function App () {
|
|
| 249 |
lineHeight: '1.5'
|
| 250 |
}}
|
| 251 |
>
|
| 252 |
-
|
| 253 |
</p>
|
| 254 |
|
| 255 |
<div
|
|
@@ -449,8 +449,7 @@ function App () {
|
|
| 449 |
>
|
| 450 |
<div>
|
| 451 |
<p>
|
| 452 |
-
|
| 453 |
-
multilingual evaluation results of AI language models.
|
| 454 |
</p>
|
| 455 |
<h4>Who is this for?</h4>
|
| 456 |
<ul>
|
|
@@ -463,8 +462,7 @@ function App () {
|
|
| 463 |
neglected languages.
|
| 464 |
</li>
|
| 465 |
<li>
|
| 466 |
-
<b>Model developers</b> can compete on our
|
| 467 |
-
<i>AI Language Proficiency</i> metric.
|
| 468 |
</li>
|
| 469 |
</ul>
|
| 470 |
<h4>⚡ Live Updates</h4>
|
|
@@ -482,7 +480,7 @@ function App () {
|
|
| 482 |
</p>
|
| 483 |
<h4>Authors</h4>
|
| 484 |
<p>
|
| 485 |
-
|
| 486 |
BMZ's{' '}
|
| 487 |
<a
|
| 488 |
href='https://www.bmz-digital.global/en/overview-of-initiatives/the-bmz-data-lab/'
|
|
|
|
| 237 |
letterSpacing: '-0.01em'
|
| 238 |
}}
|
| 239 |
>
|
| 240 |
+
AI Language Benchmarks
|
| 241 |
</h1>
|
| 242 |
<p
|
| 243 |
style={{
|
|
|
|
| 249 |
lineHeight: '1.5'
|
| 250 |
}}
|
| 251 |
>
|
| 252 |
+
AI model evaluations for every language in the world
|
| 253 |
</p>
|
| 254 |
|
| 255 |
<div
|
|
|
|
| 449 |
>
|
| 450 |
<div>
|
| 451 |
<p>
|
| 452 |
+
<i>languagebench</i> provides AI model evaluations for every language in the world.
|
|
|
|
| 453 |
</p>
|
| 454 |
<h4>Who is this for?</h4>
|
| 455 |
<ul>
|
|
|
|
| 462 |
neglected languages.
|
| 463 |
</li>
|
| 464 |
<li>
|
| 465 |
+
<b>Model developers</b> can compete on our benchmarks.
|
|
|
|
| 466 |
</li>
|
| 467 |
</ul>
|
| 468 |
<h4>⚡ Live Updates</h4>
|
|
|
|
| 480 |
</p>
|
| 481 |
<h4>Authors</h4>
|
| 482 |
<p>
|
| 483 |
+
languagebench is a collaboration between
|
| 484 |
BMZ's{' '}
|
| 485 |
<a
|
| 486 |
href='https://www.bmz-digital.global/en/overview-of-initiatives/the-bmz-data-lab/'
|
frontend/src/components/CostPlot.js
CHANGED
|
@@ -31,7 +31,7 @@ const CostPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 31 |
tickFormat: d => USDollar.format(d)
|
| 32 |
},
|
| 33 |
y: {
|
| 34 |
-
label: '
|
| 35 |
},
|
| 36 |
symbol: {
|
| 37 |
legend: true
|
|
|
|
| 31 |
tickFormat: d => USDollar.format(d)
|
| 32 |
},
|
| 33 |
y: {
|
| 34 |
+
label: 'Overall Score'
|
| 35 |
},
|
| 36 |
symbol: {
|
| 37 |
legend: true
|
frontend/src/components/HistoryPlot.js
CHANGED
|
@@ -26,7 +26,7 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 26 |
tickFormat: '%Y-%m'
|
| 27 |
},
|
| 28 |
y: {
|
| 29 |
-
label: '
|
| 30 |
},
|
| 31 |
symbol: {
|
| 32 |
legend: true
|
|
|
|
| 26 |
tickFormat: '%Y-%m'
|
| 27 |
},
|
| 28 |
y: {
|
| 29 |
+
label: 'Overall Score'
|
| 30 |
},
|
| 31 |
symbol: {
|
| 32 |
legend: true
|
frontend/src/components/LanguagePlot.js
CHANGED
|
@@ -9,13 +9,13 @@ const LanguagePlot = ({ data, width = 750, height = 500 }) => {
|
|
| 9 |
const plot = Plot.plot({
|
| 10 |
width: width,
|
| 11 |
height: height,
|
| 12 |
-
subtitle: '
|
| 13 |
x: {
|
| 14 |
label: 'Number of Speakers',
|
| 15 |
type: 'log'
|
| 16 |
},
|
| 17 |
y: {
|
| 18 |
-
label: '
|
| 19 |
},
|
| 20 |
marks: [
|
| 21 |
Plot.dot(languages, {
|
|
|
|
| 9 |
const plot = Plot.plot({
|
| 10 |
width: width,
|
| 11 |
height: height,
|
| 12 |
+
subtitle: 'Overall scores by language',
|
| 13 |
x: {
|
| 14 |
label: 'Number of Speakers',
|
| 15 |
type: 'log'
|
| 16 |
},
|
| 17 |
y: {
|
| 18 |
+
label: 'Overall score'
|
| 19 |
},
|
| 20 |
marks: [
|
| 21 |
Plot.dot(languages, {
|
frontend/src/components/LanguageTierHistoryPlot.js
CHANGED
|
@@ -76,7 +76,7 @@ const LanguageTierHistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 76 |
tickFormat: '%Y-%m'
|
| 77 |
},
|
| 78 |
y: {
|
| 79 |
-
label: 'Language Tier
|
| 80 |
},
|
| 81 |
color: {
|
| 82 |
legend: true,
|
|
|
|
| 76 |
tickFormat: '%Y-%m'
|
| 77 |
},
|
| 78 |
y: {
|
| 79 |
+
label: 'Overall Score by Language Tier'
|
| 80 |
},
|
| 81 |
color: {
|
| 82 |
legend: true,
|
frontend/src/components/LicenseHistoryPlot.js
CHANGED
|
@@ -65,7 +65,7 @@ const LicenseHistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 65 |
tickFormat: '%Y-%m'
|
| 66 |
},
|
| 67 |
y: {
|
| 68 |
-
label: '
|
| 69 |
},
|
| 70 |
color: {
|
| 71 |
legend: true,
|
|
|
|
| 65 |
tickFormat: '%Y-%m'
|
| 66 |
},
|
| 67 |
y: {
|
| 68 |
+
label: 'Overall Score'
|
| 69 |
},
|
| 70 |
color: {
|
| 71 |
legend: true,
|
frontend/src/components/ScoreColumns.js
CHANGED
|
@@ -59,8 +59,8 @@ const createScoreColumn = (
|
|
| 59 |
const ScoreColumns = (machineTranslatedMetrics = []) => [
|
| 60 |
createScoreColumn(
|
| 61 |
'average',
|
| 62 |
-
'
|
| 63 |
-
'
|
| 64 |
0,
|
| 65 |
1,
|
| 66 |
machineTranslatedMetrics
|
|
|
|
| 59 |
const ScoreColumns = (machineTranslatedMetrics = []) => [
|
| 60 |
createScoreColumn(
|
| 61 |
'average',
|
| 62 |
+
'Overall',
|
| 63 |
+
'Overall Score (average of the scores for each task)',
|
| 64 |
0,
|
| 65 |
1,
|
| 66 |
machineTranslatedMetrics
|
frontend/src/components/WorldMap.js
CHANGED
|
@@ -63,7 +63,7 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
|
|
| 63 |
}).length
|
| 64 |
|
| 65 |
const plot = Plot.plot({
|
| 66 |
-
subtitle: `
|
| 67 |
width: width,
|
| 68 |
height: height,
|
| 69 |
projection: 'equal-earth',
|
|
|
|
| 63 |
}).length
|
| 64 |
|
| 65 |
const plot = Plot.plot({
|
| 66 |
+
subtitle: `Overall Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
|
| 67 |
width: width,
|
| 68 |
height: height,
|
| 69 |
projection: 'equal-earth',
|
notes/system-architecture-diagram.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
\[AI-generated, not 100% up-to-date\]
|
| 4 |
|
|
@@ -155,7 +155,7 @@ flowchart TD
|
|
| 155 |
- **languages.json**: Language information with population data
|
| 156 |
|
| 157 |
### 🟡 Frontend Visualization (Light Red)
|
| 158 |
-
- **WorldMap**: Interactive country-level
|
| 159 |
- **ModelTable**: Ranked model performance leaderboard with origin-specific columns
|
| 160 |
- **LanguageTable**: Language coverage and speaker statistics
|
| 161 |
- **DatasetTable**: Task-specific performance breakdowns with human/machine distinction
|
|
|
|
| 1 |
+
# languagebench - System Architecture
|
| 2 |
|
| 3 |
\[AI-generated, not 100% up-to-date\]
|
| 4 |
|
|
|
|
| 155 |
- **languages.json**: Language information with population data
|
| 156 |
|
| 157 |
### 🟡 Frontend Visualization (Light Red)
|
| 158 |
+
- **WorldMap**: Interactive country-level visualization
|
| 159 |
- **ModelTable**: Ranked model performance leaderboard with origin-specific columns
|
| 160 |
- **LanguageTable**: Language coverage and speaker statistics
|
| 161 |
- **DatasetTable**: Task-specific performance breakdowns with human/machine distinction
|