davidpomerenke committed on
Commit 34b05c6 · verified · 1 Parent(s): aa92add

Upload from GitHub Actions: flores filter for available dev split

README.md CHANGED
@@ -1,12 +1,12 @@
 ---
-title: AI Language Monitor
+title: languagebench
 emoji: 🌍
 colorFrom: purple
 colorTo: pink
 sdk: docker
 app_port: 8000
 license: cc-by-sa-4.0
-short_description: Evaluating LLM performance across all human languages.
+short_description: AI model evaluations for every language in the world.
 datasets:
 - openlanguagedata/flores_plus
 - google/fleurs
@@ -39,9 +39,9 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-purple)](https://huggingface.co/spaces/datenlabor-bmz/ai-language-monitor)
 
-# AI Language Monitor 🌍
+# languagebench 🌍
 
-_Tracking language proficiency of AI models for every language_
+_AI model evaluations for every language in the world_
 
 ## Evaluate
 
evals/backend.py CHANGED
@@ -267,7 +267,7 @@ def make_language_tier_history(scores_df, languages, models):
     )
     tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 500)}
 
-    # Calculate model-language proficiency scores
+    # Calculate model-language overall scores
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
     pivot = scores_df.pivot_table(
@@ -315,7 +315,7 @@ def make_license_history(scores_df, models):
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
 
-    # Pivot and compute proficiency
+    # Pivot and compute overall score
     pivot = scores_df.pivot_table(
         index="model", columns="task_metric", values="score", aggfunc="mean"
     )
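
For context, a minimal sketch of what the pivot in both hunks computes, on toy data. The column names match the diff; the final row-wise averaging is an assumption, consistent with the "average of the scores for each task" wording elsewhere in this commit.

```python
import pandas as pd

# Toy scores frame with the columns the hunks operate on (illustrative values only).
scores_df = pd.DataFrame({
    "model": ["m1", "m1", "m2", "m2"],
    "task": ["translation_from", "mmlu", "translation_from", "mmlu"],
    "metric": ["bleu", "accuracy", "bleu", "accuracy"],
    "score": [0.42, 0.61, 0.38, 0.55],
})
scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]

# One row per model, one column per task_metric, mean score per cell.
pivot = scores_df.pivot_table(
    index="model", columns="task_metric", values="score", aggfunc="mean"
)
# Assumption: the "overall score" in the renamed comments is the mean across task_metric columns.
overall = pivot.mean(axis=1)
print(overall)
```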
evals/datasets_/flores.py CHANGED
@@ -27,7 +27,16 @@ def aggregate_flores_paths(flores_paths):
     return flores_paths.values[populations.index(max(populations))]
 
 
+def has_dev_split(flores_path):
+    try:
+        _load_dataset(slug, subset=flores_path, split="dev")
+        return True
+    except (ValueError, FileNotFoundError):
+        return False
+
 flores = pd.DataFrame(splits, columns=["flores_path"])
+# Filter to only languages with 'dev' split
+flores = flores[flores["flores_path"].apply(has_dev_split)]
 flores["bcp_47"] = flores["flores_path"].apply(
     lambda x: standardize_bcp47(x, macro=True),
 )
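
A self-contained sketch of the new dev-split check, assuming `slug` and `_load_dataset` in the diff refer to the FLORES+ dataset slug and a `datasets.load_dataset` wrapper defined elsewhere in flores.py; the constant name below is hypothetical.

```python
from datasets import load_dataset

FLORES_SLUG = "openlanguagedata/flores_plus"  # hypothetical stand-in for the module's `slug`

def has_dev_split(flores_path: str) -> bool:
    """Return True if this FLORES+ language subset provides a 'dev' split."""
    try:
        # Mirror the exceptions caught in the diff; loading fails for subsets without a dev split.
        load_dataset(FLORES_SLUG, flores_path, split="dev")
        return True
    except (ValueError, FileNotFoundError):
        return False
```

Filtering at load time keeps languages without a dev split out of the language table, rather than letting downstream sampling fail later inside the tasks.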
evals/datasets_/util.py CHANGED
@@ -63,3 +63,24 @@ def save(df: pd.DataFrame, fname: str):
     ds.push_to_hub(f"fair-forward/evals-for-every-language-{fname}", token=TOKEN)
     Path("results").mkdir(exist_ok=True)
     df.to_json(f"results/{fname}.json", orient="records", force_ascii=False, indent=2)
+
+
+def get_valid_task_languages(task_name: str) -> set:
+    """Return set of bcp_47 codes that have data available for the given task."""
+    from datasets_.flores import flores, splits
+    from datasets_.mmlu import tags_afrimmlu, tags_global_mmlu, tags_mmlu_autotranslated
+    from datasets_.arc import tags_uhura_arc_easy, tags_uhura_arc_easy_translated
+    from datasets_.truthfulqa import tags_uhura_truthfulqa
+    from datasets_.mgsm import tags_mgsm, tags_afrimgsm, tags_gsm8kx, tags_gsm_autotranslated
+
+    if task_name in ["translation_from", "translation_to", "classification"]:
+        return set(flores["bcp_47"])
+    elif task_name == "mmlu":
+        return set([*tags_afrimmlu.keys(), *tags_global_mmlu.keys(), *tags_mmlu_autotranslated.keys()])
+    elif task_name == "arc":
+        return set([*tags_uhura_arc_easy.keys(), *tags_uhura_arc_easy_translated.keys()])
+    elif task_name == "truthfulqa":
+        return set(tags_uhura_truthfulqa.keys())
+    elif task_name == "mgsm":
+        return set([*tags_mgsm.keys(), *tags_afrimgsm.keys(), *tags_gsm8kx.keys(), *tags_gsm_autotranslated.keys()])
+    return set()
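
A brief usage sketch of the new helper; the task names are taken from the branches above, and it assumes the script is run with the repository's evals/ directory on the import path. The function keeps its imports local, presumably to avoid circular imports between util.py and the per-dataset modules.

```python
# Hypothetical quick check of per-task language coverage (counts are illustrative).
from datasets_.util import get_valid_task_languages

for task in ["translation_from", "mmlu", "arc", "truthfulqa", "mgsm"]:
    langs = get_valid_task_languages(task)
    print(f"{task}: {len(langs)} languages with data")
```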
evals/main.py CHANGED
@@ -9,12 +9,12 @@ from models import models
 from rich import print
 from tasks import tasks
 from tqdm.asyncio import tqdm_asyncio
-from datasets_.util import load, save
+from datasets_.util import load, save, get_valid_task_languages
 from tqdm import tqdm
 
 n_sentences = int(environ.get("N_SENTENCES", 10))
-n_languages = int(environ.get("N_LANGUAGES", 300))
-n_models = int(environ.get("N_MODELS", 35))
+n_languages = int(environ.get("N_LANGUAGES", 1000))
+n_models = int(environ.get("N_MODELS", 40))
 
 async def evaluate():
     start_time = time.time()
@@ -22,14 +22,17 @@ async def evaluate():
     # Pre-compute model tasks to avoid O(n²) lookups
     model_tasks = models.set_index("id")["tasks"].to_dict()
 
-    # get all combinations that need evaluation
+    # Pre-compute valid languages for each task
+    valid_task_langs = {task_name: get_valid_task_languages(task_name) for task_name in tasks}
+
+    # get all combinations that need evaluation (filtering invalid lang×task combos)
     combis = [
         (task_name, model, lang.bcp_47, i)
         for i in range(n_sentences)
         for lang in languages.head(n_languages).itertuples()
         for task_name, task in tasks.items()
         for model in models.iloc[:n_models]["id"]
-        if task_name in model_tasks[model]
+        if task_name in model_tasks[model] and lang.bcp_47 in valid_task_langs[task_name]
     ]
     combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
 
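A condensed sketch of the new filtering logic in isolation, with toy inputs; the real script builds these lookups from its models, languages, and tasks tables.

```python
import pandas as pd

# Toy stand-ins for the lookups built earlier in evaluate() (illustrative only).
model_tasks = {"model-a": ["mmlu", "translation_from"], "model-b": ["mmlu"]}
valid_task_langs = {"mmlu": {"en", "sw"}, "translation_from": {"en", "sw", "yo"}}
language_codes = ["en", "sw", "yo"]
n_sentences = 2

combis = [
    (task, model, lang, i)
    for i in range(n_sentences)
    for lang in language_codes
    for task in valid_task_langs
    for model in model_tasks
    # Same two-part guard as the diff: the model must support the task,
    # and the language must have data for that task.
    if task in model_tasks[model] and lang in valid_task_langs[task]
]
combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
print(len(combis))
```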
frontend/public/index.html CHANGED
@@ -7,7 +7,7 @@
     <meta name="theme-color" content="#000000" />
     <meta
       name="description"
-      content="AI Language Proficiency Monitor"
+      content="AI Language Benchmarks – model evaluations for every language in the world"
     />
     <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
     <!--
@@ -24,7 +24,7 @@
       work correctly both with client-side routing and a non-root public URL.
       Learn how to configure a non-root public URL by running `npm run build`.
     -->
-    <title>AI Language Proficiency Monitor</title>
+    <title>AI Language Benchmarks</title>
   </head>
   <body>
     <noscript>You need to enable JavaScript to run this app.</noscript>
frontend/public/manifest.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "short_name": "React App",
-  "name": "Create React App Sample",
+  "short_name": "languagebench",
+  "name": "AI Language Benchmarks",
   "icons": [
     {
       "src": "favicon.ico",
frontend/src/App.js CHANGED
@@ -237,7 +237,7 @@ function App () {
             letterSpacing: '-0.01em'
           }}
         >
-          AI Language Proficiency Monitor
+          AI Language Benchmarks
         </h1>
         <p
           style={{
@@ -249,7 +249,7 @@
             lineHeight: '1.5'
           }}
         >
-          Comprehensive multilingual evaluation results for AI language models
+          AI model evaluations for every language in the world
         </p>
 
         <div
@@ -449,8 +449,7 @@
       >
         <div>
           <p>
-            The <i>AI Language Proficiency Monitor</i> presents comprehensive
-            multilingual evaluation results of AI language models.
+            <i>languagebench</i> provides AI model evaluations for every language in the world.
           </p>
           <h4>Who is this for?</h4>
           <ul>
@@ -463,8 +462,7 @@
              neglected languages.
            </li>
            <li>
-              <b>Model developers</b> can compete on our{' '}
-              <i>AI Language Proficiency</i> metric.
+              <b>Model developers</b> can compete on our benchmarks.
            </li>
          </ul>
          <h4>⚡ Live Updates</h4>
@@ -482,7 +480,7 @@
           </p>
           <h4>Authors</h4>
           <p>
-            The AI Language Proficiency Monitor is a collaboration between
+            languagebench is a collaboration between
             BMZ's{' '}
             <a
               href='https://www.bmz-digital.global/en/overview-of-initiatives/the-bmz-data-lab/'
frontend/src/components/CostPlot.js CHANGED
@@ -31,7 +31,7 @@ const CostPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: d => USDollar.format(d)
    },
    y: {
-      label: 'Language Proficiency Score'
+      label: 'Overall Score'
    },
    symbol: {
      legend: true
frontend/src/components/HistoryPlot.js CHANGED
@@ -26,7 +26,7 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: '%Y-%m'
    },
    y: {
-      label: 'Language Proficiency Score'
+      label: 'Overall Score'
    },
    symbol: {
      legend: true
frontend/src/components/LanguagePlot.js CHANGED
@@ -9,13 +9,13 @@ const LanguagePlot = ({ data, width = 750, height = 500 }) => {
  const plot = Plot.plot({
    width: width,
    height: height,
-    subtitle: 'Proficiency scores by language',
+    subtitle: 'Overall scores by language',
    x: {
      label: 'Number of Speakers',
      type: 'log'
    },
    y: {
-      label: 'Language proficiency score'
+      label: 'Overall score'
    },
    marks: [
      Plot.dot(languages, {
frontend/src/components/LanguageTierHistoryPlot.js CHANGED
@@ -76,7 +76,7 @@ const LanguageTierHistoryPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: '%Y-%m'
    },
    y: {
-      label: 'Language Tier Proficiency Score'
+      label: 'Overall Score by Language Tier'
    },
    color: {
      legend: true,
frontend/src/components/LicenseHistoryPlot.js CHANGED
@@ -65,7 +65,7 @@ const LicenseHistoryPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: '%Y-%m'
    },
    y: {
-      label: 'Language Proficiency Score'
+      label: 'Overall Score'
    },
    color: {
      legend: true,
frontend/src/components/ScoreColumns.js CHANGED
@@ -59,8 +59,8 @@ const createScoreColumn = (
 const ScoreColumns = (machineTranslatedMetrics = []) => [
   createScoreColumn(
     'average',
-    'Proficiency',
-    'Language Proficiency Score (average of the scores for each task)',
+    'Overall',
+    'Overall Score (average of the scores for each task)',
     0,
     1,
     machineTranslatedMetrics
frontend/src/components/WorldMap.js CHANGED
@@ -63,7 +63,7 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
   }).length
 
   const plot = Plot.plot({
-    subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
+    subtitle: `Overall Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
     width: width,
     height: height,
     projection: 'equal-earth',
notes/system-architecture-diagram.md CHANGED
@@ -1,4 +1,4 @@
-# AI Language Monitor - System Architecture
+# languagebench - System Architecture
 
 \[AI-generated, not 100% up-to-date\]
 
@@ -155,7 +155,7 @@ flowchart TD
 - **languages.json**: Language information with population data
 
 ### 🟡 Frontend Visualization (Light Red)
-- **WorldMap**: Interactive country-level language proficiency visualization
+- **WorldMap**: Interactive country-level visualization
 - **ModelTable**: Ranked model performance leaderboard with origin-specific columns
 - **LanguageTable**: Language coverage and speaker statistics
 - **DatasetTable**: Task-specific performance breakdowns with human/machine distinction