davidpomerenke committed on
Commit 34b05c6 · verified · 1 Parent(s): aa92add

Upload from GitHub Actions: flores filter for available dev split

README.md CHANGED
@@ -1,12 +1,12 @@
 ---
-title: AI Language Monitor
+title: languagebench
 emoji: 🌍
 colorFrom: purple
 colorTo: pink
 sdk: docker
 app_port: 8000
 license: cc-by-sa-4.0
-short_description: Evaluating LLM performance across all human languages.
+short_description: AI model evaluations for every language in the world.
 datasets:
 - openlanguagedata/flores_plus
 - google/fleurs
@@ -39,9 +39,9 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-purple)](https://huggingface.co/spaces/datenlabor-bmz/ai-language-monitor)
 
-# AI Language Monitor 🌍
+# languagebench 🌍
 
-_Tracking language proficiency of AI models for every language_
+_AI model evaluations for every language in the world_
 
 ## Evaluate
 
evals/backend.py CHANGED
@@ -267,7 +267,7 @@ def make_language_tier_history(scores_df, languages, models):
     )
     tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 500)}
 
-    # Calculate model-language proficiency scores
+    # Calculate model-language overall scores
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
     pivot = scores_df.pivot_table(
@@ -315,7 +315,7 @@ def make_license_history(scores_df, models):
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
 
-    # Pivot and compute proficiency
+    # Pivot and compute overall score
     pivot = scores_df.pivot_table(
         index="model", columns="task_metric", values="score", aggfunc="mean"
     )
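
For context, a minimal sketch of what the pivot in both hunks computes, on toy data. The column names match the diff; the final row-wise averaging is an assumption, consistent with the "average of the scores for each task" wording elsewhere in this commit.

```python
import pandas as pd

# Toy scores frame with the columns the hunks operate on (illustrative values only).
scores_df = pd.DataFrame({
    "model": ["m1", "m1", "m2", "m2"],
    "task": ["translation_from", "mmlu", "translation_from", "mmlu"],
    "metric": ["bleu", "accuracy", "bleu", "accuracy"],
    "score": [0.42, 0.61, 0.38, 0.55],
})
scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]

# One row per model, one column per task_metric, mean score per cell.
pivot = scores_df.pivot_table(
    index="model", columns="task_metric", values="score", aggfunc="mean"
)
# Assumption: the "overall score" in the renamed comments is the mean across task_metric columns.
overall = pivot.mean(axis=1)
print(overall)
```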
evals/datasets_/flores.py CHANGED
@@ -27,7 +27,16 @@ def aggregate_flores_paths(flores_paths):
     return flores_paths.values[populations.index(max(populations))]
 
 
+def has_dev_split(flores_path):
+    try:
+        _load_dataset(slug, subset=flores_path, split="dev")
+        return True
+    except (ValueError, FileNotFoundError):
+        return False
+
 flores = pd.DataFrame(splits, columns=["flores_path"])
+# Filter to only languages with 'dev' split
+flores = flores[flores["flores_path"].apply(has_dev_split)]
 flores["bcp_47"] = flores["flores_path"].apply(
     lambda x: standardize_bcp47(x, macro=True),
 )
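
A self-contained sketch of the new dev-split check, assuming `slug` and `_load_dataset` in the diff refer to the FLORES+ dataset slug and a `datasets.load_dataset` wrapper defined elsewhere in flores.py; the constant name below is hypothetical.

```python
from datasets import load_dataset

FLORES_SLUG = "openlanguagedata/flores_plus"  # hypothetical stand-in for the module's `slug`

def has_dev_split(flores_path: str) -> bool:
    """Return True if this FLORES+ language subset provides a 'dev' split."""
    try:
        # Mirror the exceptions caught in the diff; loading fails for subsets without a dev split.
        load_dataset(FLORES_SLUG, flores_path, split="dev")
        return True
    except (ValueError, FileNotFoundError):
        return False
```

Filtering at load time keeps languages without a dev split out of the language table, rather than letting downstream sampling fail later inside the tasks.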
evals/datasets_/util.py CHANGED
@@ -63,3 +63,24 @@ def save(df: pd.DataFrame, fname: str):
     ds.push_to_hub(f"fair-forward/evals-for-every-language-{fname}", token=TOKEN)
     Path("results").mkdir(exist_ok=True)
     df.to_json(f"results/{fname}.json", orient="records", force_ascii=False, indent=2)
+
+
+def get_valid_task_languages(task_name: str) -> set:
+    """Return set of bcp_47 codes that have data available for the given task."""
+    from datasets_.flores import flores, splits
+    from datasets_.mmlu import tags_afrimmlu, tags_global_mmlu, tags_mmlu_autotranslated
+    from datasets_.arc import tags_uhura_arc_easy, tags_uhura_arc_easy_translated
+    from datasets_.truthfulqa import tags_uhura_truthfulqa
+    from datasets_.mgsm import tags_mgsm, tags_afrimgsm, tags_gsm8kx, tags_gsm_autotranslated
+
+    if task_name in ["translation_from", "translation_to", "classification"]:
+        return set(flores["bcp_47"])
+    elif task_name == "mmlu":
+        return set([*tags_afrimmlu.keys(), *tags_global_mmlu.keys(), *tags_mmlu_autotranslated.keys()])
+    elif task_name == "arc":
+        return set([*tags_uhura_arc_easy.keys(), *tags_uhura_arc_easy_translated.keys()])
+    elif task_name == "truthfulqa":
+        return set(tags_uhura_truthfulqa.keys())
+    elif task_name == "mgsm":
+        return set([*tags_mgsm.keys(), *tags_afrimgsm.keys(), *tags_gsm8kx.keys(), *tags_gsm_autotranslated.keys()])
+    return set()
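
A brief usage sketch of the new helper; the task names are taken from the branches above, and it assumes the script is run with the repository's evals/ directory on the import path. The function keeps its imports local, presumably to avoid circular imports between util.py and the per-dataset modules.

```python
# Hypothetical quick check of per-task language coverage (counts are illustrative).
from datasets_.util import get_valid_task_languages

for task in ["translation_from", "mmlu", "arc", "truthfulqa", "mgsm"]:
    langs = get_valid_task_languages(task)
    print(f"{task}: {len(langs)} languages with data")
```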
evals/main.py CHANGED
@@ -9,12 +9,12 @@ from models import models
 from rich import print
 from tasks import tasks
 from tqdm.asyncio import tqdm_asyncio
-from datasets_.util import load, save
+from datasets_.util import load, save, get_valid_task_languages
 from tqdm import tqdm
 
 n_sentences = int(environ.get("N_SENTENCES", 10))
-n_languages = int(environ.get("N_LANGUAGES", 300))
-n_models = int(environ.get("N_MODELS", 35))
+n_languages = int(environ.get("N_LANGUAGES", 1000))
+n_models = int(environ.get("N_MODELS", 40))
 
 async def evaluate():
     start_time = time.time()
@@ -22,14 +22,17 @@ async def evaluate():
     # Pre-compute model tasks to avoid O(n²) lookups
     model_tasks = models.set_index("id")["tasks"].to_dict()
 
-    # get all combinations that need evaluation
+    # Pre-compute valid languages for each task
+    valid_task_langs = {task_name: get_valid_task_languages(task_name) for task_name in tasks}
+
+    # get all combinations that need evaluation (filtering invalid lang×task combos)
     combis = [
         (task_name, model, lang.bcp_47, i)
         for i in range(n_sentences)
         for lang in languages.head(n_languages).itertuples()
         for task_name, task in tasks.items()
         for model in models.iloc[:n_models]["id"]
-        if task_name in model_tasks[model]
+        if task_name in model_tasks[model] and lang.bcp_47 in valid_task_langs[task_name]
     ]
     combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
 
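A condensed sketch of the new filtering logic in isolation, with toy inputs; the real script builds these lookups from its models, languages, and tasks tables.

```python
import pandas as pd

# Toy stand-ins for the lookups built earlier in evaluate() (illustrative only).
model_tasks = {"model-a": ["mmlu", "translation_from"], "model-b": ["mmlu"]}
valid_task_langs = {"mmlu": {"en", "sw"}, "translation_from": {"en", "sw", "yo"}}
language_codes = ["en", "sw", "yo"]
n_sentences = 2

combis = [
    (task, model, lang, i)
    for i in range(n_sentences)
    for lang in language_codes
    for task in valid_task_langs
    for model in model_tasks
    # Same two-part guard as the diff: the model must support the task,
    # and the language must have data for that task.
    if task in model_tasks[model] and lang in valid_task_langs[task]
]
combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
print(len(combis))
```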
frontend/public/index.html CHANGED
@@ -7,7 +7,7 @@
     <meta name="theme-color" content="#000000" />
     <meta
       name="description"
-      content="AI Language Proficiency Monitor"
+      content="AI Language Benchmarks – model evaluations for every language in the world"
     />
     <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
     <!--
@@ -24,7 +24,7 @@
       work correctly both with client-side routing and a non-root public URL.
       Learn how to configure a non-root public URL by running `npm run build`.
     -->
-    <title>AI Language Proficiency Monitor</title>
+    <title>AI Language Benchmarks</title>
   </head>
   <body>
     <noscript>You need to enable JavaScript to run this app.</noscript>
frontend/public/manifest.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "short_name": "React App",
-  "name": "Create React App Sample",
+  "short_name": "languagebench",
+  "name": "AI Language Benchmarks",
   "icons": [
     {
       "src": "favicon.ico",
frontend/src/App.js CHANGED
@@ -237,7 +237,7 @@ function App () {
             letterSpacing: '-0.01em'
           }}
         >
-          AI Language Proficiency Monitor
+          AI Language Benchmarks
         </h1>
         <p
           style={{
@@ -249,7 +249,7 @@
             lineHeight: '1.5'
           }}
         >
-          Comprehensive multilingual evaluation results for AI language models
+          AI model evaluations for every language in the world
         </p>
 
         <div
@@ -449,8 +449,7 @@
       >
         <div>
           <p>
-            The <i>AI Language Proficiency Monitor</i> presents comprehensive
-            multilingual evaluation results of AI language models.
+            <i>languagebench</i> provides AI model evaluations for every language in the world.
           </p>
           <h4>Who is this for?</h4>
           <ul>
@@ -463,8 +462,7 @@
              neglected languages.
            </li>
            <li>
-              <b>Model developers</b> can compete on our{' '}
-              <i>AI Language Proficiency</i> metric.
+              <b>Model developers</b> can compete on our benchmarks.
            </li>
          </ul>
          <h4>⚡ Live Updates</h4>
@@ -482,7 +480,7 @@
           </p>
           <h4>Authors</h4>
           <p>
-            The AI Language Proficiency Monitor is a collaboration between
+            languagebench is a collaboration between
             BMZ's{' '}
             <a
               href='https://www.bmz-digital.global/en/overview-of-initiatives/the-bmz-data-lab/'
frontend/src/components/CostPlot.js CHANGED
@@ -31,7 +31,7 @@ const CostPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: d => USDollar.format(d)
    },
    y: {
-      label: 'Language Proficiency Score'
+      label: 'Overall Score'
    },
    symbol: {
      legend: true
frontend/src/components/HistoryPlot.js CHANGED
@@ -26,7 +26,7 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: '%Y-%m'
    },
    y: {
-      label: 'Language Proficiency Score'
+      label: 'Overall Score'
    },
    symbol: {
      legend: true
frontend/src/components/LanguagePlot.js CHANGED
@@ -9,13 +9,13 @@ const LanguagePlot = ({ data, width = 750, height = 500 }) => {
  const plot = Plot.plot({
    width: width,
    height: height,
-    subtitle: 'Proficiency scores by language',
+    subtitle: 'Overall scores by language',
    x: {
      label: 'Number of Speakers',
      type: 'log'
    },
    y: {
-      label: 'Language proficiency score'
+      label: 'Overall score'
    },
    marks: [
      Plot.dot(languages, {
frontend/src/components/LanguageTierHistoryPlot.js CHANGED
@@ -76,7 +76,7 @@ const LanguageTierHistoryPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: '%Y-%m'
    },
    y: {
-      label: 'Language Tier Proficiency Score'
+      label: 'Overall Score by Language Tier'
    },
    color: {
      legend: true,
frontend/src/components/LicenseHistoryPlot.js CHANGED
@@ -65,7 +65,7 @@ const LicenseHistoryPlot = ({ data, width = 750, height = 500 }) => {
      tickFormat: '%Y-%m'
    },
    y: {
-      label: 'Language Proficiency Score'
+      label: 'Overall Score'
    },
    color: {
      legend: true,
frontend/src/components/ScoreColumns.js CHANGED
@@ -59,8 +59,8 @@ const createScoreColumn = (
 const ScoreColumns = (machineTranslatedMetrics = []) => [
   createScoreColumn(
     'average',
-    'Proficiency',
-    'Language Proficiency Score (average of the scores for each task)',
+    'Overall',
+    'Overall Score (average of the scores for each task)',
     0,
     1,
     machineTranslatedMetrics
frontend/src/components/WorldMap.js CHANGED
@@ -63,7 +63,7 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
   }).length
 
   const plot = Plot.plot({
-    subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
+    subtitle: `Overall Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
     width: width,
     height: height,
     projection: 'equal-earth',
notes/system-architecture-diagram.md CHANGED
@@ -1,4 +1,4 @@
-# AI Language Monitor - System Architecture
+# languagebench - System Architecture
 
 \[AI-generated, not 100% up-to-date\]
 
@@ -155,7 +155,7 @@ flowchart TD
 - **languages.json**: Language information with population data
 
 ### 🟡 Frontend Visualization (Light Red)
-- **WorldMap**: Interactive country-level language proficiency visualization
+- **WorldMap**: Interactive country-level visualization
 - **ModelTable**: Ranked model performance leaderboard with origin-specific columns
 - **LanguageTable**: Language coverage and speaker statistics
 - **DatasetTable**: Task-specific performance breakdowns with human/machine distinction