Upload from GitHub Actions: Add auto-translated datasets
- .DS_Store +0 -0
- .github/workflows/nightly-evals.yml +0 -4
- .gitignore +0 -3
- Dockerfile +1 -1
- README.md +0 -135
- datasets.json +6 -6
- evals/backend.py +39 -138
- evals/countries.py +4 -9
- evals/datasets_/arc.py +19 -33
- evals/datasets_/mgsm.py +24 -36
- evals/datasets_/mmlu.py +23 -45
- evals/datasets_/truthfulqa.py +26 -53
- evals/datasets_/util.py +0 -7
- evals/main.py +51 -161
- evals/models.py +44 -146
- evals/tasks.py +168 -160
- frontend/src/App.js +77 -183
- frontend/src/components/HistoryPlot.js +2 -2
- frontend/src/components/LanguageTable.js +1 -1
- frontend/src/components/ModelTable.js +17 -31
- frontend/src/components/ScoreColumns.js +10 -23
- frontend/src/components/ScoreField.js +1 -2
- frontend/src/components/SpeakerPlot.js +2 -2
- frontend/src/components/WorldMap.js +7 -22
- languages.json +49 -49
- models.json +226 -432
- pyproject.toml +0 -10
- results.json +2 -2
- uv.lock +0 -0
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
.github/workflows/nightly-evals.yml
CHANGED
@@ -8,8 +8,6 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
-    # checking if this is working in case eval runs take longer than 6h github actions allowance
-    timeout-minutes: 1440 # 24 hours timeout
     steps:
       - uses: actions/checkout@v3

@@ -27,8 +25,6 @@ jobs:
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-          N_SENTENCES: 20
-          MAX_LANGUAGES: 150
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py
.gitignore
CHANGED
@@ -20,6 +20,3 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
-
-# Project-specific files
-.dockerignore.eval
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock
+COPY --chown=user pyproject.toml uv.lock ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md
CHANGED
@@ -43,147 +43,12 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 _Tracking language proficiency of AI models for every language_
 
-## System Architecture
-
-The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
-
-```mermaid
-flowchart TD
-    %% Model Sources
-    A1["important_models<br/>Static Curated List"] --> D[load_models]
-    A2["get_historical_popular_models<br/>Web Scraping - Top 20"] --> D
-    A3["get_current_popular_models<br/>Web Scraping - Top 10"] --> D
-    A4["blocklist<br/>Exclusions"] --> D
-
-    %% Model Processing
-    D --> |"Combine & Dedupe"| E["Dynamic Model List<br/>~40-50 models"]
-    E --> |get_or_metadata| F["OpenRouter API<br/>Model Metadata"]
-    F --> |get_hf_metadata| G["HuggingFace API<br/>Model Details"]
-    G --> H["Enriched Model DataFrame"]
-    H --> |Save| I[models.json]
-
-    %% Model Validation & Cost Filtering
-    H --> |"Validate Models<br/>Check API Availability"| H1["Valid Models Only<br/>Cost ≤ $20/1M tokens"]
-    H1 --> |"Timeout Protection<br/>120s for Large Models"| H2["Robust Model List"]
-
-    %% Language Data
-    J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]
-
-    %% Task Registry with Unified Prompting
-    L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions<br/>Unified English Zero-Shot"]
-    M --> M1["translation_from/to<br/>BLEU + ChrF"]
-    M --> M2["classification<br/>Accuracy"]
-    M --> M3["mmlu<br/>Accuracy"]
-    M --> M4["arc<br/>Accuracy"]
-    M --> M5["truthfulqa<br/>Accuracy"]
-    M --> M6["mgsm<br/>Accuracy"]
-
-    %% On-the-fly Translation with Origin Tagging
-    subgraph OTF [On-the-fly Dataset Translation]
-        direction LR
-        DS_raw["Raw English Dataset<br/>(e.g., MMLU)"] --> Google_Translate["Google Translate API"]
-        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., German MMLU)<br/>Origin: 'machine'"]
-        DS_native["Native Dataset<br/>(e.g., German MMLU)<br/>Origin: 'human'"]
-    end
-
-    %% Evaluation Pipeline
-    H2 --> |"models ID"| N["main.py / main_gcs.py<br/>evaluate"]
-    K --> |"languages bcp_47"| N
-    L --> |"tasks.items"| N
-    N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model × Language × Task"]
-    O --> |"10 samples each"| P["Evaluation Execution<br/>Batch Processing"]
-
-    %% Task Execution with Origin Tracking
-    P --> Q1[translate_and_evaluate<br/>Origin: 'human']
-    P --> Q2[classify_and_evaluate<br/>Origin: 'human']
-    P --> Q3[mmlu_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q4[arc_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q6[mgsm_and_evaluate<br/>Origin: 'human'/'machine']
-
-    %% API Calls with Error Handling
-    Q1 --> |"complete() API<br/>Rate Limiting"| R["OpenRouter<br/>Model Inference"]
-    Q2 --> |"complete() API<br/>Rate Limiting"| R
-    Q3 --> |"complete() API<br/>Rate Limiting"| R
-    Q4 --> |"complete() API<br/>Rate Limiting"| R
-    Q5 --> |"complete() API<br/>Rate Limiting"| R
-    Q6 --> |"complete() API<br/>Rate Limiting"| R
-
-    %% Results Processing with Origin Aggregation
-    R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task+origin"]
-    S --> |Save| T[results.json]
-
-    %% Backend & Frontend with Origin-Specific Metrics
-    T --> |Read| U[backend.py]
-    I --> |Read| U
-    U --> |make_model_table| V["Model Rankings<br/>Origin-Specific Metrics"]
-    U --> |make_country_table| W["Country Aggregation"]
-    U --> |"API Endpoint"| X["FastAPI /api/data<br/>arc_accuracy_human<br/>arc_accuracy_machine"]
-    X --> |"JSON Response"| Y["Frontend React App"]
-
-    %% UI Components
-    Y --> Z1["WorldMap.js<br/>Country Visualization"]
-    Y --> Z2["ModelTable.js<br/>Model Rankings"]
-    Y --> Z3["LanguageTable.js<br/>Language Coverage"]
-    Y --> Z4["DatasetTable.js<br/>Task Performance"]
-
-    %% Data Sources with Origin Information
-    subgraph DS ["Data Sources"]
-        DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
-        DS2["MMLU/AfriMMLU<br/>Knowledge QA<br/>Origin: 'human'"]
-        DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
-        DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
-        DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
-    end
-
-    DS1 --> Q1
-    DS2 --> Q3
-    DS3 --> Q4
-    DS4 --> Q5
-    DS5 --> Q6
-
-    DS_translated --> Q3
-    DS_translated --> Q4
-    DS_translated --> Q5
-
-    DS_native --> Q3
-    DS_native --> Q4
-    DS_native --> Q5
-
-    %% Styling - Neutral colors that work in both dark and light modes
-    classDef modelSource fill:#f8f9fa,stroke:#6c757d,color:#212529
-    classDef evaluation fill:#e9ecef,stroke:#495057,color:#212529
-    classDef api fill:#dee2e6,stroke:#6c757d,color:#212529
-    classDef storage fill:#d1ecf1,stroke:#0c5460,color:#0c5460
-    classDef frontend fill:#f8d7da,stroke:#721c24,color:#721c24
-    classDef translation fill:#d4edda,stroke:#155724,color:#155724
-
-    class A1,A2,A3,A4 modelSource
-    class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
-    class R,F,G,X api
-    class T,I storage
-    class Y,Z1,Z2,Z3,Z4 frontend
-    class Google_Translate,DS_translated,DS_native translation
-```
-
-**Key Features:**
-- **Model Discovery**: Combines curated models with real-time trending models via web scraping
-- **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
-- **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
-- **Real-time Visualization**: Interactive web interface with country-level insights
-
 ## Evaluate
 
-### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
 
-### Google Cloud Deployment
-```bash
-uv run --extra dev evals/main_gcs.py
-```
-
 ## Explore
 
 ```bash
datasets.json
CHANGED
@@ -219,7 +219,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented":
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented":
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented":
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented":
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -420,7 +420,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented":
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
@@ -435,7 +435,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented":
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
evals/backend.py
CHANGED
@@ -4,18 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-
-# Robust import so this file works both as a package module and as a script
-try:
-    # When executed as a package module (recommended): `python -m uvicorn evals.backend:app`
-    from .countries import make_country_table
-except Exception:
-    try:
-        # When executed from project root with package path available
-        from evals.countries import make_country_table
-    except Exception:
-        # When executed directly from evals/ directory
-        from countries import make_country_table
+from countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware

@@ -37,7 +26,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    "truthfulqa_accuracy",
+    # "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]

@@ -56,145 +45,66 @@ def compute_normalized_average(df, metrics):
     return normalized_df.mean(axis=1, skipna=False)
 
 
-def make_model_table(
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="model",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="model", columns="task_metric", values="score", aggfunc="mean"
+def make_model_table(df, models):
+    df = (
+        df.groupby(["model", "task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique"})
+        .reset_index()
     )
-
-    df =
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="model", columns="task_metric", values="score")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per model+metric
-    origin_presence = (
-        scores_df.groupby(["model", "task_metric", "origin"]).size().unstack(fill_value=0)
-    )
-    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for model in df.index:
-                try:
-                    counts = origin_presence.loc[(model, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
-            "rank",
-            "
+            "rank",
+            "model",
+            "name",
+            "provider_name",
+            "hf_id",
+            "creation_date",
+            "size",
+            "type",
+            "license",
+            "cost",
+            "average",
+            *task_metrics,
         ]
     ]
     return df
 
 
-def make_language_table(
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="bcp_47",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+def make_language_table(df, languages):
+    df = (
+        df.groupby(["bcp_47", "task", "metric"])
+        .agg({"score": "mean", "model": "nunique"})
+        .reset_index()
     )
-
-    df =
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
-    origin_presence = (
-        scores_df.groupby(["bcp_47", "task_metric", "origin"]).size().unstack(fill_value=0)
-    )
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for bcp in df.index:
-                try:
-                    counts = origin_presence.loc[(bcp, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
-
-    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
-    for metric in task_metrics:
-        machine_col = f"{metric}_machine"
-        if machine_col in df.columns:
-            df[f"{metric}_is_machine"] = df[machine_col].notna()
-        else:
-            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
-            "bcp_47",
-            "
+            "bcp_47",
+            "language_name",
+            "autonym",
+            "speakers",
+            "family",
+            "average",
+            "in_benchmark",
+            *task_metrics,
         ]
     ]
     return df

@@ -215,18 +125,10 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = scores.groupby(["model", "bcp_47", "task", "metric"
+    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
-
-    # Identify which metrics have machine translations available
-    machine_translated_metrics = set()
-    for _, row in df.iterrows():
-        if row["origin"] == "machine":
-            metric_name = f"{row['task']}_{row['metric']}"
-            machine_translated_metrics.add(metric_name)
-
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]

@@ -241,7 +143,6 @@ async def data(request: Request):
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
-        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
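Note (illustration, not part of the diff): the new `make_model_table` replaces the origin-aware pivot with a plain groupby-then-pivot. A minimal, self-contained sketch of that pattern on made-up scores (toy data, not taken from the repo):

```python
import pandas as pd

# Toy long-format scores: one row per (model, task, metric, language).
scores = pd.DataFrame({
    "model":  ["m1", "m1", "m2", "m2"],
    "task":   ["mmlu", "arc", "mmlu", "arc"],
    "metric": ["accuracy"] * 4,
    "bcp_47": ["de", "fr", "de", "sw"],
    "score":  [0.8, 0.6, 0.7, 0.5],
})

# Same shape of pipeline as the new make_model_table: average per model/task/metric,
# build a "task_metric" column, then pivot to one row per model.
df = (
    scores.groupby(["model", "task", "metric"])
    .agg({"score": "mean", "bcp_47": "nunique"})
    .reset_index()
)
df["task_metric"] = df["task"] + "_" + df["metric"]
df = df.pivot(index="model", columns="task_metric", values="score")
print(df)  # columns: arc_accuracy, mmlu_accuracy; one row per model
```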
evals/countries.py
CHANGED
@@ -30,15 +30,10 @@ def make_country_table(language_table):
     )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-
-
-
-
-        score = (
-            sum(entry["score"] * entry["population"] for entry in languages)
-            / speaker_pop
-        )
-
+        score = (
+            sum(entry["score"] * entry["population"] for entry in languages)
+            / speaker_pop
+        )
         countries[country] = {
             "score": score,
             "languages": languages,
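For reference, the country score computed above is a population-weighted mean of the per-language scores; a tiny standalone example with made-up numbers:

```python
# Two languages spoken in one country, with per-language scores and speaker counts.
languages = [
    {"score": 0.8, "population": 60_000_000},
    {"score": 0.4, "population": 20_000_000},
]

speaker_pop = sum(entry["population"] for entry in languages)
score = sum(entry["score"] * entry["population"] for entry in languages) / speaker_pop
print(round(score, 2))  # 0.7 = (0.8*60M + 0.4*20M) / 80M
```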
evals/datasets_/arc.py
CHANGED
@@ -1,10 +1,11 @@
 import random
+from collections import Counter, defaultdict
 
-from langcodes import standardize_tag
+from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google,
+from models import translate_google, google_supported_languages
 from tqdm import tqdm
-from datasets import
+from datasets import Dataset, load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os

@@ -13,33 +14,27 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 
 
 random.seed(42)
-id_sets_train = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
 
+
+
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row

@@ -50,36 +45,27 @@ def load_uhura_arc_easy(language_bcp_47, nr):
     ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
     ds = ds.map(add_choices)
     ds = ds.rename_column("answerKey", "answer")
+    train_ids = common_ids_train[nr:nr+3]
+    examples = ds["train"].filter(lambda x: x["id"] in train_ids)
     task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-    return "masakhane/uhura-arc-easy",
+    return "masakhane/uhura-arc-easy", examples, task
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
-            slug_uhura_arc_easy_translated,
-            tags_uhura_arc_easy_translated[language_bcp_47],
-        )
+        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
+        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated",
+        return "fair-forward/arc-easy-autotranslated", examples, task
     else:
         return None, None, None
 
-
-def load_uhura_arc_challenge(language_bcp_47, nr):
-    ds_name = "jlahd/uhura_arc_challenge"
-    if language_bcp_47 in _get_dataset_config_names(ds_name):
-        ds = _load_dataset(ds_name, language_bcp_47)
-        task = ds["test"][nr]
-        return ds_name, task
-    else:
-        return None, None, None
-
-
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in google_supported_languages
     ]
     n_samples = 10
     train_ids = common_ids_train[:n_samples+3]
evals/datasets_/mgsm.py
CHANGED
@@ -1,12 +1,10 @@
 import asyncio
 import os
-import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
-from langcodes import
-from models import
-from rich import print
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import standardize_tag
+from models import google_supported_languages, translate_google
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
 

@@ -39,41 +37,31 @@ def parse_number(i):
         return None
 
 
-@cache
-def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
-    """Cache individual MGSM items efficiently"""
-    try:
-        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
-        if nr >= len(ds):
-            return None
-
-        row = ds[nr]
-
-        # Post-process based on dataset type
-        if dataset_slug == slug_gsm8kx:
-            row["answer_number"] = row["answer"].split("####")[1].strip()
-
-        return row
-    except Exception:
-        # Dataset doesn't exist or doesn't have test split
-        return None
-
-
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        return slug_mgsm,
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
     elif language_bcp_47 in tags_afrimgsm.keys():
-        return slug_gsm8kx, item, "machine" if item else (None, None, None)
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
+        )
+        return slug_afrimgsm, ds[nr]
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
+        ds = _load_dataset(
+            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
+        )
+        return slug_gsm_autotranslated, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
     else:
-        return None, None
+        return None, None
 
 
 def translate_mgsm(languages):

@@ -81,7 +69,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in google_supported_languages
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
evals/datasets_/mmlu.py
CHANGED
@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import
+from models import google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio

@@ -111,7 +111,6 @@ def print_datasets_analysis():
     # MMLUX is translated using DeepL
     # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-
 # print_datasets_analysis()
 
 

@@ -144,51 +143,32 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated =
-    standardize_tag(a, macro=True): a
-    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
-}
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 
 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
 
 
-
-def _get_processed_mmlu_dataset(dataset_name, subset_tag):
-    """Cache processed datasets to avoid reprocessing"""
-    ds = _load_dataset(dataset_name, subset_tag)
-    if dataset_name == "masakhane/afrimmlu":
-        ds = ds.map(parse_choices)
-    elif dataset_name == "CohereForAI/Global-MMLU":
-        ds = ds.map(add_choices)
-    return ds
-
-
-@cache
-def _get_mmlu_item(dataset_name, subset_tag, category, nr):
-    """Cache individual MMLU items efficiently"""
-    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
-    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-    else:  # fair-forward/mmlu-autotranslated
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-
-
-async def load_mmlu(language_bcp_47, nr):
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        ds = ds.map(parse_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_mmlu_autotranslated:
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
 

@@ -197,10 +177,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:
-        if lang not in human_translated and lang in
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples =
+    n_samples = 10
 
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):

@@ -216,10 +196,8 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-
-
-                for i in range(min(n_samples, len(filtered))):
-                    task = filtered[i]
+                for i in range(n_samples):
+                    task = ds.filter(lambda x: x["subject"] == category)[i]
                     samples.append(task)
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
evals/datasets_/truthfulqa.py
CHANGED
@@ -9,26 +9,16 @@ from tqdm.asyncio import tqdm_asyncio
 import os
 
 from datasets import Dataset, load_dataset
-from models import translate_google,
+from models import translate_google, google_supported_languages
 
 from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
-slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
-
 tags_uhura_truthfulqa = {
     standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
 
-# Get available auto-translated languages
-try:
-    tags_truthfulqa_autotranslated = {
-        standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
-    }
-except Exception:
-    tags_truthfulqa_autotranslated = {}
-
 
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]

@@ -36,36 +26,27 @@ def add_choices(row):
     return row
 
 
-
+def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
-            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
-        )
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
         ds = ds.map(add_choices)
+        examples = ds["train"]
         task = ds["test"][nr]
-        return "masakhane/uhura-truthfulqa",
-    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
-        # Load from auto-translated dataset (same samples as translation)
-        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
-        test_split = ds["test"] if "test" in ds else ds
-        task = test_split[nr]
-        return slug_truthfulqa_autotranslated, task, "machine"
-        # TODO: add Okapi, TruthfulQA-X @Jonas
+        return "masakhane/uhura-truthfulqa", examples, task
     else:
         return None, None, None
 
+
+
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:
-        if lang not in human_translated and lang in
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples =
+    n_samples = 10
 
-    # Set fixed seed for consistent sample selection across all languages
-    random.seed(42)
-
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub

@@ -79,40 +60,32 @@ def translate_truthfulqa(languages):
             if split == "train":
                 samples.extend(ds)
             else:
-
-                for i in range(min(n_samples, len(ds))):
+                for i in range(n_samples):
                     task = ds[i]
                     samples.append(task)
-
-            # Translate questions
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
             ]
             questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
-            # Translate choices for each sample
-            all_choices_tr = []
-            all_labels = []
-
+            choices_texts_concatenated = []
             for s in samples:
-                choices_tr
-                all_choices_tr.append(choices_tr)
-                all_labels.append(labels)
+                for choice in eval(s["choices"]):
+                    choices_texts_concatenated.append(choice)
+            choices_tr = [
+                translate_google(c, "en", lang) for c in choices_texts_concatenated
+            ]
+            choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+            # group into chunks of 4
+            choices_tr = [
+                choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+            ]
 
             ds_lang = Dataset.from_dict(
                 {
+                    "subject": [s["subject"] for s in samples],
                     "question": questions_tr,
-                    "choices":
-                    "
+                    "choices": choices_tr,
+                    "answer": [s["answer"] for s in samples],
                 }
             )
             ds_lang.push_to_hub(

@@ -122,7 +95,7 @@ def translate_truthfulqa(languages):
                 token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
             )
             ds_lang.to_json(
-                f"data/translations/
+                f"data/translations/mmlu/{lang}_{split}.json",
                 lines=False,
                 force_ascii=False,
                 indent=2,
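Note (illustration, not part of the diff): the new choice translation in `translate_truthfulqa` flattens all answer options, translates them in one batch, and regroups the results into lists of four per question. A minimal sketch of just the flatten/regroup step, with a stand-in in place of the real `translate_google` call:

```python
# Each sample carries four answer choices; flatten them so one batch of
# translation calls covers every choice, then regroup into chunks of 4.
samples = [
    {"choices": ["A1", "A2", "A3", "A4"]},
    {"choices": ["B1", "B2", "B3", "B4"]},
]

flat = [choice for s in samples for choice in s["choices"]]
translated = [f"<tr:{c}>" for c in flat]  # stand-in for translate_google(c, "en", lang)
regrouped = [translated[i : i + 4] for i in range(0, len(translated), 4)]
print(regrouped)  # [['<tr:A1>', ..., '<tr:A4>'], ['<tr:B1>', ..., '<tr:B4>']]
```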
evals/datasets_/util.py
CHANGED
@@ -12,10 +12,3 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
-
-# Cache individual dataset items to avoid reloading entire datasets
-@cache
-def _get_dataset_item(dataset, subset, split, index, **kwargs):
-    """Load a single item from a dataset efficiently"""
-    ds = load_dataset(dataset, subset, split=split, **kwargs)
-    return ds[index] if index < len(ds) else None
evals/main.py
CHANGED
@@ -1,172 +1,62 @@
 import asyncio
 import pandas as pd
-import
-from datetime import datetime, timedelta
 from models import models
 from tasks import tasks
-from
-import os
 
-
-# Configuration - easily adjustable defaults
-n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
-max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
-single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
-test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
-
-# Keep original DataFrames for saving metadata - distinction added for single model test runs.
-original_models_df = pd.DataFrame(models)
-original_languages_df = pd.DataFrame(languages)
-
-# Create working copies for single evaluation runs
-models_df = original_models_df.copy()
-languages_df = original_languages_df.copy()
-top_languages = languages.head(max_languages)
-
-# Filter to single model if specified (only affects evaluation, not saving)
-if single_model:
-    models_df = models_df[models_df["id"] == single_model]
-    if len(models_df) == 0:
-        print(f"Error: Model '{single_model}' not found. Available models:")
-        for model_id in original_models_df["id"]:
-            print(f"  {model_id}")
-        return pd.DataFrame()
 
-
-
-    start_time = time.time()
-
-    # Load existing results to avoid re-evaluation (skip in test mode)
-    if test_mode:
-        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-    else:
-        try:
-            old_results = pd.read_json("results.json")
-            if old_results.empty:
-                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-        except FileNotFoundError:
-            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
-    # Get all combinations that need evaluation
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for model in models_df["id"]
-        for lang in top_languages.itertuples()
-        for task_name, task in tasks.items()
-        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-    ]
-
-    # Filter out already evaluated combinations
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    if not old_results.empty:
-        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
-        # set + combis is faster than merge (locally it made a difference for me when loading all data/tasks into memory)
-        mask = ~combis.apply(lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1)
-        combis = combis[mask]
-
-    # Create all evaluation tasks
-    all_tasks = []
-    for i in range(n_sentences):
-        for model, bcp_47, task_name in combis.itertuples(index=False):
-            all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-    print(f"Running {len(all_tasks)} evaluation tasks...")
-
-    # For single model runs, we stop immediately on first API error to inspect.
-    # For full evaluations, we continue despite errors to get maximum coverage.
-    stop_on_error = single_model is not None
-
-    # Process tasks in batches to avoid memory issues (for full evaluation locally that helped a lot)
-    batch_size = 1000
-    all_results = []
-
-    try:
-        for i in range(0, len(all_tasks), batch_size):
-            batch = all_tasks[i:i + batch_size]
-            batch_results = await asyncio.gather(
-                *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in batch],
-                return_exceptions=not stop_on_error
-            )
-            all_results.extend(batch_results)
-
-        results = all_results
-
-        # Process results and logging API errors separately to understand what are the main issues.
-        valid_results = []
-        errors = []
-
-        for i, r in enumerate(results):
-            if isinstance(r, Exception):
-                if i < len(all_tasks):
-                    task_info = all_tasks[i]
-                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
-            elif isinstance(r, list):
-                valid_results.extend(r)
-            elif r is not None:
-                valid_results.append(r)
-
-        # log errors and store
-        if errors:
-            with open("errors.log", "w") as f:
-                f.write("model,task,error\n")
-                for error in errors:
-                    f.write(error + "\n")
-
-        # Track model completion (TO BE DELETED - was for local run only)
-        if valid_results:
-            completed_models = set()
-            for result in valid_results:
-                if isinstance(result, dict) and "model" in result:
-                    model = result["model"]
-                    if model not in completed_models:
-                        completed_models.add(model)
-                        print(f"Completed: {model}")
 
-        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")
-
-    # this is for local single model runs - for testing and development
-    except Exception as e:
-        print(f"EVALUATION STOPPED - API Error occurred:")
-        print(f"Error type: {type(e).__name__}")
-        print(f"Error message: {str(e)}")
-        return pd.DataFrame()
-
-    # Save results (skipped in test mode as we do not want to overwrite existing results)
-    if valid_results:
-        results_df = pd.DataFrame(valid_results)
-
-        # Aggregate results
-        results_df = (
-            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-
-        if not test_mode:
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Merge with existing results
-            if not old_results.empty:
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-
-            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-            results_df.to_json("results.json", **args)
-
-            # Save model and language info (always save complete metadata, not filtered)
-            original_models_df.to_json("models.json", **args)
-            original_languages_df.to_json("languages.json", **args)
-        else:
-            print("TEST MODE: Skipping results saving")
-
-        elapsed = time.time() - start_time
-        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
-
-        return results_df
-
-    return pd.DataFrame()
 
 
if __name__ == "__main__":
|
|
|
|
 import asyncio
+
 import pandas as pd
+from languages import languages
 from models import models
 from tasks import tasks
+from tqdm.asyncio import tqdm_asyncio

+# ===== config =====

+n_sentences = 10
+
+# ===== run evaluation and aggregate results =====

+async def evaluate():
+    # FIXME we should not need this for-loop, but it helps
+    for n_languages in range(10, 101, 10):
+        print(f"running evaluations for {n_languages} languages")
+        old_results = pd.read_json("results.json")
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for model in models["id"]
+            for lang in languages.iloc[:n_languages].itertuples()
+            for task_name, task in tasks.items()
+            if task_name in models[models["id"] == model]["tasks"].iloc[0]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
+            )
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+            # save up-to-date info on models and languages
+            all_models = pd.concat([pd.DataFrame(models), old_models])
+            all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+            all_models.to_json("models.json", **args)
+            pd.DataFrame(languages).to_json("languages.json", **args)

 if __name__ == "__main__":
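The new main loop filters pending work with a left merge instead of the old set-lookup: rows of the requested grid that have no saved score keep NaN in the merged "metric" column and are kept for evaluation. A minimal, self-contained sketch of that pattern (the two DataFrames below are made-up toy stand-ins, not the project's real results.json):

import pandas as pd

# toy stand-ins for results.json and the requested (model, language, task) grid
old_results = pd.DataFrame(
    [{"model": "m1", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.9}]
)
combis = pd.DataFrame(
    [("m1", "en", "mmlu"), ("m1", "de", "mmlu")],
    columns=["model", "bcp_47", "task"],
)

# left merge: combinations without a saved score keep NaN in the "metric" column
pending = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
pending = pending[pending["metric"].isna()][["model", "bcp_47", "task"]]
print(pending)  # only ("m1", "de", "mmlu") is still pending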
evals/models.py
CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import json
 import re
 from collections import defaultdict

@@ -8,11 +7,7 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
-
-try:
-    from elevenlabs import AsyncElevenLabs
-except Exception:  # ImportError or other env-specific issues
-    AsyncElevenLabs = None
+from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory

@@ -27,17 +22,14 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",
-    "openai/gpt-5-nano", # include if/when available
     "openai/gpt-4.1", # 8$
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
-    "openai/gpt-4o-2024-11-20", # 10$
-    "openai/gpt-
-    "
-    "anthropic/claude-
-    "anthropic/claude-opus-4.1", # 15$ - added for full coverage
+    # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-3.5-turbo-0613", # 2$
+    # "openai/gpt-3.5-turbo", # 1.5$
+    # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
     "mistralai/mistral-saba", # 0.6$
     "mistralai/mistral-nemo", # 0.08$

@@ -56,13 +48,10 @@ important_models = [
     "microsoft/phi-4", # 0.07$
     "microsoft/phi-4-multimodal-instruct", # 0.1$
     "amazon/nova-micro-v1", # 0.09$
-    "moonshotai/kimi-k2", # 0.6$ - added to prevent missing from models.json
-    "x-ai/grok-4"
 ]

 blocklist = [
     "google/gemini-2.5-pro-preview",
-    "google/gemini-2.5-pro",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     "google/gemini-2.5-flash-preview-04-17",

@@ -70,7 +59,6 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
-    "perplexity/sonar-deep-research"
 ]

 transcription_models = [

@@ -97,82 +85,36 @@ def get_model(permaslug):
         and m["endpoint"]
         and not m["endpoint"]["is_free"]
     ]
+    if len(slugs) == 0:
+        # the problem is that free models typically have very high rate-limiting
+        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None


 @cache
 def get_historical_popular_models(date: date):
-    for count_str, model_slug in matches:
-        count = float(count_str)
-        if not model_slug.startswith('openrouter') and model_slug != 'Others':
-            # Remove variant suffixes for aggregation
-            base_model = model_slug.split(':')[0]
-            model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-        # Sort by popularity and return top models
-        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
-        result = []
-        for model_slug, count in sorted_models[:20]:  # Top 20
-            result.append({"slug": model_slug, "count": int(count)})
-
-        return result
-    else:
-        return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings").text
+    data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
+    data = json.loads(data.replace("\\", ""))
+    counts = defaultdict(int)
+    for day in data:
+        for model, count in day["ys"].items():
+            if model.startswith("openrouter") or model == "Others":
+                continue
+            counts[model.split(":")[0]] += count
+    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+    models = [get_model(model) for model, _ in counts]
+    return [m for m in models if m]


 @cache
 def get_current_popular_models(date: date):
-    # Find all count and model_permaslug pairs in the daily data
-    pattern = r'\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\"'
-    matches = re.findall(pattern, raw)
-
-    if matches:
-        # Aggregate model counts
-        model_counts = {}
-        for count_str, model_slug in matches:
-            count = float(count_str)
-            if not model_slug.startswith('openrouter') and model_slug != 'Others':
-                # Remove variant suffixes for aggregation
-                base_model = model_slug.split(':')[0]
-                model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-        # Sort by popularity and return top models
-        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
-        result = []
-        for model_slug, count in sorted_models[:10]:  # Top 10
-            result.append({"slug": model_slug, "count": int(count)})
-
-        return result
-    else:
-        return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
+    data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
+    data = json.loads(data)
+    data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
+    models = [get_model(model["model_permaslug"]) for model in data]
+    return [m for m in models if m]


 def get_translation_models():

@@ -206,52 +148,26 @@ google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)

 @cache
 async def complete(**kwargs) -> str | None:
-    # Add longer timeout for slower, premium, or reasoning-focused models
-    model_id = kwargs.get('model', '')
-    slow_model_keywords = [
-        'claude-3.5', 'claude-3.7', 'claude-4', 'sonnet-4', # Claude
-        'gpt-4', 'o1', 'o3', # OpenAI
-        'gemini-2.5', 'gemini-pro', # Google
-        'llama-4', # Meta
-        'reasoning', 'thinking' # General
-    ]
-    timeout = 120 if any(keyword in model_id for keyword in slow_model_keywords) else 60
-
     async with openrouter_rate_limit:
         try:
-            response = await
-                client.chat.completions.create(**kwargs),
-                timeout=timeout
-            )
+            response = await client.chat.completions.create(**kwargs)
         except BadRequestError as e:
             if "filtered" in e.message:
                 return None
             raise e
-        except asyncio.TimeoutError:
-            return None
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()

-translate_client = None
-
-def get_google_translate_client():
-    global translate_client
-    if translate_client is None:
-        translate_client = translate.Client()
-    return translate_client
-
+translate_client = translate.Client()
+google_supported_languages = [l["language"] for l in translate_client.get_languages()]

-    return [l["language"] for l in client.get_languages()]

 @cache
 async def translate_google(text, source_language, target_language):
-    client = get_google_translate_client()
     async with google_rate_limit:
-        response =
+        response = translate_client.translate(
             text, source_language=source_language, target_language=target_language
         )
     return response["translatedText"]

@@ -315,14 +231,12 @@ def get_hf_metadata(row):
         return empty
     try:
         info = api.model_info(id)
-        license =
-            .title()
-        )
+        license = (
+            (info.card_data.license or "")
+            .replace("-", " ")
+            .replace("mit", "MIT")
+            .title()
+        )
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,

@@ -335,14 +249,8 @@ def get_hf_metadata(row):


 def get_cost(row):
-    """
-
-    """
-    try:
-        cost = float(row["endpoint"]["pricing"]["completion"])
-        return round(cost * 1_000_000, 2)
-    except (TypeError, KeyError):
-        return None
+    cost = float(row["endpoint"]["pricing"]["completion"])
+    return round(cost * 1_000_000, 2)


 @cache

@@ -352,17 +260,8 @@ def load_models(date: date):
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
-
-    # Validate models exist on OpenRouter before including them
-    valid_models = []
-
-    for model_id in all_model_candidates:
-        metadata = get_or_metadata(model_id)
-        if metadata is not None:
-            valid_models.append(model_id)
-
-    models = pd.DataFrame(sorted(valid_models), columns=["id"])
+    models = set(important_models + popular_models) - set(blocklist)
+    models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date

@@ -382,8 +281,7 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    #
-    models = models[models["cost"] <= 15.0].reset_index(drop=True)
+    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
         ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
     ] * len(models)
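The simplified get_cost now assumes the OpenRouter endpoint metadata always carries a completion price; the conversion itself is just the per-token USD price times one million. A small illustrative sketch (the function name and the nested dict are made-up stand-ins for the real endpoint metadata):

def cost_per_million_tokens(row: dict) -> float:
    # OpenRouter reports completion pricing in USD per single token,
    # so multiplying by 1_000_000 gives USD per million completion tokens
    cost = float(row["endpoint"]["pricing"]["completion"])
    return round(cost * 1_000_000, 2)

row = {"endpoint": {"pricing": {"completion": "0.0000006"}}}  # hypothetical example value
print(cost_per_million_tokens(row))  # 0.6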
evals/tasks.py
CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import random
 from functools import partial
 from textwrap import dedent

@@ -11,8 +10,10 @@ from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
+from google.cloud import translate_v2 as translate
+from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe
+from models import complete, transcribe, translate_google

 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")

@@ -26,6 +27,9 @@ target_languages = languages[languages["in_benchmark"]].sample(
     frac=1, weights="speakers", replace=True, random_state=42
 )

+translate_client = translate.Client()
+supported_languages = [l["language"] for l in translate_client.get_languages()]
+

 async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]

@@ -43,20 +47,31 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
+    if model == "google/translate-v2":
+        original_language = closest_supported_match(
+            original_language, supported_languages
+        )
+        target_language = closest_supported_match(target_language, supported_languages)
+        if original_language == target_language:
+            prediction = original_sentence
+        elif original_language is None or target_language is None:
+            prediction = None
+        else:
+            prediction = await translate_google(
+                original_sentence, original_language.bcp_47, target_language.bcp_47
+            )
+    else:
+        prediction = await complete(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
+                }
+            ],
+            temperature=0,
+            max_tokens=1024,
+        )
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],

@@ -69,9 +84,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
-
-
-
     return [
         {
             "model": model,

@@ -79,7 +91,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
-            "origin": "human", # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
         }
         for metric, score in (

@@ -101,36 +112,57 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    response = await complete(
-        model=model,
-        messages=[{"role": "user", "content": prompt}],
-        temperature=0,
-        max_tokens=30,
-    )
-
-    true = test_paragraph.topic.lower().strip()
-    others = [t for t in top_topics if t != true]
-    acc = (
-        int(
-            pred.startswith(true)
-            or (true in pred and not any(o in pred for o in others))
-        )
-        if pred
-        else 0
-    )
-
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=nr)
+    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
+        frac=1, random_state=42
+    )
+    test_paragraph = test_paragraphs.iloc[nr]
+
+    def format_prompt(text):
+        return f"{text}\n\nTopic: {'|'.join(top_topics)}?"
+
+    messages = []
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": format_prompt(example.text)},
+            {"role": "assistant", "content": example.topic},
+        ]
+    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
+    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
+    try:
+        pred = await complete(
+            model=model,
+            messages=[
+                *messages,
+                {
+                    "role": "user",
+                    "content": format_prompt(test_paragraph.text),
+                },
+            ],
+            temperature=0,
+            max_tokens=30,
+        )
+        true = test_paragraph.topic
+        others = [t for t in top_topics if t != true]
+        acc = (
+            int(
+                pred.startswith(true)
+                or (true in pred and not any(o in pred for o in others))
+            )
+            if pred
+            else 0
+        )
+    except Exception as e:
+        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
+            print(f"Max tokens exceeded for {model} in {bcp_47}")
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -138,7 +170,6 @@ Text:
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
-            "origin": "human", # FLORES+ is human-translated
             "sentence_nr": nr,
         }
     ]

@@ -203,41 +234,37 @@ def format_multiple_choice(item):
     C: {item["choices"][2]}
     D: {item["choices"][3]}

+    A|B|C|D?"""


 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    ds_name,
+    ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
         return []
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-    ]
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -245,41 +272,39 @@ Response format: <reasoning> #### <letter>
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin, # Add origin tag to results
             "sentence_nr": nr,
         }
     ]


 async def arc_and_evaluate(model, language_bcp_47, nr):
-    ds_name,
+    ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []

-    messages = [
-    answer = "NO_ANSWER"
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -287,7 +312,6 @@ Response format: <reasoning> #### <letter>
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]

@@ -308,48 +332,40 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
+    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text


 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-    ds_name,
+    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
-
+    task = shuffle_choices_and_labels(task)
+    answer = letters[task["labels"].index(1)]
+    messages = []
+    for example in examples:
+        example = shuffle_choices_and_labels(example)
+        messages += [
+            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+            {"role": "assistant", "content": letters[example["labels"].index(1)]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
     try:
-            {format_multiple_choice_truthfulqa(task)}""",
-            },
-        ]
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024, # Increased for reasoning
-        )
-        if response and "####" in response:
-            pred_answer = response.split("####")[-1].strip()
-            acc = int(pred_answer[:1].upper() == answer)
-        else:
-            acc = 0
-            pred_answer = "NO_ANSWER"
-
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == answer)
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -357,43 +373,34 @@ Response format: <reasoning> #### <letter>
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]


 async def mgsm_and_evaluate(model, language_bcp_47, nr):
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
-
-    Response format: <reasoning> #### <number>
-
-    ---
-
-    {question["question"]}""",
-        },
-    ]
     response = await complete(
         model=model,
-        messages=
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
         temperature=0,
         max_tokens=1024,
     )
-    if response and "####"
+    if response and len(response.split("####")) == 2:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
         accuracy = 0
-        number = "NO_ANSWER"
-
-
     return [
         {

@@ -402,7 +409,6 @@ Response format: <reasoning> #### <number>
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]

@@ -443,8 +449,10 @@ tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
+    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
+    # "asr": transcribe_and_evaluate,
 }
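The reworked mmlu, arc, and truthfulqa tasks share one pattern: few-shot examples as alternating user/assistant turns, a single-token completion, and exact-match scoring on the first returned character. A minimal sketch of that shared shape (build_messages and score_single_letter are illustrative names, not helpers from the repository):

def build_messages(examples, task, fmt):
    # few-shot prefix: each example becomes a user question plus the gold letter
    messages = []
    for example in examples:
        messages += [
            {"role": "user", "content": fmt(example)},
            {"role": "assistant", "content": example["answer"]},
        ]
    # the actual test item comes last, to be answered by the model
    return messages + [{"role": "user", "content": fmt(task)}]

def score_single_letter(response, answer):
    # with max_tokens=1 the reply is at most one letter; exact match on the first character
    return int(bool(response) and response[:1].strip() == answer)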
frontend/src/App.js
CHANGED
@@ -19,14 +19,9 @@ function App () {
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
-  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
   const [dialogVisible, setDialogVisible] = useState(false)
   const [aboutVisible, setAboutVisible] = useState(false)
   const [contributeVisible, setContributeVisible] = useState(false)
-
-  // Add state for carousel items
-  const [carouselItems, setCarouselItems] = useState([])
-  const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])

   useEffect(() => {
     fetch('/api/data', {

@@ -41,7 +36,6 @@ function App () {
       })
       .then(jsonData => {
         setData(jsonData)
-        setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
         setLoading(false)
       })
       .catch(err => {

@@ -50,27 +44,8 @@ function App () {
       })
   }, [selectedLanguages])

-  // Create carousel items when data is loaded
-  useEffect(() => {
-    if (data) {
-      // Add a small delay to ensure components are ready
-      const timer = setTimeout(() => {
-        setCarouselItems([
-          <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
-          <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
-          <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
-          <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
-          <CostPlot key="costplot-4" data={data} width={750} height={500} />
-        ]);
-      }, 100);
-
-      return () => clearTimeout(timer);
-    }
-  }, [data])
-
   const [windowWidth, setWindowWidth] = useState(window.innerWidth)
   const [windowHeight, setWindowHeight] = useState(window.innerHeight)
-
   useEffect(() => {
     const handleResize = () => {
       setWindowWidth(window.innerWidth)

@@ -80,44 +55,6 @@ function App () {
     return () => window.removeEventListener('resize', handleResize)
   }, [])

-  // Create full-screen carousel items when data or window size changes
-  useEffect(() => {
-    if (data) {
-      const timer = setTimeout(() => {
-        setFullScreenCarouselItems([
-          <WorldMap
-            key="fs-worldmap-0"
-            data={data.countries}
-            allLanguages={data.language_table}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <LanguagePlot
-            key="fs-langplot-1"
-            data={data}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <SpeakerPlot
-            key="fs-speakerplot-2"
-            data={data}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <HistoryPlot
-            key="fs-histplot-3"
-            data={data}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
-        ]);
-      }, 100);
-
-      return () => clearTimeout(timer);
-    }
-  }, [data, windowWidth, windowHeight])
-
   return (
     <PrimeReactProvider>
       <div

@@ -132,50 +69,35 @@ function App () {
         style={{
           backgroundColor: '#fff3cd',
           color: '#856404',
-          padding: '
+          padding: '0.75rem 1.25rem',
           marginBottom: '1rem',
           border: '1px solid #ffeeba',
           borderRadius: '0.25rem',
-          textAlign: 'center'
-          lineHeight: '1.5',
-          position: 'relative'
+          textAlign: 'center'
         }}
       >
         <strong>Work in Progress:</strong> This dashboard is currently under
-        active development. Evaluation results are not yet final.
-      </div>
-      <div
-        style={{
-          display: 'flex',
-          justifyContent: 'flex-end',
-          padding: '0 1.5rem',
-          marginBottom: '1rem'
-        }}
-      >
+        active development. Evaluation results are not yet final.
         <a
           href='https://github.com/datenlabor-bmz/ai-language-monitor'
           target='_blank'
           rel='noopener noreferrer'
           style={{
             textDecoration: 'none',
-            color: '#
-            display: 'flex',
-            alignItems: 'center',
-            gap: '0.5rem',
-            transition: 'all 0.2s ease',
-            ':hover': {
-              backgroundColor: '#e9ecef',
-              color: '#495057'
-            }
+            color: '#856404',
+            float: 'right',
+            fontSize: '1.2rem',
+            fontWeight: 'bold',
+            padding: '0 0.5rem',
+            borderRadius: '3px',
+            backgroundColor: 'rgba(255,255,255,0.3)'
           }}
         >
-          <i
+          <i
+            className='pi pi-github'
+            title='View on GitHub'
+            style={{ marginRight: '0.3rem' }}
+          />
           GitHub
         </a>
       </div>

@@ -227,88 +149,39 @@ function App () {
       <div
         style={{
           display: 'flex',
-          gap: '
-          marginBottom: '
+          gap: '1rem',
+          marginBottom: '1.5rem',
           flexWrap: 'wrap',
           justifyContent: 'center'
         }}
       >
-        <
+        <Button
+          label='📚 About this tool'
+          className='p-button-text'
           onClick={() => setAboutVisible(true)}
           style={{
-            fontSize: '0.95rem',
-            fontWeight: '500',
-            cursor: 'pointer',
-            display: 'flex',
-            alignItems: 'center',
-            gap: '0.5rem',
-            boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
-            transition: 'all 0.3s ease',
-            ':hover': {
-              transform: 'translateY(-2px)',
-              boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
-            }
-          }}
-          onMouseEnter={(e) => {
-            e.target.style.transform = 'translateY(-2px)';
-            e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
+            color: '#666',
+            border: '1px solid #ddd',
+            padding: '0.5rem 1rem',
+            borderRadius: '4px',
+            fontSize: '0.9rem'
           }}
-            e.target.style.transform = 'translateY(0)';
-            e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
-          }}
-        >
-          <span style={{ fontSize: '1.1rem' }}>📚</span>
-          About this tool
-        </button>
+        />

-        <
+        <Button
+          label='🚀 Add your model (soon)'
+          className='p-button-text'
           onClick={() => setContributeVisible(true)}
+          tooltip='This feature is on our roadmap and will be available soon.'
+          tooltipOptions={{ position: 'bottom' }}
           style={{
-            fontSize: '0.95rem',
-            fontWeight: '500',
-            cursor: 'pointer',
-            display: 'flex',
-            alignItems: 'center',
-            gap: '0.5rem',
-            boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
-            transition: 'all 0.3s ease',
-            position: 'relative',
-            overflow: 'hidden'
+            color: '#666',
+            border: '1px solid #ddd',
+            padding: '0.5rem 1rem',
+            borderRadius: '4px',
+            fontSize: '0.9rem'
          }}
-            e.target.style.transform = 'translateY(-2px)';
-            e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
-          }}
-          onMouseLeave={(e) => {
-            e.target.style.transform = 'translateY(0)';
-            e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
-          }}
-        >
-          <span style={{ fontSize: '1.1rem' }}>🚀</span>
-          Add your model
-          <span style={{
-            fontSize: '0.75rem',
-            backgroundColor: 'rgba(107, 70, 193, 0.15)',
-            padding: '0.2rem 0.5rem',
-            borderRadius: '6px',
-            marginLeft: '0.5rem',
-            fontWeight: '600'
-          }}>
-            soon
-          </span>
-        </button>
+        />
       </div>

       {data && (

@@ -347,7 +220,6 @@ function App () {
             data={data.model_table}
             selectedLanguages={selectedLanguages}
             allLanguages={data.language_table || []}
-            machineTranslatedMetrics={machineTranslatedMetrics}
           />
           <LanguageTable
             data={data.language_table}

@@ -376,18 +248,20 @@ function App () {
               color: '#666'
             }}
           />
+          <Carousel
+            value={[
+              <WorldMap data={data.countries} />,
+              <LanguagePlot data={data} />,
+              <SpeakerPlot data={data} />,
+              <HistoryPlot data={data} />,
+              <CostPlot data={data} />
+            ]}
+            numScroll={1}
+            numVisible={1}
+            itemTemplate={item => item}
+            circular
+            style={{ width: '100%', minHeight: '650px' }}
+          />
         </div>
       </>
     )}

@@ -535,16 +409,36 @@ function App () {
         modal
         header={null}
       >
-        {
+        {data && (
         <div style={{ width: '100%', height: '100%' }}>
           <Carousel
-
+            value={[
+              <WorldMap
+                data={data.countries}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <LanguagePlot
+                data={data}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <SpeakerPlot
+                data={data}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <HistoryPlot
+                data={data}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <CostPlot data={data} />
+            ]}
             numScroll={1}
             numVisible={1}
             itemTemplate={item => item}
-            circular
-            activeIndex={0}
+            circular
             style={{ width: '100%', height: 'calc(90vh - 120px)' }}
           />
         </div>

@@ -555,4 +449,4 @@ function App () {
   )
 }

-export default App
+export default App
frontend/src/components/HistoryPlot.js
CHANGED
|
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 50 |
...models.filter(d => d.newRecord),
|
| 51 |
{
|
| 52 |
creation_date: new Date(),
|
| 53 |
-
maxAverage: models[models.length - 1]
|
| 54 |
}
|
| 55 |
],
|
| 56 |
{
|
| 57 |
x: d => d.creation_date,
|
| 58 |
-
y: d => d.maxAverage
|
| 59 |
curve: 'step-after',
|
| 60 |
strokeOpacity: 0.3
|
| 61 |
}
|
|
|
|
| 50 |
...models.filter(d => d.newRecord),
|
| 51 |
{
|
| 52 |
creation_date: new Date(),
|
| 53 |
+
maxAverage: models[models.length - 1].maxAverage
|
| 54 |
}
|
| 55 |
],
|
| 56 |
{
|
| 57 |
x: d => d.creation_date,
|
| 58 |
+
y: d => d.maxAverage,
|
| 59 |
curve: 'step-after',
|
| 60 |
strokeOpacity: 0.3
|
| 61 |
}
|
frontend/src/components/LanguageTable.js
CHANGED
|
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
|
|
| 172 |
filterElement={familyRowFilterTemplate}
|
| 173 |
style={{ minWidth: '10rem' }}
|
| 174 |
/>
|
| 175 |
-
{ScoreColumns
|
| 176 |
</DataTable>
|
| 177 |
)
|
| 178 |
}
|
|
|
|
| 172 |
filterElement={familyRowFilterTemplate}
|
| 173 |
style={{ minWidth: '10rem' }}
|
| 174 |
/>
|
| 175 |
+
{ScoreColumns}
|
| 176 |
</DataTable>
|
| 177 |
)
|
| 178 |
}
|
frontend/src/components/ModelTable.js
CHANGED
|
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
|
|
| 6 |
import Medal from './Medal'
|
| 7 |
import { Slider } from 'primereact/slider'
|
| 8 |
import ScoreColumns from './ScoreColumns'
|
| 9 |
-
const ModelTable = ({ data, selectedLanguages = [], allLanguages = []
|
| 10 |
const [filters, setFilters] = useState({
|
| 11 |
type: { value: null, matchMode: FilterMatchMode.IN },
|
| 12 |
size: { value: null, matchMode: FilterMatchMode.BETWEEN },
|
|
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 50 |
}
|
| 51 |
|
| 52 |
const SliderWithLabel = ({ value, onChange, min, max }) => {
|
| 53 |
-
const p = 10
|
| 54 |
-
const start = value === null
|
| 55 |
-
const stop = value === null
|
| 56 |
-
const [_value, _setValue] = useState([start, stop])
|
| 57 |
useEffect(() => {
|
| 58 |
const timer = setTimeout(() => {
|
| 59 |
onChange({
|
|
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 61 |
// set to "no filter" when (almost) the whole range is selected
|
| 62 |
_value[0] <= min + 0.1 && _value[1] >= max - 0.1
|
| 63 |
? null
|
| 64 |
-
: [p ** _value[0], p ** _value[1]]
|
| 65 |
-
})
|
| 66 |
-
}, 1000)
|
| 67 |
-
return () => clearTimeout(timer)
|
| 68 |
-
}, [_value, onChange, min, max])
|
| 69 |
return (
|
| 70 |
<div style={{ minWidth: '20rem' }}>
|
| 71 |
<div>{formatSize(p ** _value[0])}</div>
|
|
@@ -147,35 +147,21 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 147 |
}
|
| 148 |
|
| 149 |
const costBodyTemplate = rowData => {
|
| 150 |
-
return (
|
| 151 |
-
<div style={{ textAlign: 'center' }}>
|
| 152 |
-
{rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
|
| 153 |
-
</div>
|
| 154 |
-
)
|
| 155 |
}
|
| 156 |
|
| 157 |
const getHeaderText = () => {
|
| 158 |
-
// Count languages that have
|
| 159 |
-
const evaluatedLanguagesCount = allLanguages.filter(lang =>
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
'translation_from_bleu',
|
| 163 |
-
'translation_to_bleu',
|
| 164 |
-
'classification_accuracy',
|
| 165 |
-
'mmlu_accuracy',
|
| 166 |
-
'arc_accuracy',
|
| 167 |
-
'truthfulqa_accuracy',
|
| 168 |
-
'mgsm_accuracy'
|
| 169 |
-
].some(metric => lang[metric] !== null && lang[metric] !== undefined)
|
| 170 |
-
return hasAnyScores
|
| 171 |
-
}).length
|
| 172 |
|
| 173 |
if (selectedLanguages.length === 0) {
|
| 174 |
return (
|
| 175 |
<span>
|
| 176 |
<span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
|
| 177 |
<span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
|
| 178 |
-
|
| 179 |
</span>
|
| 180 |
</span>
|
| 181 |
)
|
|
@@ -259,7 +245,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 259 |
body={costBodyTemplate}
|
| 260 |
style={{ minWidth: '5rem' }}
|
| 261 |
/>
|
| 262 |
-
{ScoreColumns
|
| 263 |
</DataTable>
|
| 264 |
)
|
| 265 |
}
|
|
|
|
| 6 |
import Medal from './Medal'
|
| 7 |
import { Slider } from 'primereact/slider'
|
| 8 |
import ScoreColumns from './ScoreColumns'
|
| 9 |
+
const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
|
| 10 |
const [filters, setFilters] = useState({
|
| 11 |
type: { value: null, matchMode: FilterMatchMode.IN },
|
| 12 |
size: { value: null, matchMode: FilterMatchMode.BETWEEN },
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
const SliderWithLabel = ({ value, onChange, min, max }) => {
|
| 53 |
+
const p = 10
|
| 54 |
+
const start = value === null ? min : Math.log(value[0]) / Math.log(p)
|
| 55 |
+
const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
|
| 56 |
+
const [_value, _setValue] = useState([start, stop])
|
| 57 |
useEffect(() => {
|
| 58 |
const timer = setTimeout(() => {
|
| 59 |
onChange({
|
|
|
|
| 61 |
// set to "no filter" when (almost) the whole range is selected
|
| 62 |
_value[0] <= min + 0.1 && _value[1] >= max - 0.1
|
| 63 |
? null
|
| 64 |
+
: [p ** _value[0], p ** _value[1]]
|
| 65 |
+
})
|
| 66 |
+
}, 1000)
|
| 67 |
+
return () => clearTimeout(timer)
|
| 68 |
+
}, [_value, onChange, min, max])
|
| 69 |
return (
|
| 70 |
<div style={{ minWidth: '20rem' }}>
|
| 71 |
<div>{formatSize(p ** _value[0])}</div>
|
|
|
|
| 147 |
}
|
| 148 |
|
| 149 |
const costBodyTemplate = rowData => {
|
| 150 |
+
return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
|
|
|
|
|
|
| 151 |
}
|
| 152 |
|
| 153 |
const getHeaderText = () => {
|
| 154 |
+
// Count languages that have evaluation data (average score available)
|
| 155 |
+
const evaluatedLanguagesCount = allLanguages.filter(lang =>
|
| 156 |
+
lang.average !== null && lang.average !== undefined
|
| 157 |
+
).length
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
if (selectedLanguages.length === 0) {
|
| 160 |
return (
|
| 161 |
<span>
|
| 162 |
<span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
|
| 163 |
<span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
|
| 164 |
+
Average performance across {evaluatedLanguagesCount} evaluated languages
|
| 165 |
</span>
|
| 166 |
</span>
|
| 167 |
)
|
|
|
|
| 245 |
body={costBodyTemplate}
|
| 246 |
style={{ minWidth: '5rem' }}
|
| 247 |
/>
|
| 248 |
+
{ScoreColumns}
|
| 249 |
</DataTable>
|
| 250 |
)
|
| 251 |
}
|
frontend/src/components/ScoreColumns.js
CHANGED
|
@@ -2,28 +2,21 @@ import { Column } from 'primereact/column'
|
|
| 2 |
import ScoreField from './ScoreField'
|
| 3 |
|
| 4 |
const scoreBodyTemplate = (field, options = {}) => {
|
| 5 |
-
const { minScore = 0, maxScore = 1
|
| 6 |
|
| 7 |
return rowData => {
|
| 8 |
const score = rowData[field]
|
| 9 |
-
|
| 10 |
-
// otherwise fall back to global list
|
| 11 |
-
const rowFlagKey = `${field}_is_machine`
|
| 12 |
-
const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
|
| 13 |
-
const isMachineTranslated = hasRowFlag
|
| 14 |
-
? !!rowData[rowFlagKey]
|
| 15 |
-
: machineTranslatedMetrics.includes(field)
|
| 16 |
-
return ScoreField(score, minScore, maxScore, isMachineTranslated)
|
| 17 |
}
|
| 18 |
}
|
| 19 |
|
| 20 |
-
const ScoreColumns =
|
| 21 |
<Column
|
| 22 |
field='average'
|
| 23 |
header='Proficiency'
|
| 24 |
headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
|
| 25 |
sortable
|
| 26 |
-
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5
|
| 27 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 28 |
/>,
|
| 29 |
<Column
|
|
@@ -33,8 +26,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 33 |
sortable
|
| 34 |
body={scoreBodyTemplate('translation_from_bleu', {
|
| 35 |
minScore: 0,
|
| 36 |
-
maxScore: 0.5
|
| 37 |
-
machineTranslatedMetrics
|
| 38 |
})}
|
| 39 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 40 |
/>,
|
|
@@ -45,8 +37,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 45 |
sortable
|
| 46 |
body={scoreBodyTemplate('translation_to_bleu', {
|
| 47 |
minScore: 0,
|
| 48 |
-
maxScore: 0.5
|
| 49 |
-
machineTranslatedMetrics
|
| 50 |
})}
|
| 51 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 52 |
/>,
|
|
@@ -57,8 +48,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 57 |
sortable
|
| 58 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 59 |
minScore: 0,
|
| 60 |
-
maxScore: 0.5
|
| 61 |
-
machineTranslatedMetrics
|
| 62 |
})}
|
| 63 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 64 |
/>,
|
|
@@ -79,8 +69,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 79 |
sortable
|
| 80 |
body={scoreBodyTemplate('mmlu_accuracy', {
|
| 81 |
minScore: 0,
|
| 82 |
-
maxScore: 1
|
| 83 |
-
machineTranslatedMetrics
|
| 84 |
})}
|
| 85 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 86 |
/>,
|
|
@@ -91,8 +80,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 91 |
sortable
|
| 92 |
body={scoreBodyTemplate('arc_accuracy', {
|
| 93 |
minScore: 0,
|
| 94 |
-
maxScore: 1
|
| 95 |
-
machineTranslatedMetrics
|
| 96 |
})}
|
| 97 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 98 |
/>,
|
|
@@ -103,8 +91,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 103 |
sortable
|
| 104 |
body={scoreBodyTemplate('mgsm_accuracy', {
|
| 105 |
minScore: 0,
|
| 106 |
-
maxScore: 1
|
| 107 |
-
machineTranslatedMetrics
|
| 108 |
})}
|
| 109 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 110 |
/>,
|
|
|
|
| 2 |
import ScoreField from './ScoreField'
|
| 3 |
|
| 4 |
const scoreBodyTemplate = (field, options = {}) => {
|
| 5 |
+
const { minScore = 0, maxScore = 1 } = options
|
| 6 |
|
| 7 |
return rowData => {
|
| 8 |
const score = rowData[field]
|
| 9 |
+
return ScoreField(score, minScore, maxScore)
|
|
|
|
|
|
| 10 |
}
|
| 11 |
}
|
| 12 |
|
| 13 |
+
const ScoreColumns = [
|
| 14 |
<Column
|
| 15 |
field='average'
|
| 16 |
header='Proficiency'
|
| 17 |
headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
|
| 18 |
sortable
|
| 19 |
+
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
| 20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 21 |
/>,
|
| 22 |
<Column
|
|
|
|
| 26 |
sortable
|
| 27 |
body={scoreBodyTemplate('translation_from_bleu', {
|
| 28 |
minScore: 0,
|
| 29 |
+
maxScore: 0.5
|
|
|
|
| 30 |
})}
|
| 31 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 32 |
/>,
|
|
|
|
| 37 |
sortable
|
| 38 |
body={scoreBodyTemplate('translation_to_bleu', {
|
| 39 |
minScore: 0,
|
| 40 |
+
maxScore: 0.5
|
|
|
|
| 41 |
})}
|
| 42 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 43 |
/>,
|
|
|
|
| 48 |
sortable
|
| 49 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 50 |
minScore: 0,
|
| 51 |
+
maxScore: 0.5
|
|
|
|
| 52 |
})}
|
| 53 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 54 |
/>,
|
|
|
|
| 69 |
sortable
|
| 70 |
body={scoreBodyTemplate('mmlu_accuracy', {
|
| 71 |
minScore: 0,
|
| 72 |
+
maxScore: 1
|
|
|
|
| 73 |
})}
|
| 74 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 75 |
/>,
|
|
|
|
| 80 |
sortable
|
| 81 |
body={scoreBodyTemplate('arc_accuracy', {
|
| 82 |
minScore: 0,
|
| 83 |
+
maxScore: 1
|
|
|
|
| 84 |
})}
|
| 85 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 86 |
/>,
|
|
|
|
| 91 |
sortable
|
| 92 |
body={scoreBodyTemplate('mgsm_accuracy', {
|
| 93 |
minScore: 0,
|
| 94 |
+
maxScore: 1
|
|
|
|
| 95 |
})}
|
| 96 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 97 |
/>,
|
frontend/src/components/ScoreField.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
const ScoreField = (score, minScore, maxScore
|
| 2 |
let percentage = 100
|
| 3 |
let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
|
| 4 |
if (score !== null) {
|
|
@@ -50,7 +50,6 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
|
|
| 50 |
}}
|
| 51 |
>
|
| 52 |
{score !== null ? (score * 100).toFixed(1)+"%" : '–'}
|
| 53 |
-
{isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
|
| 54 |
</span>
|
| 55 |
</div>
|
| 56 |
)
|
|
|
|
| 1 |
+
const ScoreField = (score, minScore, maxScore) => {
|
| 2 |
let percentage = 100
|
| 3 |
let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
|
| 4 |
if (score !== null) {
|
|
|
|
| 50 |
}}
|
| 51 |
>
|
| 52 |
{score !== null ? (score * 100).toFixed(1)+"%" : '–'}
|
|
|
|
| 53 |
</span>
|
| 54 |
</div>
|
| 55 |
)
|
frontend/src/components/SpeakerPlot.js
CHANGED
|
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 73 |
textStrokeOpacity: 0,
|
| 74 |
textFillOpacity: 0
|
| 75 |
}),
|
| 76 |
-
|
| 77 |
x: 40,
|
| 78 |
y: languages[39].cumSpeakers / 1e6
|
| 79 |
-
})
|
| 80 |
]
|
| 81 |
})
|
| 82 |
containerRef.current.append(plot)
|
|
|
|
| 73 |
textStrokeOpacity: 0,
|
| 74 |
textFillOpacity: 0
|
| 75 |
}),
|
| 76 |
+
Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
|
| 77 |
x: 40,
|
| 78 |
y: languages[39].cumSpeakers / 1e6
|
| 79 |
+
})
|
| 80 |
]
|
| 81 |
})
|
| 82 |
containerRef.current.append(plot)
|
frontend/src/components/WorldMap.js
CHANGED
|
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
|
|
| 26 |
a =>
|
| 27 |
`${smoothProgressBar(a.population / pop)} ${
|
| 28 |
a.name
|
| 29 |
-
} – ${a.score
|
| 30 |
)
|
| 31 |
.join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
|
| 32 |
-
return `${d.properties.ADMIN} – ${cData?.score
|
| 33 |
}
|
| 34 |
|
| 35 |
-
const WorldMap = ({ data, width = 750, height = 500
|
| 36 |
const containerRef = useRef()
|
| 37 |
const [mapData, setMapData] = useState()
|
| 38 |
|
|
@@ -48,22 +48,8 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
|
|
| 48 |
acc[country.iso2] = country
|
| 49 |
return acc
|
| 50 |
}, {})
|
| 51 |
-
// Count languages that have any evaluation data
|
| 52 |
-
const evaluatedLanguagesCount = allLanguages.filter(lang => {
|
| 53 |
-
const hasAnyScores = [
|
| 54 |
-
'translation_from_bleu',
|
| 55 |
-
'translation_to_bleu',
|
| 56 |
-
'classification_accuracy',
|
| 57 |
-
'mmlu_accuracy',
|
| 58 |
-
'arc_accuracy',
|
| 59 |
-
'truthfulqa_accuracy',
|
| 60 |
-
'mgsm_accuracy'
|
| 61 |
-
].some(metric => lang[metric] !== null && lang[metric] !== undefined)
|
| 62 |
-
return hasAnyScores
|
| 63 |
-
}).length
|
| 64 |
-
|
| 65 |
const plot = Plot.plot({
|
| 66 |
-
subtitle:
|
| 67 |
width: width,
|
| 68 |
height: height,
|
| 69 |
projection: 'equal-earth',
|
|
@@ -75,12 +61,11 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
|
|
| 75 |
})
|
| 76 |
],
|
| 77 |
color: {
|
| 78 |
-
scheme: '
|
| 79 |
-
unknown: '
|
| 80 |
label: 'Score',
|
| 81 |
legend: true,
|
| 82 |
-
domain: [0, 1]
|
| 83 |
-
pivot: 0.5
|
| 84 |
},
|
| 85 |
style: {
|
| 86 |
fontFamily: 'monospace'
|
|
|
|
| 26 |
a =>
|
| 27 |
`${smoothProgressBar(a.population / pop)} ${
|
| 28 |
a.name
|
| 29 |
+
} – ${a.score.toFixed(2)}`
|
| 30 |
)
|
| 31 |
.join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
|
| 32 |
+
return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
|
| 33 |
}
|
| 34 |
|
| 35 |
+
const WorldMap = ({ data, width = 750, height = 500 }) => {
|
| 36 |
const containerRef = useRef()
|
| 37 |
const [mapData, setMapData] = useState()
|
| 38 |
|
|
|
|
| 48 |
acc[country.iso2] = country
|
| 49 |
return acc
|
| 50 |
}, {})
|
|
|
|
|
|
|
|
| 51 |
const plot = Plot.plot({
|
| 52 |
+
subtitle: 'Language Proficiency Score by Country',
|
| 53 |
width: width,
|
| 54 |
height: height,
|
| 55 |
projection: 'equal-earth',
|
|
|
|
| 61 |
})
|
| 62 |
],
|
| 63 |
color: {
|
| 64 |
+
scheme: 'Greens',
|
| 65 |
+
unknown: 'gray',
|
| 66 |
label: 'Score',
|
| 67 |
legend: true,
|
| 68 |
+
domain: [0, 1]
|
|
|
|
| 69 |
},
|
| 70 |
style: {
|
| 71 |
fontFamily: 'monospace'
|
languages.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
-
"commonvoice_hours":
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
@@ -32,7 +32,7 @@
|
|
| 32 |
"flores_path":"hin_Deva",
|
| 33 |
"fleurs_tag":"hi_in",
|
| 34 |
"commonvoice_hours":16.0,
|
| 35 |
-
"commonvoice_locale":"hi",
|
| 36 |
"in_benchmark":true
|
| 37 |
},
|
| 38 |
{
|
|
@@ -43,7 +43,7 @@
|
|
| 43 |
"family":"Indo-European",
|
| 44 |
"flores_path":"spa_Latn",
|
| 45 |
"fleurs_tag":"es_419",
|
| 46 |
-
"commonvoice_hours":
|
| 47 |
"commonvoice_locale":"es",
|
| 48 |
"in_benchmark":true
|
| 49 |
},
|
|
@@ -79,7 +79,7 @@
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
-
"commonvoice_hours":
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
@@ -103,7 +103,7 @@
|
|
| 103 |
"family":"Indo-European",
|
| 104 |
"flores_path":"por_Latn",
|
| 105 |
"fleurs_tag":"pt_br",
|
| 106 |
-
"commonvoice_hours":
|
| 107 |
"commonvoice_locale":"pt",
|
| 108 |
"in_benchmark":true
|
| 109 |
},
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"family":"Indo-European",
|
| 116 |
"flores_path":"pan_Guru",
|
| 117 |
"fleurs_tag":"pa_in",
|
| 118 |
-
"commonvoice_hours":2.
|
| 119 |
"commonvoice_locale":"pa-IN",
|
| 120 |
"in_benchmark":true
|
| 121 |
},
|
|
@@ -127,7 +127,7 @@
|
|
| 127 |
"family":"Indo-European",
|
| 128 |
"flores_path":"rus_Cyrl",
|
| 129 |
"fleurs_tag":"ru_ru",
|
| 130 |
-
"commonvoice_hours":
|
| 131 |
"commonvoice_locale":"ru",
|
| 132 |
"in_benchmark":true
|
| 133 |
},
|
|
@@ -139,7 +139,7 @@
|
|
| 139 |
"family":"Atlantic-Congo",
|
| 140 |
"flores_path":"swh_Latn",
|
| 141 |
"fleurs_tag":"sw_ke",
|
| 142 |
-
"commonvoice_hours":
|
| 143 |
"commonvoice_locale":"sw",
|
| 144 |
"in_benchmark":true
|
| 145 |
},
|
|
@@ -151,7 +151,7 @@
|
|
| 151 |
"family":"Austronesian",
|
| 152 |
"flores_path":"ind_Latn",
|
| 153 |
"fleurs_tag":"id_id",
|
| 154 |
-
"commonvoice_hours":
|
| 155 |
"commonvoice_locale":"id",
|
| 156 |
"in_benchmark":true
|
| 157 |
},
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
-
"commonvoice_hours":
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169 |
},
|
|
@@ -379,7 +379,7 @@
|
|
| 379 |
"family":"Indo-European",
|
| 380 |
"flores_path":null,
|
| 381 |
"fleurs_tag":"ps_af",
|
| 382 |
-
"commonvoice_hours":
|
| 383 |
"commonvoice_locale":"ps",
|
| 384 |
"in_benchmark":false
|
| 385 |
},
|
|
@@ -439,7 +439,7 @@
|
|
| 439 |
"family":"Indo-European",
|
| 440 |
"flores_path":"pol_Latn",
|
| 441 |
"fleurs_tag":"pl_pl",
|
| 442 |
-
"commonvoice_hours":
|
| 443 |
"commonvoice_locale":"pl",
|
| 444 |
"in_benchmark":true
|
| 445 |
},
|
|
@@ -619,7 +619,7 @@
|
|
| 619 |
"family":"Indo-European",
|
| 620 |
"flores_path":"nld_Latn",
|
| 621 |
"fleurs_tag":"nl_nl",
|
| 622 |
-
"commonvoice_hours":
|
| 623 |
"commonvoice_locale":"nl",
|
| 624 |
"in_benchmark":true
|
| 625 |
},
|
|
@@ -655,7 +655,7 @@
|
|
| 655 |
"family":"Atlantic-Congo",
|
| 656 |
"flores_path":"yor_Latn",
|
| 657 |
"fleurs_tag":"yo_ng",
|
| 658 |
-
"commonvoice_hours":6.
|
| 659 |
"commonvoice_locale":"yo",
|
| 660 |
"in_benchmark":true
|
| 661 |
},
|
|
@@ -979,7 +979,7 @@
|
|
| 979 |
"family":"Turkic",
|
| 980 |
"flores_path":"kaz_Cyrl",
|
| 981 |
"fleurs_tag":"kk_kz",
|
| 982 |
-
"commonvoice_hours":2.
|
| 983 |
"commonvoice_locale":"kk",
|
| 984 |
"in_benchmark":true
|
| 985 |
},
|
|
@@ -1027,7 +1027,7 @@
|
|
| 1027 |
"family":"Uralic",
|
| 1028 |
"flores_path":"hun_Latn",
|
| 1029 |
"fleurs_tag":"hu_hu",
|
| 1030 |
-
"commonvoice_hours":
|
| 1031 |
"commonvoice_locale":"hu",
|
| 1032 |
"in_benchmark":true
|
| 1033 |
},
|
|
@@ -1099,7 +1099,7 @@
|
|
| 1099 |
"family":"Indo-European",
|
| 1100 |
"flores_path":"ckb_Arab",
|
| 1101 |
"fleurs_tag":"ckb_iq",
|
| 1102 |
-
"commonvoice_hours":
|
| 1103 |
"commonvoice_locale":"ckb",
|
| 1104 |
"in_benchmark":true
|
| 1105 |
},
|
|
@@ -1183,7 +1183,7 @@
|
|
| 1183 |
"family":"Indo-European",
|
| 1184 |
"flores_path":"bel_Cyrl",
|
| 1185 |
"fleurs_tag":"be_by",
|
| 1186 |
-
"commonvoice_hours":
|
| 1187 |
"commonvoice_locale":"be",
|
| 1188 |
"in_benchmark":true
|
| 1189 |
},
|
|
@@ -1207,7 +1207,7 @@
|
|
| 1207 |
"family":"Indo-European",
|
| 1208 |
"flores_path":"tgk_Cyrl",
|
| 1209 |
"fleurs_tag":"tg_tj",
|
| 1210 |
-
"commonvoice_hours":0.
|
| 1211 |
"commonvoice_locale":"tg",
|
| 1212 |
"in_benchmark":true
|
| 1213 |
},
|
|
@@ -1243,7 +1243,7 @@
|
|
| 1243 |
"family":"Indo-European",
|
| 1244 |
"flores_path":"afr_Latn",
|
| 1245 |
"fleurs_tag":"af_za",
|
| 1246 |
-
"commonvoice_hours":0.
|
| 1247 |
"commonvoice_locale":"af",
|
| 1248 |
"in_benchmark":true
|
| 1249 |
},
|
|
@@ -1291,7 +1291,7 @@
|
|
| 1291 |
"family":"Indo-European",
|
| 1292 |
"flores_path":"cat_Latn",
|
| 1293 |
"fleurs_tag":"ca_es",
|
| 1294 |
-
"commonvoice_hours":
|
| 1295 |
"commonvoice_locale":"ca",
|
| 1296 |
"in_benchmark":true
|
| 1297 |
},
|
|
@@ -1303,7 +1303,7 @@
|
|
| 1303 |
"family":"Afro-Asiatic",
|
| 1304 |
"flores_path":"heb_Hebr",
|
| 1305 |
"fleurs_tag":"he_il",
|
| 1306 |
-
"commonvoice_hours":
|
| 1307 |
"commonvoice_locale":"he",
|
| 1308 |
"in_benchmark":true
|
| 1309 |
},
|
|
@@ -1375,7 +1375,7 @@
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
-
"commonvoice_hours":
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
@@ -1519,7 +1519,7 @@
|
|
| 1519 |
"family":"Indo-European",
|
| 1520 |
"flores_path":"kmr_Latn",
|
| 1521 |
"fleurs_tag":null,
|
| 1522 |
-
"commonvoice_hours":
|
| 1523 |
"commonvoice_locale":"kmr",
|
| 1524 |
"in_benchmark":true
|
| 1525 |
},
|
|
@@ -1555,7 +1555,7 @@
|
|
| 1555 |
"family":"Indo-European",
|
| 1556 |
"flores_path":"slk_Latn",
|
| 1557 |
"fleurs_tag":"sk_sk",
|
| 1558 |
-
"commonvoice_hours":
|
| 1559 |
"commonvoice_locale":"sk",
|
| 1560 |
"in_benchmark":true
|
| 1561 |
},
|
|
@@ -1675,7 +1675,7 @@
|
|
| 1675 |
"family":"Tupian",
|
| 1676 |
"flores_path":"gug_Latn",
|
| 1677 |
"fleurs_tag":null,
|
| 1678 |
-
"commonvoice_hours":4.
|
| 1679 |
"commonvoice_locale":"gn",
|
| 1680 |
"in_benchmark":true
|
| 1681 |
},
|
|
@@ -1747,7 +1747,7 @@
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
-
"commonvoice_hours":
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
@@ -2155,7 +2155,7 @@
|
|
| 2155 |
"family":"Kartvelian",
|
| 2156 |
"flores_path":"kat_Geor",
|
| 2157 |
"fleurs_tag":"ka_ge",
|
| 2158 |
-
"commonvoice_hours":
|
| 2159 |
"commonvoice_locale":"ka",
|
| 2160 |
"in_benchmark":true
|
| 2161 |
},
|
|
@@ -2167,7 +2167,7 @@
|
|
| 2167 |
"family":"Indo-European",
|
| 2168 |
"flores_path":"glg_Latn",
|
| 2169 |
"fleurs_tag":"gl_es",
|
| 2170 |
-
"commonvoice_hours":
|
| 2171 |
"commonvoice_locale":"gl",
|
| 2172 |
"in_benchmark":true
|
| 2173 |
},
|
|
@@ -2323,7 +2323,7 @@
|
|
| 2323 |
"family":"Dravidian",
|
| 2324 |
"flores_path":null,
|
| 2325 |
"fleurs_tag":null,
|
| 2326 |
-
"commonvoice_hours":
|
| 2327 |
"commonvoice_locale":"brh",
|
| 2328 |
"in_benchmark":false
|
| 2329 |
},
|
|
@@ -2623,7 +2623,7 @@
|
|
| 2623 |
"family":"Indo-European",
|
| 2624 |
"flores_path":null,
|
| 2625 |
"fleurs_tag":null,
|
| 2626 |
-
"commonvoice_hours":
|
| 2627 |
"commonvoice_locale":"haz",
|
| 2628 |
"in_benchmark":false
|
| 2629 |
},
|
|
@@ -2695,7 +2695,7 @@
|
|
| 2695 |
"family":"Indo-European",
|
| 2696 |
"flores_path":"oci_Latn",
|
| 2697 |
"fleurs_tag":"oc_fr",
|
| 2698 |
-
"commonvoice_hours":1.
|
| 2699 |
"commonvoice_locale":"oc",
|
| 2700 |
"in_benchmark":true
|
| 2701 |
},
|
|
@@ -3175,8 +3175,8 @@
|
|
| 3175 |
"family":"Atlantic-Congo",
|
| 3176 |
"flores_path":null,
|
| 3177 |
"fleurs_tag":null,
|
| 3178 |
-
"commonvoice_hours":
|
| 3179 |
-
"commonvoice_locale":
|
| 3180 |
"in_benchmark":false
|
| 3181 |
},
|
| 3182 |
{
|
|
@@ -3319,8 +3319,8 @@
|
|
| 3319 |
"family":"Indo-European",
|
| 3320 |
"flores_path":null,
|
| 3321 |
"fleurs_tag":null,
|
| 3322 |
-
"commonvoice_hours":
|
| 3323 |
-
"commonvoice_locale":
|
| 3324 |
"in_benchmark":false
|
| 3325 |
},
|
| 3326 |
{
|
|
@@ -3331,7 +3331,7 @@
|
|
| 3331 |
"family":"Indo-European",
|
| 3332 |
"flores_path":"gle_Latn",
|
| 3333 |
"fleurs_tag":"ga_ie",
|
| 3334 |
-
"commonvoice_hours":
|
| 3335 |
"commonvoice_locale":"ga-IE",
|
| 3336 |
"in_benchmark":true
|
| 3337 |
},
|
|
@@ -3487,7 +3487,7 @@
|
|
| 3487 |
"family":"Indo-European",
|
| 3488 |
"flores_path":"lvs_Latn",
|
| 3489 |
"fleurs_tag":"lv_lv",
|
| 3490 |
-
"commonvoice_hours":
|
| 3491 |
"commonvoice_locale":"lv",
|
| 3492 |
"in_benchmark":true
|
| 3493 |
},
|
|
@@ -3535,7 +3535,7 @@
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
-
"commonvoice_hours":
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
@@ -3559,7 +3559,7 @@
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
-
"commonvoice_hours":
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
@@ -3679,7 +3679,7 @@
|
|
| 3679 |
"family":"Indo-European",
|
| 3680 |
"flores_path":"ydd_Hebr",
|
| 3681 |
"fleurs_tag":null,
|
| 3682 |
-
"commonvoice_hours":
|
| 3683 |
"commonvoice_locale":"yi",
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
|
@@ -3991,8 +3991,8 @@
|
|
| 3991 |
"family":"Atlantic-Congo",
|
| 3992 |
"flores_path":null,
|
| 3993 |
"fleurs_tag":null,
|
| 3994 |
-
"commonvoice_hours":
|
| 3995 |
-
"commonvoice_locale":
|
| 3996 |
"in_benchmark":false
|
| 3997 |
},
|
| 3998 |
{
|
|
@@ -4099,8 +4099,8 @@
|
|
| 4099 |
"family":"Indo-European",
|
| 4100 |
"flores_path":null,
|
| 4101 |
"fleurs_tag":null,
|
| 4102 |
-
"commonvoice_hours":
|
| 4103 |
-
"commonvoice_locale":
|
| 4104 |
"in_benchmark":false
|
| 4105 |
},
|
| 4106 |
{
|
|
@@ -4351,7 +4351,7 @@
|
|
| 4351 |
"family":"Indo-European",
|
| 4352 |
"flores_path":null,
|
| 4353 |
"fleurs_tag":null,
|
| 4354 |
-
"commonvoice_hours":
|
| 4355 |
"commonvoice_locale":"br",
|
| 4356 |
"in_benchmark":false
|
| 4357 |
},
|
|
@@ -4651,7 +4651,7 @@
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
-
"commonvoice_hours":
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
|
@@ -5011,7 +5011,7 @@
|
|
| 5011 |
"family":"Nakh-Daghestanian",
|
| 5012 |
"flores_path":"dar_Cyrl",
|
| 5013 |
"fleurs_tag":null,
|
| 5014 |
-
"commonvoice_hours":
|
| 5015 |
"commonvoice_locale":"dar",
|
| 5016 |
"in_benchmark":true
|
| 5017 |
},
|
|
@@ -7879,7 +7879,7 @@
|
|
| 7879 |
"family":"Artificial Language",
|
| 7880 |
"flores_path":"epo_Latn",
|
| 7881 |
"fleurs_tag":null,
|
| 7882 |
-
"commonvoice_hours":
|
| 7883 |
"commonvoice_locale":"eo",
|
| 7884 |
"in_benchmark":true
|
| 7885 |
},
|
|
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
+
"commonvoice_hours":2674.0,
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
|
|
| 32 |
"flores_path":"hin_Deva",
|
| 33 |
"fleurs_tag":"hi_in",
|
| 34 |
"commonvoice_hours":16.0,
|
| 35 |
+
"commonvoice_locale":"hi-IN",
|
| 36 |
"in_benchmark":true
|
| 37 |
},
|
| 38 |
{
|
|
|
|
| 43 |
"family":"Indo-European",
|
| 44 |
"flores_path":"spa_Latn",
|
| 45 |
"fleurs_tag":"es_419",
|
| 46 |
+
"commonvoice_hours":448.0,
|
| 47 |
"commonvoice_locale":"es",
|
| 48 |
"in_benchmark":true
|
| 49 |
},
|
|
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
+
"commonvoice_hours":1065.0,
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
|
|
| 103 |
"family":"Indo-European",
|
| 104 |
"flores_path":"por_Latn",
|
| 105 |
"fleurs_tag":"pt_br",
|
| 106 |
+
"commonvoice_hours":180.0,
|
| 107 |
"commonvoice_locale":"pt",
|
| 108 |
"in_benchmark":true
|
| 109 |
},
|
|
|
|
| 115 |
"family":"Indo-European",
|
| 116 |
"flores_path":"pan_Guru",
|
| 117 |
"fleurs_tag":"pa_in",
|
| 118 |
+
"commonvoice_hours":2.3,
|
| 119 |
"commonvoice_locale":"pa-IN",
|
| 120 |
"in_benchmark":true
|
| 121 |
},
|
|
|
|
| 127 |
"family":"Indo-European",
|
| 128 |
"flores_path":"rus_Cyrl",
|
| 129 |
"fleurs_tag":"ru_ru",
|
| 130 |
+
"commonvoice_hours":245.0,
|
| 131 |
"commonvoice_locale":"ru",
|
| 132 |
"in_benchmark":true
|
| 133 |
},
|
|
|
|
| 139 |
"family":"Atlantic-Congo",
|
| 140 |
"flores_path":"swh_Latn",
|
| 141 |
"fleurs_tag":"sw_ke",
|
| 142 |
+
"commonvoice_hours":411.0,
|
| 143 |
"commonvoice_locale":"sw",
|
| 144 |
"in_benchmark":true
|
| 145 |
},
|
|
|
|
| 151 |
"family":"Austronesian",
|
| 152 |
"flores_path":"ind_Latn",
|
| 153 |
"fleurs_tag":"id_id",
|
| 154 |
+
"commonvoice_hours":33.0,
|
| 155 |
"commonvoice_locale":"id",
|
| 156 |
"in_benchmark":true
|
| 157 |
},
|
|
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
+
"commonvoice_hours":1369.0,
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169 |
},
|
|
|
|
| 379 |
"family":"Indo-European",
|
| 380 |
"flores_path":null,
|
| 381 |
"fleurs_tag":"ps_af",
|
| 382 |
+
"commonvoice_hours":81.0,
|
| 383 |
"commonvoice_locale":"ps",
|
| 384 |
"in_benchmark":false
|
| 385 |
},
|
|
|
|
| 439 |
"family":"Indo-European",
|
| 440 |
"flores_path":"pol_Latn",
|
| 441 |
"fleurs_tag":"pl_pl",
|
| 442 |
+
"commonvoice_hours":175.0,
|
| 443 |
"commonvoice_locale":"pl",
|
| 444 |
"in_benchmark":true
|
| 445 |
},
|
|
|
|
| 619 |
"family":"Indo-European",
|
| 620 |
"flores_path":"nld_Latn",
|
| 621 |
"fleurs_tag":"nl_nl",
|
| 622 |
+
"commonvoice_hours":120.0,
|
| 623 |
"commonvoice_locale":"nl",
|
| 624 |
"in_benchmark":true
|
| 625 |
},
|
|
|
|
| 655 |
"family":"Atlantic-Congo",
|
| 656 |
"flores_path":"yor_Latn",
|
| 657 |
"fleurs_tag":"yo_ng",
|
| 658 |
+
"commonvoice_hours":6.3,
|
| 659 |
"commonvoice_locale":"yo",
|
| 660 |
"in_benchmark":true
|
| 661 |
},
|
|
|
|
| 979 |
"family":"Turkic",
|
| 980 |
"flores_path":"kaz_Cyrl",
|
| 981 |
"fleurs_tag":"kk_kz",
|
| 982 |
+
"commonvoice_hours":2.2,
|
| 983 |
"commonvoice_locale":"kk",
|
| 984 |
"in_benchmark":true
|
| 985 |
},
|
|
|
|
| 1027 |
"family":"Uralic",
|
| 1028 |
"flores_path":"hun_Latn",
|
| 1029 |
"fleurs_tag":"hu_hu",
|
| 1030 |
+
"commonvoice_hours":93.0,
|
| 1031 |
"commonvoice_locale":"hu",
|
| 1032 |
"in_benchmark":true
|
| 1033 |
},
|
|
|
|
| 1099 |
"family":"Indo-European",
|
| 1100 |
"flores_path":"ckb_Arab",
|
| 1101 |
"fleurs_tag":"ckb_iq",
|
| 1102 |
+
"commonvoice_hours":135.0,
|
| 1103 |
"commonvoice_locale":"ckb",
|
| 1104 |
"in_benchmark":true
|
| 1105 |
},
|
|
|
|
| 1183 |
"family":"Indo-European",
|
| 1184 |
"flores_path":"bel_Cyrl",
|
| 1185 |
"fleurs_tag":"be_by",
|
| 1186 |
+
"commonvoice_hours":1810.0,
|
| 1187 |
"commonvoice_locale":"be",
|
| 1188 |
"in_benchmark":true
|
| 1189 |
},
|
|
|
|
| 1207 |
"family":"Indo-European",
|
| 1208 |
"flores_path":"tgk_Cyrl",
|
| 1209 |
"fleurs_tag":"tg_tj",
|
| 1210 |
+
"commonvoice_hours":0.4,
|
| 1211 |
"commonvoice_locale":"tg",
|
| 1212 |
"in_benchmark":true
|
| 1213 |
},
|
|
|
|
| 1243 |
"family":"Indo-European",
|
| 1244 |
"flores_path":"afr_Latn",
|
| 1245 |
"fleurs_tag":"af_za",
|
| 1246 |
+
"commonvoice_hours":0.5,
|
| 1247 |
"commonvoice_locale":"af",
|
| 1248 |
"in_benchmark":true
|
| 1249 |
},
|
|
|
|
| 1291 |
"family":"Indo-European",
|
| 1292 |
"flores_path":"cat_Latn",
|
| 1293 |
"fleurs_tag":"ca_es",
|
| 1294 |
+
"commonvoice_hours":2863.0,
|
| 1295 |
"commonvoice_locale":"ca",
|
| 1296 |
"in_benchmark":true
|
| 1297 |
},
|
|
|
|
| 1303 |
"family":"Afro-Asiatic",
|
| 1304 |
"flores_path":"heb_Hebr",
|
| 1305 |
"fleurs_tag":"he_il",
|
| 1306 |
+
"commonvoice_hours":1.4,
|
| 1307 |
"commonvoice_locale":"he",
|
| 1308 |
"in_benchmark":true
|
| 1309 |
},
|
|
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
+
"commonvoice_hours":411.0,
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
|
|
| 1519 |
"family":"Indo-European",
|
| 1520 |
"flores_path":"kmr_Latn",
|
| 1521 |
"fleurs_tag":null,
|
| 1522 |
+
"commonvoice_hours":69.0,
|
| 1523 |
"commonvoice_locale":"kmr",
|
| 1524 |
"in_benchmark":true
|
| 1525 |
},
|
|
|
|
| 1555 |
"family":"Indo-European",
|
| 1556 |
"flores_path":"slk_Latn",
|
| 1557 |
"fleurs_tag":"sk_sk",
|
| 1558 |
+
"commonvoice_hours":51.0,
|
| 1559 |
"commonvoice_locale":"sk",
|
| 1560 |
"in_benchmark":true
|
| 1561 |
},
|
|
|
|
| 1675 |
"family":"Tupian",
|
| 1676 |
"flores_path":"gug_Latn",
|
| 1677 |
"fleurs_tag":null,
|
| 1678 |
+
"commonvoice_hours":4.0,
|
| 1679 |
"commonvoice_locale":"gn",
|
| 1680 |
"in_benchmark":true
|
| 1681 |
},
|
|
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
+
"commonvoice_hours":0.5,
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
|
|
| 2155 |
"family":"Kartvelian",
|
| 2156 |
"flores_path":"kat_Geor",
|
| 2157 |
"fleurs_tag":"ka_ge",
|
| 2158 |
+
"commonvoice_hours":166.0,
|
| 2159 |
"commonvoice_locale":"ka",
|
| 2160 |
"in_benchmark":true
|
| 2161 |
},
|
|
|
|
| 2167 |
"family":"Indo-European",
|
| 2168 |
"flores_path":"glg_Latn",
|
| 2169 |
"fleurs_tag":"gl_es",
|
| 2170 |
+
"commonvoice_hours":117.0,
|
| 2171 |
"commonvoice_locale":"gl",
|
| 2172 |
"in_benchmark":true
|
| 2173 |
},
|
|
|
|
| 2323 |
"family":"Dravidian",
|
| 2324 |
"flores_path":null,
|
| 2325 |
"fleurs_tag":null,
|
| 2326 |
+
"commonvoice_hours":1.2,
|
| 2327 |
"commonvoice_locale":"brh",
|
| 2328 |
"in_benchmark":false
|
| 2329 |
},
|
|
|
|
| 2623 |
"family":"Indo-European",
|
| 2624 |
"flores_path":null,
|
| 2625 |
"fleurs_tag":null,
|
| 2626 |
+
"commonvoice_hours":0.9,
|
| 2627 |
"commonvoice_locale":"haz",
|
| 2628 |
"in_benchmark":false
|
| 2629 |
},
|
|
|
|
| 2695 |
"family":"Indo-European",
|
| 2696 |
"flores_path":"oci_Latn",
|
| 2697 |
"fleurs_tag":"oc_fr",
|
| 2698 |
+
"commonvoice_hours":1.8,
|
| 2699 |
"commonvoice_locale":"oc",
|
| 2700 |
"in_benchmark":true
|
| 2701 |
},
|
|
|
|
| 3175 |
"family":"Atlantic-Congo",
|
| 3176 |
"flores_path":null,
|
| 3177 |
"fleurs_tag":null,
|
| 3178 |
+
"commonvoice_hours":null,
|
| 3179 |
+
"commonvoice_locale":null,
|
| 3180 |
"in_benchmark":false
|
| 3181 |
},
|
| 3182 |
{
|
|
|
|
| 3319 |
"family":"Indo-European",
|
| 3320 |
"flores_path":null,
|
| 3321 |
"fleurs_tag":null,
|
| 3322 |
+
"commonvoice_hours":null,
|
| 3323 |
+
"commonvoice_locale":null,
|
| 3324 |
"in_benchmark":false
|
| 3325 |
},
|
| 3326 |
{
|
|
|
|
| 3331 |
"family":"Indo-European",
|
| 3332 |
"flores_path":"gle_Latn",
|
| 3333 |
"fleurs_tag":"ga_ie",
|
| 3334 |
+
"commonvoice_hours":8.3,
|
| 3335 |
"commonvoice_locale":"ga-IE",
|
| 3336 |
"in_benchmark":true
|
| 3337 |
},
|
|
|
|
| 3487 |
"family":"Indo-European",
|
| 3488 |
"flores_path":"lvs_Latn",
|
| 3489 |
"fleurs_tag":"lv_lv",
|
| 3490 |
+
"commonvoice_hours":262.0,
|
| 3491 |
"commonvoice_locale":"lv",
|
| 3492 |
"in_benchmark":true
|
| 3493 |
},
|
|
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
+
"commonvoice_hours":440.0,
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
+
"commonvoice_hours":83.0,
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
|
|
| 3679 |
"family":"Indo-European",
|
| 3680 |
"flores_path":"ydd_Hebr",
|
| 3681 |
"fleurs_tag":null,
|
| 3682 |
+
"commonvoice_hours":0.7,
|
| 3683 |
"commonvoice_locale":"yi",
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
|
|
|
| 3991 |
"family":"Atlantic-Congo",
|
| 3992 |
"flores_path":null,
|
| 3993 |
"fleurs_tag":null,
|
| 3994 |
+
"commonvoice_hours":null,
|
| 3995 |
+
"commonvoice_locale":null,
|
| 3996 |
"in_benchmark":false
|
| 3997 |
},
|
| 3998 |
{
|
|
|
|
| 4099 |
"family":"Indo-European",
|
| 4100 |
"flores_path":null,
|
| 4101 |
"fleurs_tag":null,
|
| 4102 |
+
"commonvoice_hours":null,
|
| 4103 |
+
"commonvoice_locale":null,
|
| 4104 |
"in_benchmark":false
|
| 4105 |
},
|
| 4106 |
{
|
|
|
|
| 4351 |
"family":"Indo-European",
|
| 4352 |
"flores_path":null,
|
| 4353 |
"fleurs_tag":null,
|
| 4354 |
+
"commonvoice_hours":29.0,
|
| 4355 |
"commonvoice_locale":"br",
|
| 4356 |
"in_benchmark":false
|
| 4357 |
},
|
|
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
+
"commonvoice_hours":30.0,
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
|
|
|
| 5011 |
"family":"Nakh-Daghestanian",
|
| 5012 |
"flores_path":"dar_Cyrl",
|
| 5013 |
"fleurs_tag":null,
|
| 5014 |
+
"commonvoice_hours":0.0,
|
| 5015 |
"commonvoice_locale":"dar",
|
| 5016 |
"in_benchmark":true
|
| 5017 |
},
|
|
|
|
| 7879 |
"family":"Artificial Language",
|
| 7880 |
"flores_path":"epo_Latn",
|
| 7881 |
"fleurs_tag":null,
|
| 7882 |
+
"commonvoice_hours":1436.0,
|
| 7883 |
"commonvoice_locale":"eo",
|
| 7884 |
"in_benchmark":true
|
| 7885 |
},
|
models.json
CHANGED
|
@@ -20,15 +20,15 @@
|
|
| 20 |
]
|
| 21 |
},
|
| 22 |
{
|
| 23 |
-
"id":"anthropic\/claude-3.
|
| 24 |
-
"name":"Claude 3.
|
| 25 |
"provider_name":"Anthropic",
|
| 26 |
"cost":15.0,
|
| 27 |
"hf_id":null,
|
| 28 |
"size":null,
|
| 29 |
"type":"closed-source",
|
| 30 |
"license":null,
|
| 31 |
-
"creation_date":
|
| 32 |
"tasks":[
|
| 33 |
"translation_from",
|
| 34 |
"translation_to",
|
|
@@ -40,15 +40,15 @@
|
|
| 40 |
]
|
| 41 |
},
|
| 42 |
{
|
| 43 |
-
"id":"anthropic\/claude-sonnet
|
| 44 |
-
"name":"Claude Sonnet
|
| 45 |
"provider_name":"Anthropic",
|
| 46 |
"cost":15.0,
|
| 47 |
"hf_id":null,
|
| 48 |
"size":null,
|
| 49 |
"type":"closed-source",
|
| 50 |
"license":null,
|
| 51 |
-
"creation_date":
|
| 52 |
"tasks":[
|
| 53 |
"translation_from",
|
| 54 |
"translation_to",
|
|
@@ -60,15 +60,15 @@
|
|
| 60 |
]
|
| 61 |
},
|
| 62 |
{
|
| 63 |
-
"id":"
|
| 64 |
-
"name":"
|
| 65 |
-
"provider_name":"
|
| 66 |
"cost":15.0,
|
| 67 |
"hf_id":null,
|
| 68 |
"size":null,
|
| 69 |
"type":"closed-source",
|
| 70 |
"license":null,
|
| 71 |
-
"creation_date":
|
| 72 |
"tasks":[
|
| 73 |
"translation_from",
|
| 74 |
"translation_to",
|
|
@@ -83,7 +83,7 @@
|
|
| 83 |
"id":"deepseek\/deepseek-chat",
|
| 84 |
"name":"DeepSeek V3",
|
| 85 |
"provider_name":"DeepSeek",
|
| 86 |
-
"cost":0.
|
| 87 |
"hf_id":"deepseek-ai\/DeepSeek-V3",
|
| 88 |
"size":684531386000.0,
|
| 89 |
"type":"open-source",
|
|
@@ -120,15 +120,35 @@
|
|
| 120 |
]
|
| 121 |
},
|
| 122 |
{
|
| 123 |
-
"id":"deepseek\/deepseek-
|
| 124 |
-
"name":"
|
| 125 |
"provider_name":"DeepSeek",
|
| 126 |
"cost":0.0,
|
| 127 |
-
"hf_id":"deepseek-ai\/DeepSeek-
|
| 128 |
"size":684531386000.0,
|
| 129 |
"type":"open-source",
|
| 130 |
"license":"Mit",
|
| 131 |
-
"creation_date":
|
|
|
|
|
|
|
|
|
|
| 132 |
"tasks":[
|
| 133 |
"translation_from",
|
| 134 |
"translation_to",
|
|
@@ -200,15 +220,145 @@
|
|
| 200 |
]
|
| 201 |
},
|
| 202 |
{
|
| 203 |
-
"id":"google\/
|
| 204 |
-
"name":"
|
| 205 |
"provider_name":"Google",
|
| 206 |
-
"cost":0.
|
| 207 |
-
"hf_id":
|
| 208 |
-
"size":
|
| 209 |
-
"type":"
|
| 210 |
-
"license":
|
| 211 |
-
"creation_date":
|
|
|
|
|
|
|
|
|
|
|
| 212 |
"tasks":[
|
| 213 |
"translation_from",
|
| 214 |
"translation_to",
|
|
@@ -240,15 +390,30 @@
|
|
| 240 |
]
|
| 241 |
},
|
| 242 |
{
|
| 243 |
-
"id":"
|
| 244 |
-
"name":"
|
| 245 |
-
"provider_name":"
|
| 246 |
-
"cost":0
|
| 247 |
-
"hf_id":
|
| 248 |
-
"size":
|
|
|
|
|
|
|
|
|
| 249 |
"type":"open-source",
|
| 250 |
-
"license":"
|
| 251 |
-
"creation_date":
|
| 252 |
"tasks":[
|
| 253 |
"translation_from",
|
| 254 |
"translation_to",
|
|
@@ -260,12 +425,12 @@
|
|
| 260 |
]
|
| 261 |
},
|
| 262 |
{
|
| 263 |
-
"id":"meta-llama\/llama-3-
|
| 264 |
-
"name":"Llama 3
|
| 265 |
"provider_name":"Meta",
|
| 266 |
-
"cost":0.
|
| 267 |
-
"hf_id":"meta-llama\/Meta-Llama-3-
|
| 268 |
-
"size":
|
| 269 |
"type":"open-source",
|
| 270 |
"license":"Llama3",
|
| 271 |
"creation_date":1713312000000,
|
|
@@ -299,6 +464,30 @@
|
|
| 299 |
"mgsm"
|
| 300 |
]
|
| 301 |
},
|
|
|
|
|
|
|
|
| 302 |
{
|
| 303 |
"id":"meta-llama\/llama-3.3-70b-instruct",
|
| 304 |
"name":"Llama 3.3 70B Instruct",
|
|
@@ -339,26 +528,6 @@
|
|
| 339 |
"mgsm"
|
| 340 |
]
|
| 341 |
},
|
| 342 |
-
{
|
| 343 |
-
"id":"meta-llama\/llama-guard-3-8b",
|
| 344 |
-
"name":"Llama Guard 3 8B",
|
| 345 |
-
"provider_name":"Llama Guard 3 8B",
|
| 346 |
-
"cost":0.06,
|
| 347 |
-
"hf_id":"meta-llama\/Llama-Guard-3-8B",
|
| 348 |
-
"size":8030261248.0,
|
| 349 |
-
"type":"open-source",
|
| 350 |
-
"license":"Llama3.1",
|
| 351 |
-
"creation_date":1721606400000,
|
| 352 |
-
"tasks":[
|
| 353 |
-
"translation_from",
|
| 354 |
-
"translation_to",
|
| 355 |
-
"classification",
|
| 356 |
-
"mmlu",
|
| 357 |
-
"arc",
|
| 358 |
-
"truthfulqa",
|
| 359 |
-
"mgsm"
|
| 360 |
-
]
|
| 361 |
-
},
|
| 362 |
{
|
| 363 |
"id":"microsoft\/phi-4",
|
| 364 |
"name":"Phi 4",
|
|
@@ -399,26 +568,6 @@
|
|
| 399 |
"mgsm"
|
| 400 |
]
|
| 401 |
},
|
| 402 |
-
{
|
| 403 |
-
"id":"microsoft\/wizardlm-2-8x22b",
|
| 404 |
-
"name":"WizardLM-2 8x22B",
|
| 405 |
-
"provider_name":"WizardLM-2 8x22B",
|
| 406 |
-
"cost":0.48,
|
| 407 |
-
"hf_id":null,
|
| 408 |
-
"size":null,
|
| 409 |
-
"type":"closed-source",
|
| 410 |
-
"license":null,
|
| 411 |
-
"creation_date":1713225600000,
|
| 412 |
-
"tasks":[
|
| 413 |
-
"translation_from",
|
| 414 |
-
"translation_to",
|
| 415 |
-
"classification",
|
| 416 |
-
"mmlu",
|
| 417 |
-
"arc",
|
| 418 |
-
"truthfulqa",
|
| 419 |
-
"mgsm"
|
| 420 |
-
]
|
| 421 |
-
},
|
| 422 |
{
|
| 423 |
"id":"mistralai\/mistral-nemo",
|
| 424 |
"name":"Mistral Nemo",
|
|
@@ -459,26 +608,6 @@
|
|
| 459 |
"mgsm"
|
| 460 |
]
|
| 461 |
},
|
| 462 |
-
{
|
| 463 |
-
"id":"mistralai\/mistral-small-24b-instruct-2501",
|
| 464 |
-
"name":"Mistral Small 3",
|
| 465 |
-
"provider_name":"Mistral",
|
| 466 |
-
"cost":0.0,
|
| 467 |
-
"hf_id":"mistralai\/Mistral-Small-24B-Instruct-2501",
|
| 468 |
-
"size":23572403200.0,
|
| 469 |
-
"type":"open-source",
|
| 470 |
-
"license":"Apache 2.0",
|
| 471 |
-
"creation_date":1738022400000,
|
| 472 |
-
"tasks":[
|
| 473 |
-
"translation_from",
|
| 474 |
-
"translation_to",
|
| 475 |
-
"classification",
|
| 476 |
-
"mmlu",
|
| 477 |
-
"arc",
|
| 478 |
-
"truthfulqa",
|
| 479 |
-
"mgsm"
|
| 480 |
-
]
|
| 481 |
-
},
|
| 482 |
{
|
| 483 |
"id":"mistralai\/mistral-small-3.1-24b-instruct",
|
| 484 |
"name":"Mistral Small 3.1 24B",
|
|
@@ -499,106 +628,6 @@
|
|
| 499 |
"mgsm"
|
| 500 |
]
|
| 501 |
},
|
| 502 |
-
{
|
| 503 |
-
"id":"moonshotai\/kimi-k2",
|
| 504 |
-
"name":"Kimi K2",
|
| 505 |
-
"provider_name":"MoonshotAI",
|
| 506 |
-
"cost":0.0,
|
| 507 |
-
"hf_id":"moonshotai\/Kimi-K2-Instruct",
|
| 508 |
-
"size":null,
|
| 509 |
-
"type":"open-source",
|
| 510 |
-
"license":"Other",
|
| 511 |
-
"creation_date":1752192000000,
|
| 512 |
-
"tasks":[
|
| 513 |
-
"translation_from",
|
| 514 |
-
"translation_to",
|
| 515 |
-
"classification",
|
| 516 |
-
"mmlu",
|
| 517 |
-
"arc",
|
| 518 |
-
"truthfulqa",
|
| 519 |
-
"mgsm"
|
| 520 |
-
]
|
| 521 |
-
},
|
| 522 |
-
{
|
| 523 |
-
"id":"nousresearch\/deephermes-3-llama-3-8b-preview",
|
| 524 |
-
"name":"DeepHermes 3 Llama 3 8B Preview",
|
| 525 |
-
"provider_name":"Nous",
|
| 526 |
-
"cost":0.0,
|
| 527 |
-
"hf_id":"NousResearch\/DeepHermes-3-Llama-3-8B-Preview",
|
| 528 |
-
"size":8030261248.0,
|
| 529 |
-
"type":"open-source",
|
| 530 |
-
"license":"Llama3",
|
| 531 |
-
"creation_date":1739318400000,
|
| 532 |
-
"tasks":[
|
| 533 |
-
"translation_from",
|
| 534 |
-
"translation_to",
|
| 535 |
-
"classification",
|
| 536 |
-
"mmlu",
|
| 537 |
-
"arc",
|
| 538 |
-
"truthfulqa",
|
| 539 |
-
"mgsm"
|
| 540 |
-
]
|
| 541 |
-
},
|
| 542 |
-
{
|
| 543 |
-
"id":"nousresearch\/hermes-2-pro-llama-3-8b",
|
| 544 |
-
"name":"Hermes 2 Pro - Llama-3 8B",
|
| 545 |
-
"provider_name":"NousResearch",
|
| 546 |
-
"cost":0.04,
|
| 547 |
-
"hf_id":"NousResearch\/Hermes-2-Pro-Llama-3-8B",
|
| 548 |
-
"size":8030523392.0,
|
| 549 |
-
"type":"open-source",
|
| 550 |
-
"license":"Llama3",
|
| 551 |
-
"creation_date":1714435200000,
|
| 552 |
-
"tasks":[
|
| 553 |
-
"translation_from",
|
| 554 |
-
"translation_to",
|
| 555 |
-
"classification",
|
| 556 |
-
"mmlu",
|
| 557 |
-
"arc",
|
| 558 |
-
"truthfulqa",
|
| 559 |
-
"mgsm"
|
| 560 |
-
]
|
| 561 |
-
},
|
| 562 |
-
{
|
| 563 |
-
"id":"nousresearch\/hermes-3-llama-3.1-405b",
|
| 564 |
-
"name":"Hermes 3 405B Instruct",
|
| 565 |
-
"provider_name":"Nous",
|
| 566 |
-
"cost":0.8,
|
| 567 |
-
"hf_id":"NousResearch\/Hermes-3-Llama-3.1-405B",
|
| 568 |
-
"size":405853388800.0,
|
| 569 |
-
"type":"open-source",
|
| 570 |
-
"license":"Llama3",
|
| 571 |
-
"creation_date":1723507200000,
|
| 572 |
-
"tasks":[
|
| 573 |
-
"translation_from",
|
| 574 |
-
"translation_to",
|
| 575 |
-
"classification",
|
| 576 |
-
"mmlu",
|
| 577 |
-
"arc",
|
| 578 |
-
"truthfulqa",
|
| 579 |
-
"mgsm"
|
| 580 |
-
]
|
| 581 |
-
},
|
| 582 |
-
{
|
| 583 |
-
"id":"nousresearch\/hermes-3-llama-3.1-70b",
|
| 584 |
-
"name":"Hermes 3 70B Instruct",
|
| 585 |
-
"provider_name":"Nous",
|
| 586 |
-
"cost":0.28,
|
| 587 |
-
"hf_id":"NousResearch\/Hermes-3-Llama-3.1-70B",
|
| 588 |
-
"size":70553706496.0,
|
| 589 |
-
"type":"open-source",
|
| 590 |
-
"license":"Llama3",
|
| 591 |
-
"creation_date":1722211200000,
|
| 592 |
-
"tasks":[
|
| 593 |
-
"translation_from",
|
| 594 |
-
"translation_to",
|
| 595 |
-
"classification",
|
| 596 |
-
"mmlu",
|
| 597 |
-
"arc",
|
| 598 |
-
"truthfulqa",
|
| 599 |
-
"mgsm"
|
| 600 |
-
]
|
| 601 |
-
},
|
| 602 |
{
|
| 603 |
"id":"openai\/gpt-3.5-turbo-0613",
|
| 604 |
"name":"GPT-3.5 Turbo (older v0613)",
|
|
@@ -679,26 +708,6 @@
|
|
| 679 |
"mgsm"
|
| 680 |
]
|
| 681 |
},
|
| 682 |
-
{
|
| 683 |
-
"id":"openai\/gpt-4o-2024-11-20",
|
| 684 |
-
"name":"GPT-4o (2024-11-20)",
|
| 685 |
-
"provider_name":"OpenAI",
|
| 686 |
-
"cost":10.0,
|
| 687 |
-
"hf_id":null,
|
| 688 |
-
"size":null,
|
| 689 |
-
"type":"closed-source",
|
| 690 |
-
"license":null,
|
| 691 |
-
"creation_date":1732060800000,
|
| 692 |
-
"tasks":[
|
| 693 |
-
"translation_from",
|
| 694 |
-
"translation_to",
|
| 695 |
-
"classification",
|
| 696 |
-
"mmlu",
|
| 697 |
-
"arc",
|
| 698 |
-
"truthfulqa",
|
| 699 |
-
"mgsm"
|
| 700 |
-
]
|
| 701 |
-
},
|
| 702 |
{
|
| 703 |
"id":"openai\/gpt-4o-mini",
|
| 704 |
"name":"GPT-4o-mini",
|
|
@@ -719,86 +728,6 @@
|
|
| 719 |
"mgsm"
|
| 720 |
]
|
| 721 |
},
|
| 722 |
-
{
|
| 723 |
-
"id":"openai\/gpt-5",
|
| 724 |
-
"name":"GPT-5",
|
| 725 |
-
"provider_name":"OpenAI",
|
| 726 |
-
"cost":10.0,
|
| 727 |
-
"hf_id":null,
|
| 728 |
-
"size":null,
|
| 729 |
-
"type":"closed-source",
|
| 730 |
-
"license":null,
|
| 731 |
-
"creation_date":1754524800000,
|
| 732 |
-
"tasks":[
|
| 733 |
-
"translation_from",
|
| 734 |
-
"translation_to",
|
| 735 |
-
"classification",
|
| 736 |
-
"mmlu",
|
| 737 |
-
"arc",
|
| 738 |
-
"truthfulqa",
|
| 739 |
-
"mgsm"
|
| 740 |
-
]
|
| 741 |
-
},
|
| 742 |
-
{
|
| 743 |
-
"id":"openai\/gpt-5-nano",
|
| 744 |
-
"name":"GPT-5 Nano",
|
| 745 |
-
"provider_name":"OpenAI",
|
| 746 |
-
"cost":0.4,
|
| 747 |
-
"hf_id":null,
|
| 748 |
-
"size":null,
|
| 749 |
-
"type":"closed-source",
|
| 750 |
-
"license":null,
|
| 751 |
-
"creation_date":1754524800000,
|
| 752 |
-
"tasks":[
|
| 753 |
-
"translation_from",
|
| 754 |
-
"translation_to",
|
| 755 |
-
"classification",
|
| 756 |
-
"mmlu",
|
| 757 |
-
"arc",
|
| 758 |
-
"truthfulqa",
|
| 759 |
-
"mgsm"
|
| 760 |
-
]
|
| 761 |
-
},
|
| 762 |
-
{
|
| 763 |
-
"id":"openai\/gpt-oss-120b",
|
| 764 |
-
"name":"gpt-oss-120b",
|
| 765 |
-
"provider_name":"OpenAI",
|
| 766 |
-
"cost":0.0,
|
| 767 |
-
"hf_id":"openai\/gpt-oss-120b",
|
| 768 |
-
"size":120412337472.0,
|
| 769 |
-
"type":"open-source",
|
| 770 |
-
"license":"Apache 2.0",
|
| 771 |
-
"creation_date":1754265600000,
|
| 772 |
-
"tasks":[
|
| 773 |
-
"translation_from",
|
| 774 |
-
"translation_to",
|
| 775 |
-
"classification",
|
| 776 |
-
"mmlu",
|
| 777 |
-
"arc",
|
| 778 |
-
"truthfulqa",
|
| 779 |
-
"mgsm"
|
| 780 |
-
]
|
| 781 |
-
},
|
| 782 |
-
{
|
| 783 |
-
"id":"openai\/gpt-oss-20b",
|
| 784 |
-
"name":"gpt-oss-20b",
|
| 785 |
-
"provider_name":"OpenAI",
|
| 786 |
-
"cost":0.0,
|
| 787 |
-
"hf_id":"openai\/gpt-oss-20b",
|
| 788 |
-
"size":21511953984.0,
|
| 789 |
-
"type":"open-source",
|
| 790 |
-
"license":"Apache 2.0",
|
| 791 |
-
"creation_date":1754265600000,
|
| 792 |
-
"tasks":[
|
| 793 |
-
"translation_from",
|
| 794 |
-
"translation_to",
|
| 795 |
-
"classification",
|
| 796 |
-
"mmlu",
|
| 797 |
-
"arc",
|
| 798 |
-
"truthfulqa",
|
| 799 |
-
"mgsm"
|
| 800 |
-
]
|
| 801 |
-
},
|
| 802 |
{
|
| 803 |
"id":"qwen\/qwen3-235b-a22b",
|
| 804 |
"name":"Qwen3 235B A22B",
|
|
@@ -843,7 +772,7 @@
|
|
| 843 |
"id":"qwen\/qwen3-32b",
|
| 844 |
"name":"Qwen3 32B",
|
| 845 |
"provider_name":"Qwen",
|
| 846 |
-
"cost":0.
|
| 847 |
"hf_id":"Qwen\/Qwen3-32B",
|
| 848 |
"size":32762123264.0,
|
| 849 |
"type":"open-source",
|
|
@@ -858,140 +787,5 @@
|
|
| 858 |
"truthfulqa",
|
| 859 |
"mgsm"
|
| 860 |
]
|
| 861 |
-
},
|
| 862 |
-
{
|
| 863 |
-
"id":"sao10k\/l3-lunaris-8b",
|
| 864 |
-
"name":"Llama 3 8B Lunaris",
|
| 865 |
-
"provider_name":"Sao10K",
|
| 866 |
-
"cost":0.05,
|
| 867 |
-
"hf_id":"Sao10K\/L3-8B-Lunaris-v1",
|
| 868 |
-
"size":8030261248.0,
|
| 869 |
-
"type":"open-source",
|
| 870 |
-
"license":"Llama3",
|
| 871 |
-
"creation_date":1719360000000,
|
| 872 |
-
"tasks":[
|
| 873 |
-
"translation_from",
|
| 874 |
-
"translation_to",
|
| 875 |
-
"classification",
|
| 876 |
-
"mmlu",
|
| 877 |
-
"arc",
|
| 878 |
-
"truthfulqa",
|
| 879 |
-
"mgsm"
|
| 880 |
-
]
|
| 881 |
-
},
|
| 882 |
-
{
|
| 883 |
-
"id":"scb10x\/llama3.1-typhoon2-70b-instruct",
|
| 884 |
-
"name":"Typhoon2 70B Instruct",
|
| 885 |
-
"provider_name":"Typhoon2 70B Instruct",
|
| 886 |
-
"cost":0.88,
|
| 887 |
-
"hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
|
| 888 |
-
"size":70553706496.0,
|
| 889 |
-
"type":"open-source",
|
| 890 |
-
"license":"Llama3.1",
|
| 891 |
-
"creation_date":1734220800000,
|
| 892 |
-
"tasks":[
|
| 893 |
-
"translation_from",
|
| 894 |
-
"translation_to",
|
| 895 |
-
"classification",
|
| 896 |
-
"mmlu",
|
| 897 |
-
"arc",
|
| 898 |
-
"truthfulqa",
|
| 899 |
-
"mgsm"
|
| 900 |
-
]
|
| 901 |
-
},
|
| 902 |
-
{
|
| 903 |
-
"id":"shisa-ai\/shisa-v2-llama3.3-70b",
|
| 904 |
-
"name":"Shisa V2 Llama 3.3 70B ",
|
| 905 |
-
"provider_name":"Shisa AI",
|
| 906 |
-
"cost":0.0,
|
| 907 |
-
"hf_id":"shisa-ai\/shisa-v2-llama3.3-70b",
|
| 908 |
-
"size":70553706496.0,
|
| 909 |
-
"type":"open-source",
|
| 910 |
-
"license":"Llama3.3",
|
| 911 |
-
"creation_date":1744502400000,
|
| 912 |
-
"tasks":[
|
| 913 |
-
"translation_from",
|
| 914 |
-
"translation_to",
|
| 915 |
-
"classification",
|
| 916 |
-
"mmlu",
|
| 917 |
-
"arc",
|
| 918 |
-
"truthfulqa",
|
| 919 |
-
"mgsm"
|
| 920 |
-
]
|
| 921 |
-
},
|
| 922 |
-
{
|
| 923 |
-
"id":"x-ai\/grok-2-vision-1212",
|
| 924 |
-
"name":"Grok 2 Vision 1212",
|
| 925 |
-
"provider_name":"xAI",
|
| 926 |
-
"cost":10.0,
|
| 927 | - "hf_id":null,
| 928 | - "size":null,
| 929 | - "type":"closed-source",
| 930 | - "license":null,
| 931 | - "creation_date":1734220800000,
| 932 | - "tasks":[
| 933 | - "translation_from",
| 934 | - "translation_to",
| 935 | - "classification",
| 936 | - "mmlu",
| 937 | - "arc",
| 938 | - "truthfulqa",
| 939 | - "mgsm"
| 940 | - ]
| 941 | - },
| 942 | - {
| 943 | - "id":"x-ai\/grok-4",
| 944 | - "name":"Grok 4",
| 945 | - "provider_name":"xAI",
| 946 | - "cost":15.0,
| 947 | - "hf_id":null,
| 948 | - "size":null,
| 949 | - "type":"closed-source",
| 950 | - "license":null,
| 951 | - "creation_date":1752019200000,
| 952 | - "tasks":[
| 953 | - "translation_from",
| 954 | - "translation_to",
| 955 | - "classification",
| 956 | - "mmlu",
| 957 | - "arc",
| 958 | - "truthfulqa",
| 959 | - "mgsm"
| 960 | - ]
| 961 | - },
| 962 | - {
| 963 | - "id":"z-ai\/glm-4.5",
| 964 | - "name":"GLM 4.5",
| 965 | - "provider_name":"Z.AI",
| 966 | - "cost":1.32,
| 967 | - "hf_id":"zai-org\/GLM-4.5",
| 968 | - "size":358337791296.0,
| 969 | - "type":"open-source",
| 970 | - "license":"Mit",
| 971 | - "creation_date":1752969600000,
| 972 | - "tasks":[
| 973 | - "translation_from",
| 974 | - "translation_to",
| 975 | - "classification",
| 976 | - "mmlu",
| 977 | - "arc",
| 978 | - "truthfulqa",
| 979 | - "mgsm"
| 980 | - ]
| 981 | - },
| 982 | - {
| 983 | - "id":"google\/translate-v2",
| 984 | - "name":"Google Translate",
| 985 | - "provider_name":"Google",
| 986 | - "cost":20.0,
| 987 | - "hf_id":null,
| 988 | - "size":null,
| 989 | - "type":"closed-source",
| 990 | - "license":null,
| 991 | - "creation_date":null,
| 992 | - "tasks":[
| 993 | - "translation_from",
| 994 | - "translation_to"
| 995 | - ]
| 996 |   }
| 997 |   ]

| 20 |   ]
| 21 |   },
| 22 |   {
| 23 | + "id":"anthropic\/claude-3.5-sonnet",
| 24 | + "name":"Claude 3.5 Sonnet",
| 25 |   "provider_name":"Anthropic",
| 26 |   "cost":15.0,
| 27 |   "hf_id":null,
| 28 |   "size":null,
| 29 |   "type":"closed-source",
| 30 |   "license":null,
| 31 | + "creation_date":1729555200000,
| 32 |   "tasks":[
| 33 |   "translation_from",
| 34 |   "translation_to",

| 40 |   ]
| 41 |   },
| 42 |   {
| 43 | + "id":"anthropic\/claude-3.7-sonnet",
| 44 | + "name":"Claude 3.7 Sonnet",
| 45 |   "provider_name":"Anthropic",
| 46 |   "cost":15.0,
| 47 |   "hf_id":null,
| 48 |   "size":null,
| 49 |   "type":"closed-source",
| 50 |   "license":null,
| 51 | + "creation_date":1740355200000,
| 52 |   "tasks":[
| 53 |   "translation_from",
| 54 |   "translation_to",

| 60 |   ]
| 61 |   },
| 62 |   {
| 63 | + "id":"anthropic\/claude-sonnet-4",
| 64 | + "name":"Claude Sonnet 4",
| 65 | + "provider_name":"Anthropic",
| 66 |   "cost":15.0,
| 67 |   "hf_id":null,
| 68 |   "size":null,
| 69 |   "type":"closed-source",
| 70 |   "license":null,
| 71 | + "creation_date":1747872000000,
| 72 |   "tasks":[
| 73 |   "translation_from",
| 74 |   "translation_to",

| 83 |   "id":"deepseek\/deepseek-chat",
| 84 |   "name":"DeepSeek V3",
| 85 |   "provider_name":"DeepSeek",
| 86 | + "cost":0.0,
| 87 |   "hf_id":"deepseek-ai\/DeepSeek-V3",
| 88 |   "size":684531386000.0,
| 89 |   "type":"open-source",

| 120 |   ]
| 121 |   },
| 122 |   {
| 123 | + "id":"deepseek\/deepseek-r1",
| 124 | + "name":"R1",
| 125 |   "provider_name":"DeepSeek",
| 126 |   "cost":0.0,
| 127 | + "hf_id":"deepseek-ai\/DeepSeek-R1",
| 128 |   "size":684531386000.0,
| 129 |   "type":"open-source",
| 130 |   "license":"Mit",
| 131 | + "creation_date":1737331200000,
| 132 | + "tasks":[
| 133 | + "translation_from",
| 134 | + "translation_to",
| 135 | + "classification",
| 136 | + "mmlu",
| 137 | + "arc",
| 138 | + "truthfulqa",
| 139 | + "mgsm"
| 140 | + ]
| 141 | + },
| 142 | + {
| 143 | + "id":"deepseek\/deepseek-r1-0528",
| 144 | + "name":"R1 0528",
| 145 | + "provider_name":"DeepSeek",
| 146 | + "cost":0.0,
| 147 | + "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
| 148 | + "size":684531386000.0,
| 149 | + "type":"open-source",
| 150 | + "license":"Mit",
| 151 | + "creation_date":1748390400000.0,
| 152 |   "tasks":[
| 153 |   "translation_from",
| 154 |   "translation_to",

| 220 |   ]
| 221 |   },
| 222 |   {
| 223 | + "id":"google\/gemini-2.5-flash-lite-preview-06-17",
| 224 | + "name":"Gemini 2.5 Flash Lite Preview 06-17",
| 225 |   "provider_name":"Google",
| 226 | + "cost":0.4,
| 227 | + "hf_id":null,
| 228 | + "size":null,
| 229 | + "type":"closed-source",
| 230 | + "license":null,
| 231 | + "creation_date":1750118400000.0,
| 232 | + "tasks":[
| 233 | + "translation_from",
| 234 | + "translation_to",
| 235 | + "classification",
| 236 | + "mmlu",
| 237 | + "mgsm"
| 238 | + ]
| 239 | + },
| 240 | + {
| 241 | + "id":"google\/gemini-2.5-flash-preview",
| 242 | + "name":"Gemini 2.5 Flash Preview 04-17",
| 243 | + "provider_name":"Google",
| 244 | + "cost":0.6,
| 245 | + "hf_id":null,
| 246 | + "size":null,
| 247 | + "type":"closed-source",
| 248 | + "license":null,
| 249 | + "creation_date":1744848000000.0,
| 250 | + "tasks":[
| 251 | + "translation_from",
| 252 | + "translation_to",
| 253 | + "classification",
| 254 | + "mmlu",
| 255 | + "mgsm"
| 256 | + ]
| 257 | + },
| 258 | + {
| 259 | + "id":"google\/gemini-2.5-flash-preview-05-20",
| 260 | + "name":"Gemini 2.5 Flash Preview 05-20",
| 261 | + "provider_name":"Google",
| 262 | + "cost":0.6,
| 263 | + "hf_id":null,
| 264 | + "size":null,
| 265 | + "type":"closed-source",
| 266 | + "license":null,
| 267 | + "creation_date":1747699200000.0,
| 268 | + "tasks":[
| 269 | + "translation_from",
| 270 | + "translation_to",
| 271 | + "classification",
| 272 | + "mmlu",
| 273 | + "mgsm"
| 274 | + ]
| 275 | + },
| 276 | + {
| 277 | + "id":"google\/gemini-2.5-pro",
| 278 | + "name":"Gemini 2.5 Pro",
| 279 | + "provider_name":"Google",
| 280 | + "cost":10.0,
| 281 | + "hf_id":null,
| 282 | + "size":null,
| 283 | + "type":"closed-source",
| 284 | + "license":null,
| 285 | + "creation_date":1750118400000,
| 286 | + "tasks":[
| 287 | + "translation_from",
| 288 | + "translation_to",
| 289 | + "classification",
| 290 | + "mmlu",
| 291 | + "arc",
| 292 | + "truthfulqa",
| 293 | + "mgsm"
| 294 | + ]
| 295 | + },
| 296 | + {
| 297 | + "id":"google\/gemini-2.5-pro-preview",
| 298 | + "name":"Gemini 2.5 Pro Preview 06-05",
| 299 | + "provider_name":"Google",
| 300 | + "cost":10.0,
| 301 | + "hf_id":null,
| 302 | + "size":null,
| 303 | + "type":"closed-source",
| 304 | + "license":null,
| 305 | + "creation_date":1749081600000.0,
| 306 | + "tasks":[
| 307 | + "translation_from",
| 308 | + "translation_to",
| 309 | + "classification",
| 310 | + "mmlu",
| 311 | + "mgsm"
| 312 | + ]
| 313 | + },
| 314 | + {
| 315 | + "id":"google\/gemini-2.5-pro-preview-05-06",
| 316 | + "name":"Gemini 2.5 Pro Preview 05-06",
| 317 | + "provider_name":"Google",
| 318 | + "cost":10.0,
| 319 | + "hf_id":null,
| 320 | + "size":null,
| 321 | + "type":"closed-source",
| 322 | + "license":null,
| 323 | + "creation_date":1746576000000.0,
| 324 | + "tasks":[
| 325 | + "translation_from",
| 326 | + "translation_to",
| 327 | + "classification",
| 328 | + "mmlu",
| 329 | + "mgsm"
| 330 | + ]
| 331 | + },
| 332 | + {
| 333 | + "id":"google\/gemini-flash-1.5",
| 334 | + "name":"Gemini 1.5 Flash ",
| 335 | + "provider_name":"Google",
| 336 | + "cost":0.3,
| 337 | + "hf_id":null,
| 338 | + "size":null,
| 339 | + "type":"closed-source",
| 340 | + "license":null,
| 341 | + "creation_date":1715644800000,
| 342 | + "tasks":[
| 343 | + "translation_from",
| 344 | + "translation_to",
| 345 | + "classification",
| 346 | + "mmlu",
| 347 | + "arc",
| 348 | + "truthfulqa",
| 349 | + "mgsm"
| 350 | + ]
| 351 | + },
| 352 | + {
| 353 | + "id":"google\/gemini-flash-1.5-8b",
| 354 | + "name":"Gemini 1.5 Flash 8B",
| 355 | + "provider_name":"Google",
| 356 | + "cost":0.15,
| 357 | + "hf_id":null,
| 358 | + "size":null,
| 359 | + "type":"closed-source",
| 360 | + "license":null,
| 361 | + "creation_date":1727913600000,
| 362 |   "tasks":[
| 363 |   "translation_from",
| 364 |   "translation_to",

| 390 |   ]
| 391 |   },
| 392 |   {
| 393 | + "id":"google\/translate-v2",
| 394 | + "name":"Google Translate",
| 395 | + "provider_name":"Google",
| 396 | + "cost":20.0,
| 397 | + "hf_id":null,
| 398 | + "size":null,
| 399 | + "type":"closed-source",
| 400 | + "license":null,
| 401 | + "creation_date":null,
| 402 | + "tasks":[
| 403 | + "translation_from",
| 404 | + "translation_to"
| 405 | + ]
| 406 | + },
| 407 | + {
| 408 | + "id":"gryphe\/mythomax-l2-13b",
| 409 | + "name":"MythoMax 13B",
| 410 | + "provider_name":"MythoMax 13B",
| 411 | + "cost":0.07,
| 412 | + "hf_id":"Gryphe\/MythoMax-L2-13b",
| 413 | + "size":null,
| 414 |   "type":"open-source",
| 415 | + "license":"Other",
| 416 | + "creation_date":1691625600000,
| 417 |   "tasks":[
| 418 |   "translation_from",
| 419 |   "translation_to",

| 425 |   ]
| 426 |   },
| 427 |   {
| 428 | + "id":"meta-llama\/llama-3-70b-instruct",
| 429 | + "name":"Llama 3 70B Instruct",
| 430 |   "provider_name":"Meta",
| 431 | + "cost":0.4,
| 432 | + "hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
| 433 | + "size":70553706496.0,
| 434 |   "type":"open-source",
| 435 |   "license":"Llama3",
| 436 |   "creation_date":1713312000000,

| 464 |   "mgsm"
| 465 |   ]
| 466 |   },
| 467 | + {
| 468 | + "id":"meta-llama\/llama-3.1-8b-instruct",
| 469 | + "name":"Llama 3.1 8B Instruct",
| 470 | + "provider_name":"Meta",
| 471 | + "cost":0.0,
| 472 | + "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
| 473 | + "size":8030261248.0,
| 474 | + "type":"open-source",
| 475 | + "license":"Llama3.1",
| 476 | + "creation_date":1721260800000.0,
| 477 | + "tasks":null
| 478 | + },
| 479 | + {
| 480 | + "id":"meta-llama\/llama-3.2-1b-instruct",
| 481 | + "name":"Llama 3.2 1B Instruct",
| 482 | + "provider_name":"Meta",
| 483 | + "cost":0.0,
| 484 | + "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
| 485 | + "size":1235814400.0,
| 486 | + "type":"open-source",
| 487 | + "license":"Llama3.2",
| 488 | + "creation_date":1726617600000.0,
| 489 | + "tasks":null
| 490 | + },
| 491 |   {
| 492 |   "id":"meta-llama\/llama-3.3-70b-instruct",
| 493 |   "name":"Llama 3.3 70B Instruct",

| 528 |   "mgsm"
| 529 |   ]
| 530 |   },

| 531 |   {
| 532 |   "id":"microsoft\/phi-4",
| 533 |   "name":"Phi 4",

| 568 |   "mgsm"
| 569 |   ]
| 570 |   },

| 571 |   {
| 572 |   "id":"mistralai\/mistral-nemo",
| 573 |   "name":"Mistral Nemo",

| 608 |   "mgsm"
| 609 |   ]
| 610 |   },

| 611 |   {
| 612 |   "id":"mistralai\/mistral-small-3.1-24b-instruct",
| 613 |   "name":"Mistral Small 3.1 24B",

| 628 |   "mgsm"
| 629 |   ]
| 630 |   },

| 631 |   {
| 632 |   "id":"openai\/gpt-3.5-turbo-0613",
| 633 |   "name":"GPT-3.5 Turbo (older v0613)",

| 708 |   "mgsm"
| 709 |   ]
| 710 |   },

| 711 |   {
| 712 |   "id":"openai\/gpt-4o-mini",
| 713 |   "name":"GPT-4o-mini",

| 728 |   "mgsm"
| 729 |   ]
| 730 |   },

| 731 |   {
| 732 |   "id":"qwen\/qwen3-235b-a22b",
| 733 |   "name":"Qwen3 235B A22B",

| 772 |   "id":"qwen\/qwen3-32b",
| 773 |   "name":"Qwen3 32B",
| 774 |   "provider_name":"Qwen",
| 775 | + "cost":0.0,
| 776 |   "hf_id":"Qwen\/Qwen3-32B",
| 777 |   "size":32762123264.0,
| 778 |   "type":"open-source",

| 787 |   "truthfulqa",
| 788 |   "mgsm"
| 789 |   ]

| 790 |   }
| 791 |   ]
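Every entry in the rewritten models.json carries the same fields shown in the hunks above (`id`, `name`, `provider_name`, `cost`, `hf_id`, `size`, `type`, `license`, `creation_date`, `tasks`, where `tasks` may be null). A minimal sketch of how the file can be consumed, assuming it is read from the repository root; the helper name `models_supporting` is illustrative and not part of this commit:

```python
import json

def models_supporting(task: str, path: str = "models.json") -> list[dict]:
    """Return model entries whose "tasks" list includes the given task."""
    with open(path, encoding="utf-8") as f:
        models = json.load(f)
    # "tasks" can be null (None) for entries that have not been evaluated yet.
    return [m for m in models if m.get("tasks") and task in m["tasks"]]

if __name__ == "__main__":
    # e.g. list the open-source models that report MGSM results
    print([m["id"] for m in models_supporting("mgsm") if m["type"] == "open-source"])
```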
pyproject.toml
CHANGED
@@ -36,9 +36,6 @@ dev = [
| 36 |   "tqdm>=4.67.1",
| 37 |   "transformers>=4.51.3",
| 38 |   ]
| 39 | - cloud = [
| 40 | - "google-cloud-storage>=3.2.0",
| 41 | - ]
| 42 |
| 43 |   [dependency-groups]
| 44 |   dev = [
@@ -47,10 +44,3 @@ dev = [
| 47 |   "scipy>=1.16.0",
| 48 |   "seaborn>=0.13.2",
| 49 |   ]
| 50 | -
| 51 | - [build-system]
| 52 | - requires = ["hatchling"]
| 53 | - build-backend = "hatchling.build"
| 54 | -
| 55 | - [tool.hatch.build.targets.wheel]
| 56 | - packages = ["evals"]
results.json
CHANGED
@@ -1,3 +1,3 @@
| 1 |   version https://git-lfs.github.com/spec/v1
| 2 | - oid sha256:
| 2 | + oid sha256:8dbe020a1941a0e49c05f81aeee40ba37d3e2f9f3d83303fcfe1b5711676d1d8
| 3 | - size
| 3 | + size 2978273
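The before/after stanzas above are Git LFS pointer files: the tracked results.json stores only a `version` line, the `oid` (SHA-256 of the actual payload), and its `size` in bytes (2978273 after this commit). A small sketch for reading those fields, assuming the pointer is checked out at the repository root; `parse_lfs_pointer` is an illustrative helper, not part of the repo:

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict[str, str]:
    """Split each "key value" line of a Git LFS pointer file into a dict."""
    fields = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        key, _, value = line.partition(" ")
        if key:
            fields[key] = value
    return fields

if __name__ == "__main__":
    pointer = parse_lfs_pointer("results.json")
    print(pointer["oid"], pointer["size"])  # e.g. sha256:8dbe02... 2978273
```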
uv.lock
CHANGED
The diff for this file is too large to render.
See raw diff