Upload from GitHub Actions: Add auto-translated datasets
- .DS_Store +0 -0
- .github/workflows/nightly-evals.yml +0 -4
- .gitignore +0 -3
- Dockerfile +1 -1
- README.md +0 -135
- datasets.json +6 -6
- evals/backend.py +39 -138
- evals/countries.py +4 -9
- evals/datasets_/arc.py +19 -33
- evals/datasets_/mgsm.py +24 -36
- evals/datasets_/mmlu.py +23 -45
- evals/datasets_/truthfulqa.py +26 -53
- evals/datasets_/util.py +0 -7
- evals/main.py +51 -161
- evals/models.py +44 -146
- evals/tasks.py +168 -160
- frontend/src/App.js +77 -183
- frontend/src/components/HistoryPlot.js +2 -2
- frontend/src/components/LanguageTable.js +1 -1
- frontend/src/components/ModelTable.js +17 -31
- frontend/src/components/ScoreColumns.js +10 -23
- frontend/src/components/ScoreField.js +1 -2
- frontend/src/components/SpeakerPlot.js +2 -2
- frontend/src/components/WorldMap.js +7 -22
- languages.json +49 -49
- models.json +226 -432
- pyproject.toml +0 -10
- results.json +2 -2
- uv.lock +0 -0
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
.github/workflows/nightly-evals.yml
CHANGED
@@ -8,8 +8,6 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
-    # checking if this is working in case eval runs take longer than 6h github actions allowance
-    timeout-minutes: 1440 # 24 hours timeout
     steps:
       - uses: actions/checkout@v3

@@ -27,8 +25,6 @@ jobs:
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-          N_SENTENCES: 20
-          MAX_LANGUAGES: 150
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py
.gitignore
CHANGED
@@ -20,6 +20,3 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
-
-# Project-specific files
-.dockerignore.eval
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock
+COPY --chown=user pyproject.toml uv.lock ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md
CHANGED
@@ -43,147 +43,12 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 _Tracking language proficiency of AI models for every language_
 
-## System Architecture
-
-The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
-
-```mermaid
-flowchart TD
-    %% Model Sources
-    A1["important_models<br/>Static Curated List"] --> D[load_models]
-    A2["get_historical_popular_models<br/>Web Scraping - Top 20"] --> D
-    A3["get_current_popular_models<br/>Web Scraping - Top 10"] --> D
-    A4["blocklist<br/>Exclusions"] --> D
-
-    %% Model Processing
-    D --> |"Combine & Dedupe"| E["Dynamic Model List<br/>~40-50 models"]
-    E --> |get_or_metadata| F["OpenRouter API<br/>Model Metadata"]
-    F --> |get_hf_metadata| G["HuggingFace API<br/>Model Details"]
-    G --> H["Enriched Model DataFrame"]
-    H --> |Save| I[models.json]
-
-    %% Model Validation & Cost Filtering
-    H --> |"Validate Models<br/>Check API Availability"| H1["Valid Models Only<br/>Cost ≤ $20/1M tokens"]
-    H1 --> |"Timeout Protection<br/>120s for Large Models"| H2["Robust Model List"]
-
-    %% Language Data
-    J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]
-
-    %% Task Registry with Unified Prompting
-    L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions<br/>Unified English Zero-Shot"]
-    M --> M1["translation_from/to<br/>BLEU + ChrF"]
-    M --> M2["classification<br/>Accuracy"]
-    M --> M3["mmlu<br/>Accuracy"]
-    M --> M4["arc<br/>Accuracy"]
-    M --> M5["truthfulqa<br/>Accuracy"]
-    M --> M6["mgsm<br/>Accuracy"]
-
-    %% On-the-fly Translation with Origin Tagging
-    subgraph OTF [On-the-fly Dataset Translation]
-        direction LR
-        DS_raw["Raw English Dataset<br/>(e.g., MMLU)"] --> Google_Translate["Google Translate API"]
-        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., German MMLU)<br/>Origin: 'machine'"]
-        DS_native["Native Dataset<br/>(e.g., German MMLU)<br/>Origin: 'human'"]
-    end
-
-    %% Evaluation Pipeline
-    H2 --> |"models ID"| N["main.py / main_gcs.py<br/>evaluate"]
-    K --> |"languages bcp_47"| N
-    L --> |"tasks.items"| N
-    N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model × Language × Task"]
-    O --> |"10 samples each"| P["Evaluation Execution<br/>Batch Processing"]
-
-    %% Task Execution with Origin Tracking
-    P --> Q1[translate_and_evaluate<br/>Origin: 'human']
-    P --> Q2[classify_and_evaluate<br/>Origin: 'human']
-    P --> Q3[mmlu_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q4[arc_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q6[mgsm_and_evaluate<br/>Origin: 'human'/'machine']
-
-    %% API Calls with Error Handling
-    Q1 --> |"complete() API<br/>Rate Limiting"| R["OpenRouter<br/>Model Inference"]
-    Q2 --> |"complete() API<br/>Rate Limiting"| R
-    Q3 --> |"complete() API<br/>Rate Limiting"| R
-    Q4 --> |"complete() API<br/>Rate Limiting"| R
-    Q5 --> |"complete() API<br/>Rate Limiting"| R
-    Q6 --> |"complete() API<br/>Rate Limiting"| R
-
-    %% Results Processing with Origin Aggregation
-    R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task+origin"]
-    S --> |Save| T[results.json]
-
-    %% Backend & Frontend with Origin-Specific Metrics
-    T --> |Read| U[backend.py]
-    I --> |Read| U
-    U --> |make_model_table| V["Model Rankings<br/>Origin-Specific Metrics"]
-    U --> |make_country_table| W["Country Aggregation"]
-    U --> |"API Endpoint"| X["FastAPI /api/data<br/>arc_accuracy_human<br/>arc_accuracy_machine"]
-    X --> |"JSON Response"| Y["Frontend React App"]
-
-    %% UI Components
-    Y --> Z1["WorldMap.js<br/>Country Visualization"]
-    Y --> Z2["ModelTable.js<br/>Model Rankings"]
-    Y --> Z3["LanguageTable.js<br/>Language Coverage"]
-    Y --> Z4["DatasetTable.js<br/>Task Performance"]
-
-    %% Data Sources with Origin Information
-    subgraph DS ["Data Sources"]
-        DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
-        DS2["MMLU/AfriMMLU<br/>Knowledge QA<br/>Origin: 'human'"]
-        DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
-        DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
-        DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
-    end
-
-    DS1 --> Q1
-    DS2 --> Q3
-    DS3 --> Q4
-    DS4 --> Q5
-    DS5 --> Q6
-
-    DS_translated --> Q3
-    DS_translated --> Q4
-    DS_translated --> Q5
-
-    DS_native --> Q3
-    DS_native --> Q4
-    DS_native --> Q5
-
-    %% Styling - Neutral colors that work in both dark and light modes
-    classDef modelSource fill:#f8f9fa,stroke:#6c757d,color:#212529
-    classDef evaluation fill:#e9ecef,stroke:#495057,color:#212529
-    classDef api fill:#dee2e6,stroke:#6c757d,color:#212529
-    classDef storage fill:#d1ecf1,stroke:#0c5460,color:#0c5460
-    classDef frontend fill:#f8d7da,stroke:#721c24,color:#721c24
-    classDef translation fill:#d4edda,stroke:#155724,color:#155724
-
-    class A1,A2,A3,A4 modelSource
-    class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
-    class R,F,G,X api
-    class T,I storage
-    class Y,Z1,Z2,Z3,Z4 frontend
-    class Google_Translate,DS_translated,DS_native translation
-```
-
-**Key Features:**
-- **Model Discovery**: Combines curated models with real-time trending models via web scraping
-- **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
-- **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
-- **Real-time Visualization**: Interactive web interface with country-level insights
-
 ## Evaluate
 
-### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
 
-### Google Cloud Deployment
-```bash
-uv run --extra dev evals/main_gcs.py
-```
-
 ## Explore
 
 ```bash
datasets.json
CHANGED
@@ -219,7 +219,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented":
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented":
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented":
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented":
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -420,7 +420,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented":
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
@@ -435,7 +435,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented":
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
evals/backend.py
CHANGED
@@ -4,18 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-
-# Robust import so this file works both as a package module and as a script
-try:
-    # When executed as a package module (recommended): `python -m uvicorn evals.backend:app`
-    from .countries import make_country_table
-except Exception:
-    try:
-        # When executed from project root with package path available
-        from evals.countries import make_country_table
-    except Exception:
-        # When executed directly from evals/ directory
-        from countries import make_country_table
+from countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware

@@ -37,7 +26,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    "truthfulqa_accuracy",
+    # "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]

@@ -56,145 +45,66 @@ def compute_normalized_average(df, metrics):
     return normalized_df.mean(axis=1, skipna=False)
 
 
-def make_model_table(
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="model",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="model", columns="task_metric", values="score", aggfunc="mean"
+def make_model_table(df, models):
+    df = (
+        df.groupby(["model", "task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique"})
+        .reset_index()
     )
-
-    df =
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="model", columns="task_metric", values="score")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per model+metric
-    origin_presence = (
-        scores_df.groupby(["model", "task_metric", "origin"]).size().unstack(fill_value=0)
-    )
-    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for model in df.index:
-                try:
-                    counts = origin_presence.loc[(model, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
-            "rank",
-            "
+            "rank",
+            "model",
+            "name",
+            "provider_name",
+            "hf_id",
+            "creation_date",
+            "size",
+            "type",
+            "license",
+            "cost",
+            "average",
+            *task_metrics,
         ]
     ]
     return df
 
 
-def make_language_table(
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="bcp_47",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+def make_language_table(df, languages):
+    df = (
+        df.groupby(["bcp_47", "task", "metric"])
+        .agg({"score": "mean", "model": "nunique"})
+        .reset_index()
     )
-
-    df =
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
-    origin_presence = (
-        scores_df.groupby(["bcp_47", "task_metric", "origin"]).size().unstack(fill_value=0)
-    )
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for bcp in df.index:
-                try:
-                    counts = origin_presence.loc[(bcp, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
-
-    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
-    for metric in task_metrics:
-        machine_col = f"{metric}_machine"
-        if machine_col in df.columns:
-            df[f"{metric}_is_machine"] = df[machine_col].notna()
-        else:
-            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
-            "bcp_47",
-            "
+            "bcp_47",
+            "language_name",
+            "autonym",
+            "speakers",
+            "family",
+            "average",
+            "in_benchmark",
+            *task_metrics,
         ]
     ]
     return df

@@ -215,18 +125,10 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = scores.groupby(["model", "bcp_47", "task", "metric"
+    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
-
-    # Identify which metrics have machine translations available
-    machine_translated_metrics = set()
-    for _, row in df.iterrows():
-        if row["origin"] == "machine":
-            metric_name = f"{row['task']}_{row['metric']}"
-            machine_translated_metrics.add(metric_name)
-
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]

@@ -241,7 +143,6 @@ async def data(request: Request):
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
-        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
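Note (illustration, not part of the diff): the new `make_model_table` replaces the origin-aware pivot with a plain groupby-then-pivot. A minimal, self-contained sketch of that pattern on made-up scores (toy data, not taken from the repo):

```python
import pandas as pd

# Toy long-format scores: one row per (model, task, metric, language).
scores = pd.DataFrame({
    "model":  ["m1", "m1", "m2", "m2"],
    "task":   ["mmlu", "arc", "mmlu", "arc"],
    "metric": ["accuracy"] * 4,
    "bcp_47": ["de", "fr", "de", "sw"],
    "score":  [0.8, 0.6, 0.7, 0.5],
})

# Same shape of pipeline as the new make_model_table: average per model/task/metric,
# build a "task_metric" column, then pivot to one row per model.
df = (
    scores.groupby(["model", "task", "metric"])
    .agg({"score": "mean", "bcp_47": "nunique"})
    .reset_index()
)
df["task_metric"] = df["task"] + "_" + df["metric"]
df = df.pivot(index="model", columns="task_metric", values="score")
print(df)  # columns: arc_accuracy, mmlu_accuracy; one row per model
```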
evals/countries.py
CHANGED
@@ -30,15 +30,10 @@ def make_country_table(language_table):
     )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-
-
-
-
-        score = (
-            sum(entry["score"] * entry["population"] for entry in languages)
-            / speaker_pop
-        )
-
+        score = (
+            sum(entry["score"] * entry["population"] for entry in languages)
+            / speaker_pop
+        )
         countries[country] = {
             "score": score,
             "languages": languages,
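For reference, the country score computed above is a population-weighted mean of the per-language scores; a tiny standalone example with made-up numbers:

```python
# Two languages spoken in one country, with per-language scores and speaker counts.
languages = [
    {"score": 0.8, "population": 60_000_000},
    {"score": 0.4, "population": 20_000_000},
]

speaker_pop = sum(entry["population"] for entry in languages)
score = sum(entry["score"] * entry["population"] for entry in languages) / speaker_pop
print(round(score, 2))  # 0.7 = (0.8*60M + 0.4*20M) / 80M
```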
evals/datasets_/arc.py
CHANGED
@@ -1,10 +1,11 @@
 import random
+from collections import Counter, defaultdict
 
-from langcodes import standardize_tag
+from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google,
+from models import translate_google, google_supported_languages
 from tqdm import tqdm
-from datasets import
+from datasets import Dataset, load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os

@@ -13,33 +14,27 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 
 
 random.seed(42)
-id_sets_train = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
 
+
+
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row

@@ -50,36 +45,27 @@ def load_uhura_arc_easy(language_bcp_47, nr):
     ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
     ds = ds.map(add_choices)
     ds = ds.rename_column("answerKey", "answer")
+    train_ids = common_ids_train[nr:nr+3]
+    examples = ds["train"].filter(lambda x: x["id"] in train_ids)
     task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-    return "masakhane/uhura-arc-easy",
+    return "masakhane/uhura-arc-easy", examples, task
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
-            slug_uhura_arc_easy_translated,
-            tags_uhura_arc_easy_translated[language_bcp_47],
-        )
+        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
+        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated",
+        return "fair-forward/arc-easy-autotranslated", examples, task
     else:
         return None, None, None
 
-
-def load_uhura_arc_challenge(language_bcp_47, nr):
-    ds_name = "jlahd/uhura_arc_challenge"
-    if language_bcp_47 in _get_dataset_config_names(ds_name):
-        ds = _load_dataset(ds_name, language_bcp_47)
-        task = ds["test"][nr]
-        return ds_name, task
-    else:
-        return None, None, None
-
-
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in google_supported_languages
     ]
     n_samples = 10
     train_ids = common_ids_train[:n_samples+3]
evals/datasets_/mgsm.py
CHANGED
@@ -1,12 +1,10 @@
 import asyncio
 import os
-import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
-from langcodes import
-from models import
-from rich import print
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import standardize_tag
+from models import google_supported_languages, translate_google
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
 

@@ -39,41 +37,31 @@ def parse_number(i):
         return None
 
 
-@cache
-def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
-    """Cache individual MGSM items efficiently"""
-    try:
-        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
-        if nr >= len(ds):
-            return None
-
-        row = ds[nr]
-
-        # Post-process based on dataset type
-        if dataset_slug == slug_gsm8kx:
-            row["answer_number"] = row["answer"].split("####")[1].strip()
-
-        return row
-    except Exception:
-        # Dataset doesn't exist or doesn't have test split
-        return None
-
-
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        return slug_mgsm,
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
     elif language_bcp_47 in tags_afrimgsm.keys():
-        return slug_gsm8kx, item, "machine" if item else (None, None, None)
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
+        )
+        return slug_afrimgsm, ds[nr]
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
+        ds = _load_dataset(
+            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
+        )
+        return slug_gsm_autotranslated, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
     else:
-        return None, None
+        return None, None
 
 
 def translate_mgsm(languages):

@@ -81,7 +69,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in google_supported_languages
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
evals/datasets_/mmlu.py
CHANGED
@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import
+from models import google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio

@@ -111,7 +111,6 @@ def print_datasets_analysis():
     # MMLUX is translated using DeepL
     # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-
 # print_datasets_analysis()
 
 

@@ -144,51 +143,32 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated =
-    standardize_tag(a, macro=True): a
-    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
-}
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 
 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
 
 
-
-def _get_processed_mmlu_dataset(dataset_name, subset_tag):
-    """Cache processed datasets to avoid reprocessing"""
-    ds = _load_dataset(dataset_name, subset_tag)
-    if dataset_name == "masakhane/afrimmlu":
-        ds = ds.map(parse_choices)
-    elif dataset_name == "CohereForAI/Global-MMLU":
-        ds = ds.map(add_choices)
-    return ds
-
-
-@cache
-def _get_mmlu_item(dataset_name, subset_tag, category, nr):
-    """Cache individual MMLU items efficiently"""
-    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
-    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-    else:  # fair-forward/mmlu-autotranslated
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-
-
-async def load_mmlu(language_bcp_47, nr):
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        ds = ds.map(parse_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_mmlu_autotranslated:
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
 

@@ -197,10 +177,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:
-        if lang not in human_translated and lang in
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples =
+    n_samples = 10
 
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):

@@ -216,10 +196,8 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-
-
-                for i in range(min(n_samples, len(filtered))):
-                    task = filtered[i]
+                for i in range(n_samples):
+                    task = ds.filter(lambda x: x["subject"] == category)[i]
                     samples.append(task)
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
evals/datasets_/truthfulqa.py
CHANGED
@@ -9,26 +9,16 @@ from tqdm.asyncio import tqdm_asyncio
 import os
 
 from datasets import Dataset, load_dataset
-from models import translate_google,
+from models import translate_google, google_supported_languages
 
 from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
-slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
-
 tags_uhura_truthfulqa = {
     standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
 
-# Get available auto-translated languages
-try:
-    tags_truthfulqa_autotranslated = {
-        standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
-    }
-except Exception:
-    tags_truthfulqa_autotranslated = {}
-
 
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]

@@ -36,36 +26,27 @@ def add_choices(row):
     return row
 
 
-
+def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
-            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
-        )
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
         ds = ds.map(add_choices)
+        examples = ds["train"]
         task = ds["test"][nr]
-        return "masakhane/uhura-truthfulqa",
-    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
-        # Load from auto-translated dataset (same samples as translation)
-        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
-        test_split = ds["test"] if "test" in ds else ds
-        task = test_split[nr]
-        return slug_truthfulqa_autotranslated, task, "machine"
-        # TODO: add Okapi, TruthfulQA-X @Jonas
+        return "masakhane/uhura-truthfulqa", examples, task
     else:
         return None, None, None
 
+
+
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:
-        if lang not in human_translated and lang in
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples =
+    n_samples = 10
 
-    # Set fixed seed for consistent sample selection across all languages
-    random.seed(42)
-
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub

@@ -79,40 +60,32 @@ def translate_truthfulqa(languages):
             if split == "train":
                 samples.extend(ds)
             else:
-
-                for i in range(min(n_samples, len(ds))):
+                for i in range(n_samples):
                     task = ds[i]
                     samples.append(task)
-
-            # Translate questions
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
             ]
             questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
-            # Translate choices for each sample
-            all_choices_tr = []
-            all_labels = []
-
+            choices_texts_concatenated = []
             for s in samples:
-                choices_tr
-                all_choices_tr.append(choices_tr)
-                all_labels.append(labels)
+                for choice in eval(s["choices"]):
+                    choices_texts_concatenated.append(choice)
+            choices_tr = [
+                translate_google(c, "en", lang) for c in choices_texts_concatenated
+            ]
+            choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+            # group into chunks of 4
+            choices_tr = [
+                choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+            ]
 
             ds_lang = Dataset.from_dict(
                 {
+                    "subject": [s["subject"] for s in samples],
                     "question": questions_tr,
-                    "choices":
-                    "
+                    "choices": choices_tr,
+                    "answer": [s["answer"] for s in samples],
                 }
             )
             ds_lang.push_to_hub(

@@ -122,7 +95,7 @@ def translate_truthfulqa(languages):
                 token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
             )
             ds_lang.to_json(
-                f"data/translations/
+                f"data/translations/mmlu/{lang}_{split}.json",
                 lines=False,
                 force_ascii=False,
                 indent=2,
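Note (illustration, not part of the diff): the new choice translation in `translate_truthfulqa` flattens all answer options, translates them in one batch, and regroups the results into lists of four per question. A minimal sketch of just the flatten/regroup step, with a stand-in in place of the real `translate_google` call:

```python
# Each sample carries four answer choices; flatten them so one batch of
# translation calls covers every choice, then regroup into chunks of 4.
samples = [
    {"choices": ["A1", "A2", "A3", "A4"]},
    {"choices": ["B1", "B2", "B3", "B4"]},
]

flat = [choice for s in samples for choice in s["choices"]]
translated = [f"<tr:{c}>" for c in flat]  # stand-in for translate_google(c, "en", lang)
regrouped = [translated[i : i + 4] for i in range(0, len(translated), 4)]
print(regrouped)  # [['<tr:A1>', ..., '<tr:A4>'], ['<tr:B1>', ..., '<tr:B4>']]
```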
evals/datasets_/util.py
CHANGED
@@ -12,10 +12,3 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
-
-# Cache individual dataset items to avoid reloading entire datasets
-@cache
-def _get_dataset_item(dataset, subset, split, index, **kwargs):
-    """Load a single item from a dataset efficiently"""
-    ds = load_dataset(dataset, subset, split=split, **kwargs)
-    return ds[index] if index < len(ds) else None
evals/main.py
CHANGED
@@ -1,172 +1,62 @@
 import asyncio
 import pandas as pd
-import
-from datetime import datetime, timedelta
 from models import models
 from tasks import tasks
-from
-import os
 
-
-# Configuration - easily adjustable defaults
-n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
-max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
-single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
-test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
-
-# Keep original DataFrames for saving metadata - distinction added for single model test runs.
-original_models_df = pd.DataFrame(models)
-original_languages_df = pd.DataFrame(languages)
-
-# Create working copies for single evaluation runs
-models_df = original_models_df.copy()
-languages_df = original_languages_df.copy()
-top_languages = languages.head(max_languages)
-
-# Filter to single model if specified (only affects evaluation, not saving)
-if single_model:
-    models_df = models_df[models_df["id"] == single_model]
-    if len(models_df) == 0:
-        print(f"Error: Model '{single_model}' not found. Available models:")
-        for model_id in original_models_df["id"]:
-            print(f"  {model_id}")
-        return pd.DataFrame()
 
-
-
-    start_time = time.time()
-
-    # Load existing results to avoid re-evaluation (skip in test mode)
-    if test_mode:
-        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-    else:
-        try:
-            old_results = pd.read_json("results.json")
-            if old_results.empty:
-                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-        except FileNotFoundError:
-            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
-    # Get all combinations that need evaluation
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for model in models_df["id"]
-        for lang in top_languages.itertuples()
-        for task_name, task in tasks.items()
-        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-    ]
-
-    # Filter out already evaluated combinations
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    if not old_results.empty:
-        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
-        # set + combis is faster than merge (locally it made a difference for me when loading all data/tasks into memory)
-        mask = ~combis.apply(lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1)
-        combis = combis[mask]
-
-    # Create all evaluation tasks
-    all_tasks = []
-    for i in range(n_sentences):
-        for model, bcp_47, task_name in combis.itertuples(index=False):
-            all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-    print(f"Running {len(all_tasks)} evaluation tasks...")
-
-    # For single model runs, we stop immediately on first API error to inspect.
-    # For full evaluations, we continue despite errors to get maximum coverage.
-    stop_on_error = single_model is not None
-
-    # Process tasks in batches to avoid memory issues (for full evaluation locally that helped a lot)
-    batch_size = 1000
-    all_results = []
-
-    try:
-        for i in range(0, len(all_tasks), batch_size):
-            batch = all_tasks[i:i + batch_size]
-            batch_results = await asyncio.gather(
-                *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in batch],
-                return_exceptions=not stop_on_error
-            )
-            all_results.extend(batch_results)
-
-        results = all_results
-
-        # Process results and logging API errors separately to understand what are the main issues.
-        valid_results = []
-        errors = []
-
-        for i, r in enumerate(results):
-            if isinstance(r, Exception):
-                if i < len(all_tasks):
-                    task_info = all_tasks[i]
-                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
-            elif isinstance(r, list):
-                valid_results.extend(r)
-            elif r is not None:
-                valid_results.append(r)
-
-        # log errors and store
-        if errors:
-            with open("errors.log", "w") as f:
-                f.write("model,task,error\n")
-                for error in errors:
-                    f.write(error + "\n")
-
-        # Track model completion (TO BE DELETED - was for local run only)
-        if valid_results:
-            completed_models = set()
-            for result in valid_results:
-                if isinstance(result, dict) and "model" in result:
-                    model = result["model"]
-                    if model not in completed_models:
-                        completed_models.add(model)
-                        print(f"Completed: {model}")
 
-        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")
-
-    # this is for local single model runs - for testing and development
-    except Exception as e:
-        print(f"EVALUATION STOPPED - API Error occurred:")
-        print(f"Error type: {type(e).__name__}")
-        print(f"Error message: {str(e)}")
-        return pd.DataFrame()
-
-    # Save results (skipped in test mode as we do not want to overwrite existing results)
-    if valid_results:
-        results_df = pd.DataFrame(valid_results)
-
-        # Aggregate results
-        results_df = (
-            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-
-        if not test_mode:
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Merge with existing results
-            if not old_results.empty:
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-
-            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-            results_df.to_json("results.json", **args)
-
-            # Save model and language info (always save complete metadata, not filtered)
-            original_models_df.to_json("models.json", **args)
-            original_languages_df.to_json("languages.json", **args)
-        else:
-            print("TEST MODE: Skipping results saving")
-
-        elapsed = time.time() - start_time
-        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
-
-        return results_df
-
-    return pd.DataFrame()
 
 
if __name__ == "__main__":
|
|
|
|
 import asyncio
+
 import pandas as pd
+from languages import languages
 from models import models
 from tasks import tasks
+from tqdm.asyncio import tqdm_asyncio

+# ===== config =====

+n_sentences = 10
+
+# ===== run evaluation and aggregate results =====

+async def evaluate():
+    # FIXME we should not need this for-loop, but it helps
+    for n_languages in range(10, 101, 10):
+        print(f"running evaluations for {n_languages} languages")
+        old_results = pd.read_json("results.json")
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for model in models["id"]
+            for lang in languages.iloc[:n_languages].itertuples()
+            for task_name, task in tasks.items()
+            if task_name in models[models["id"] == model]["tasks"].iloc[0]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
+            )
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+            # save up-to-date info on models and languages
+            all_models = pd.concat([pd.DataFrame(models), old_models])
+            all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+            all_models.to_json("models.json", **args)
+            pd.DataFrame(languages).to_json("languages.json", **args)

 if __name__ == "__main__":
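The new main loop filters pending work with a left merge instead of the old set-lookup: rows of the requested grid that have no saved score keep NaN in the merged "metric" column and are kept for evaluation. A minimal, self-contained sketch of that pattern (the two DataFrames below are made-up toy stand-ins, not the project's real results.json):

import pandas as pd

# toy stand-ins for results.json and the requested (model, language, task) grid
old_results = pd.DataFrame(
    [{"model": "m1", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.9}]
)
combis = pd.DataFrame(
    [("m1", "en", "mmlu"), ("m1", "de", "mmlu")],
    columns=["model", "bcp_47", "task"],
)

# left merge: combinations without a saved score keep NaN in the "metric" column
pending = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
pending = pending[pending["metric"].isna()][["model", "bcp_47", "task"]]
print(pending)  # only ("m1", "de", "mmlu") is still pending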
evals/models.py
CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import json
 import re
 from collections import defaultdict

@@ -8,11 +7,7 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
-
-try:
-    from elevenlabs import AsyncElevenLabs
-except Exception:  # ImportError or other env-specific issues
-    AsyncElevenLabs = None
+from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory

@@ -27,17 +22,14 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",
-    "openai/gpt-5-nano", # include if/when available
     "openai/gpt-4.1", # 8$
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
-    "openai/gpt-4o-2024-11-20", # 10$
-    "openai/gpt-
-    "
-    "anthropic/claude-
-    "anthropic/claude-opus-4.1", # 15$ - added for full coverage
+    # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-3.5-turbo-0613", # 2$
+    # "openai/gpt-3.5-turbo", # 1.5$
+    # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
     "mistralai/mistral-saba", # 0.6$
     "mistralai/mistral-nemo", # 0.08$

@@ -56,13 +48,10 @@ important_models = [
     "microsoft/phi-4", # 0.07$
     "microsoft/phi-4-multimodal-instruct", # 0.1$
     "amazon/nova-micro-v1", # 0.09$
-    "moonshotai/kimi-k2", # 0.6$ - added to prevent missing from models.json
-    "x-ai/grok-4"
 ]

 blocklist = [
     "google/gemini-2.5-pro-preview",
-    "google/gemini-2.5-pro",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     "google/gemini-2.5-flash-preview-04-17",

@@ -70,7 +59,6 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
-    "perplexity/sonar-deep-research"
 ]

 transcription_models = [

@@ -97,82 +85,36 @@ def get_model(permaslug):
         and m["endpoint"]
         and not m["endpoint"]["is_free"]
     ]
+    if len(slugs) == 0:
+        # the problem is that free models typically have very high rate-limiting
+        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None


 @cache
 def get_historical_popular_models(date: date):
-    for count_str, model_slug in matches:
-        count = float(count_str)
-        if not model_slug.startswith('openrouter') and model_slug != 'Others':
-            # Remove variant suffixes for aggregation
-            base_model = model_slug.split(':')[0]
-            model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-        # Sort by popularity and return top models
-        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
-        result = []
-        for model_slug, count in sorted_models[:20]:  # Top 20
-            result.append({"slug": model_slug, "count": int(count)})
-
-        return result
-    else:
-        return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings").text
+    data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
+    data = json.loads(data.replace("\\", ""))
+    counts = defaultdict(int)
+    for day in data:
+        for model, count in day["ys"].items():
+            if model.startswith("openrouter") or model == "Others":
+                continue
+            counts[model.split(":")[0]] += count
+    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+    models = [get_model(model) for model, _ in counts]
+    return [m for m in models if m]


 @cache
 def get_current_popular_models(date: date):
-    # Find all count and model_permaslug pairs in the daily data
-    pattern = r'\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\"'
-    matches = re.findall(pattern, raw)
-
-    if matches:
-        # Aggregate model counts
-        model_counts = {}
-        for count_str, model_slug in matches:
-            count = float(count_str)
-            if not model_slug.startswith('openrouter') and model_slug != 'Others':
-                # Remove variant suffixes for aggregation
-                base_model = model_slug.split(':')[0]
-                model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-        # Sort by popularity and return top models
-        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
-        result = []
-        for model_slug, count in sorted_models[:10]:  # Top 10
-            result.append({"slug": model_slug, "count": int(count)})
-
-        return result
-    else:
-        return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
+    data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
+    data = json.loads(data)
+    data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
+    models = [get_model(model["model_permaslug"]) for model in data]
+    return [m for m in models if m]


 def get_translation_models():

@@ -206,52 +148,26 @@ google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)

 @cache
 async def complete(**kwargs) -> str | None:
-    # Add longer timeout for slower, premium, or reasoning-focused models
-    model_id = kwargs.get('model', '')
-    slow_model_keywords = [
-        'claude-3.5', 'claude-3.7', 'claude-4', 'sonnet-4', # Claude
-        'gpt-4', 'o1', 'o3', # OpenAI
-        'gemini-2.5', 'gemini-pro', # Google
-        'llama-4', # Meta
-        'reasoning', 'thinking' # General
-    ]
-    timeout = 120 if any(keyword in model_id for keyword in slow_model_keywords) else 60
-
     async with openrouter_rate_limit:
         try:
-            response = await
-                client.chat.completions.create(**kwargs),
-                timeout=timeout
-            )
+            response = await client.chat.completions.create(**kwargs)
         except BadRequestError as e:
             if "filtered" in e.message:
                 return None
             raise e
-        except asyncio.TimeoutError:
-            return None
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()

-translate_client = None
-
-def get_google_translate_client():
-    global translate_client
-    if translate_client is None:
-        translate_client = translate.Client()
-    return translate_client
-
+translate_client = translate.Client()
+google_supported_languages = [l["language"] for l in translate_client.get_languages()]

-    return [l["language"] for l in client.get_languages()]

 @cache
 async def translate_google(text, source_language, target_language):
-    client = get_google_translate_client()
     async with google_rate_limit:
-        response =
+        response = translate_client.translate(
             text, source_language=source_language, target_language=target_language
         )
     return response["translatedText"]

@@ -315,14 +231,12 @@ def get_hf_metadata(row):
         return empty
     try:
         info = api.model_info(id)
-        license =
-            .title()
-        )
+        license = (
+            (info.card_data.license or "")
+            .replace("-", " ")
+            .replace("mit", "MIT")
+            .title()
+        )
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,

@@ -335,14 +249,8 @@ def get_hf_metadata(row):


 def get_cost(row):
-    """
-
-    """
-    try:
-        cost = float(row["endpoint"]["pricing"]["completion"])
-        return round(cost * 1_000_000, 2)
-    except (TypeError, KeyError):
-        return None
+    cost = float(row["endpoint"]["pricing"]["completion"])
+    return round(cost * 1_000_000, 2)


 @cache

@@ -352,17 +260,8 @@ def load_models(date: date):
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
-
-    # Validate models exist on OpenRouter before including them
-    valid_models = []
-
-    for model_id in all_model_candidates:
-        metadata = get_or_metadata(model_id)
-        if metadata is not None:
-            valid_models.append(model_id)
-
-    models = pd.DataFrame(sorted(valid_models), columns=["id"])
+    models = set(important_models + popular_models) - set(blocklist)
+    models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date

@@ -382,8 +281,7 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    #
-    models = models[models["cost"] <= 15.0].reset_index(drop=True)
+    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
         ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
     ] * len(models)
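The simplified get_cost now assumes the OpenRouter endpoint metadata always carries a completion price; the conversion itself is just the per-token USD price times one million. A small illustrative sketch (the function name and the nested dict are made-up stand-ins for the real endpoint metadata):

def cost_per_million_tokens(row: dict) -> float:
    # OpenRouter reports completion pricing in USD per single token,
    # so multiplying by 1_000_000 gives USD per million completion tokens
    cost = float(row["endpoint"]["pricing"]["completion"])
    return round(cost * 1_000_000, 2)

row = {"endpoint": {"pricing": {"completion": "0.0000006"}}}  # hypothetical example value
print(cost_per_million_tokens(row))  # 0.6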
evals/tasks.py
CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import random
 from functools import partial
 from textwrap import dedent

@@ -11,8 +10,10 @@ from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
+from google.cloud import translate_v2 as translate
+from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe
+from models import complete, transcribe, translate_google

 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")

@@ -26,6 +27,9 @@ target_languages = languages[languages["in_benchmark"]].sample(
     frac=1, weights="speakers", replace=True, random_state=42
 )

+translate_client = translate.Client()
+supported_languages = [l["language"] for l in translate_client.get_languages()]
+

 async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]

@@ -43,20 +47,31 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
+    if model == "google/translate-v2":
+        original_language = closest_supported_match(
+            original_language, supported_languages
+        )
+        target_language = closest_supported_match(target_language, supported_languages)
+        if original_language == target_language:
+            prediction = original_sentence
+        elif original_language is None or target_language is None:
+            prediction = None
+        else:
+            prediction = await translate_google(
+                original_sentence, original_language.bcp_47, target_language.bcp_47
+            )
+    else:
+        prediction = await complete(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
+                }
+            ],
+            temperature=0,
+            max_tokens=1024,
+        )
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],

@@ -69,9 +84,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
-
-
-
     return [
         {
             "model": model,

@@ -79,7 +91,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
-            "origin": "human", # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
         }
         for metric, score in (

@@ -101,36 +112,57 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    response = await complete(
-        model=model,
-        messages=[{"role": "user", "content": prompt}],
-        temperature=0,
-        max_tokens=30,
-    )
-
-    true = test_paragraph.topic.lower().strip()
-    others = [t for t in top_topics if t != true]
-    acc = (
-        int(
-            pred.startswith(true)
-            or (true in pred and not any(o in pred for o in others))
-        )
-        if pred
-        else 0
-    )
-
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=nr)
+    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
+        frac=1, random_state=42
+    )
+    test_paragraph = test_paragraphs.iloc[nr]
+
+    def format_prompt(text):
+        return f"{text}\n\nTopic: {'|'.join(top_topics)}?"
+
+    messages = []
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": format_prompt(example.text)},
+            {"role": "assistant", "content": example.topic},
+        ]
+    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
+    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
+    try:
+        pred = await complete(
+            model=model,
+            messages=[
+                *messages,
+                {
+                    "role": "user",
+                    "content": format_prompt(test_paragraph.text),
+                },
+            ],
+            temperature=0,
+            max_tokens=30,
+        )
+        true = test_paragraph.topic
+        others = [t for t in top_topics if t != true]
+        acc = (
+            int(
+                pred.startswith(true)
+                or (true in pred and not any(o in pred for o in others))
+            )
+            if pred
+            else 0
+        )
+    except Exception as e:
+        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
+            print(f"Max tokens exceeded for {model} in {bcp_47}")
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -138,7 +170,6 @@ Text:
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
-            "origin": "human", # FLORES+ is human-translated
             "sentence_nr": nr,
         }
     ]

@@ -203,41 +234,37 @@ def format_multiple_choice(item):
     C: {item["choices"][2]}
     D: {item["choices"][3]}

+    A|B|C|D?"""


 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    ds_name,
+    ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
         return []
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-    ]
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -245,41 +272,39 @@ Response format: <reasoning> #### <letter>
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin, # Add origin tag to results
             "sentence_nr": nr,
         }
     ]


 async def arc_and_evaluate(model, language_bcp_47, nr):
-    ds_name,
+    ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []

-    messages = [
-    answer = "NO_ANSWER"
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -287,7 +312,6 @@ Response format: <reasoning> #### <letter>
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]

@@ -308,48 +332,40 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
+    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text


 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-    ds_name,
+    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
-
+    task = shuffle_choices_and_labels(task)
+    answer = letters[task["labels"].index(1)]
+    messages = []
+    for example in examples:
+        example = shuffle_choices_and_labels(example)
+        messages += [
+            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+            {"role": "assistant", "content": letters[example["labels"].index(1)]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
     try:
-            {format_multiple_choice_truthfulqa(task)}""",
-            },
-        ]
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024, # Increased for reasoning
-        )
-        if response and "####" in response:
-            pred_answer = response.split("####")[-1].strip()
-            acc = int(pred_answer[:1].upper() == answer)
-        else:
-            acc = 0
-            pred_answer = "NO_ANSWER"
-
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == answer)
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

@@ -357,43 +373,34 @@ Response format: <reasoning> #### <letter>
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]


 async def mgsm_and_evaluate(model, language_bcp_47, nr):
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
-
-    Response format: <reasoning> #### <number>
-
-    ---
-
-    {question["question"]}""",
-        },
-    ]
     response = await complete(
         model=model,
-        messages=
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
         temperature=0,
         max_tokens=1024,
     )
-    if response and "####"
+    if response and len(response.split("####")) == 2:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
         accuracy = 0
-        number = "NO_ANSWER"
-
-
     return [
         {

@@ -402,7 +409,6 @@ Response format: <reasoning> #### <number>
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]

@@ -443,8 +449,10 @@ tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
+    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
+    # "asr": transcribe_and_evaluate,
 }
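The reworked mmlu, arc, and truthfulqa tasks share one pattern: few-shot examples as alternating user/assistant turns, a single-token completion, and exact-match scoring on the first returned character. A minimal sketch of that shared shape (build_messages and score_single_letter are illustrative names, not helpers from the repository):

def build_messages(examples, task, fmt):
    # few-shot prefix: each example becomes a user question plus the gold letter
    messages = []
    for example in examples:
        messages += [
            {"role": "user", "content": fmt(example)},
            {"role": "assistant", "content": example["answer"]},
        ]
    # the actual test item comes last, to be answered by the model
    return messages + [{"role": "user", "content": fmt(task)}]

def score_single_letter(response, answer):
    # with max_tokens=1 the reply is at most one letter; exact match on the first character
    return int(bool(response) and response[:1].strip() == answer)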
frontend/src/App.js
CHANGED
@@ -19,14 +19,9 @@ function App () {
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
-  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
   const [dialogVisible, setDialogVisible] = useState(false)
   const [aboutVisible, setAboutVisible] = useState(false)
   const [contributeVisible, setContributeVisible] = useState(false)
-
-  // Add state for carousel items
-  const [carouselItems, setCarouselItems] = useState([])
-  const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])

   useEffect(() => {
     fetch('/api/data', {

@@ -41,7 +36,6 @@ function App () {
       })
       .then(jsonData => {
         setData(jsonData)
-        setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
         setLoading(false)
       })
       .catch(err => {

@@ -50,27 +44,8 @@ function App () {
       })
   }, [selectedLanguages])

-  // Create carousel items when data is loaded
-  useEffect(() => {
-    if (data) {
-      // Add a small delay to ensure components are ready
-      const timer = setTimeout(() => {
-        setCarouselItems([
-          <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
-          <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
-          <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
-          <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
-          <CostPlot key="costplot-4" data={data} width={750} height={500} />
-        ]);
-      }, 100);
-
-      return () => clearTimeout(timer);
-    }
-  }, [data])
-
   const [windowWidth, setWindowWidth] = useState(window.innerWidth)
   const [windowHeight, setWindowHeight] = useState(window.innerHeight)
-
   useEffect(() => {
     const handleResize = () => {
       setWindowWidth(window.innerWidth)

@@ -80,44 +55,6 @@ function App () {
     return () => window.removeEventListener('resize', handleResize)
   }, [])

-  // Create full-screen carousel items when data or window size changes
-  useEffect(() => {
-    if (data) {
-      const timer = setTimeout(() => {
-        setFullScreenCarouselItems([
-          <WorldMap
-            key="fs-worldmap-0"
-            data={data.countries}
-            allLanguages={data.language_table}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <LanguagePlot
-            key="fs-langplot-1"
-            data={data}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <SpeakerPlot
-            key="fs-speakerplot-2"
-            data={data}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <HistoryPlot
-            key="fs-histplot-3"
-            data={data}
-            width={windowWidth * 0.7}
-            height={windowHeight * 0.6}
-          />,
-          <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
-        ]);
-      }, 100);
-
-      return () => clearTimeout(timer);
-    }
-  }, [data, windowWidth, windowHeight])
-
   return (
     <PrimeReactProvider>
       <div

@@ -132,50 +69,35 @@ function App () {
         style={{
           backgroundColor: '#fff3cd',
           color: '#856404',
-          padding: '
+          padding: '0.75rem 1.25rem',
           marginBottom: '1rem',
           border: '1px solid #ffeeba',
           borderRadius: '0.25rem',
-          textAlign: 'center'
-          lineHeight: '1.5',
-          position: 'relative'
+          textAlign: 'center'
         }}
       >
         <strong>Work in Progress:</strong> This dashboard is currently under
-        active development. Evaluation results are not yet final.
-      </div>
-      <div
-        style={{
-          display: 'flex',
-          justifyContent: 'flex-end',
-          padding: '0 1.5rem',
-          marginBottom: '1rem'
-        }}
-      >
+        active development. Evaluation results are not yet final.
         <a
           href='https://github.com/datenlabor-bmz/ai-language-monitor'
           target='_blank'
           rel='noopener noreferrer'
           style={{
             textDecoration: 'none',
-            color: '#
-            display: 'flex',
-            alignItems: 'center',
-            gap: '0.5rem',
-            transition: 'all 0.2s ease',
-            ':hover': {
-              backgroundColor: '#e9ecef',
-              color: '#495057'
-            }
+            color: '#856404',
+            float: 'right',
+            fontSize: '1.2rem',
+            fontWeight: 'bold',
+            padding: '0 0.5rem',
+            borderRadius: '3px',
+            backgroundColor: 'rgba(255,255,255,0.3)'
           }}
         >
-          <i
+          <i
+            className='pi pi-github'
+            title='View on GitHub'
+            style={{ marginRight: '0.3rem' }}
+          />
           GitHub
         </a>
       </div>

@@ -227,88 +149,39 @@ function App () {
       <div
         style={{
           display: 'flex',
-          gap: '
-          marginBottom: '
+          gap: '1rem',
+          marginBottom: '1.5rem',
           flexWrap: 'wrap',
           justifyContent: 'center'
         }}
       >
-        <
+        <Button
+          label='📚 About this tool'
+          className='p-button-text'
           onClick={() => setAboutVisible(true)}
           style={{
-            fontSize: '0.95rem',
-            fontWeight: '500',
-            cursor: 'pointer',
-            display: 'flex',
-            alignItems: 'center',
-            gap: '0.5rem',
-            boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
-            transition: 'all 0.3s ease',
-            ':hover': {
-              transform: 'translateY(-2px)',
-              boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
-            }
-          }}
-          onMouseEnter={(e) => {
-            e.target.style.transform = 'translateY(-2px)';
-            e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
+            color: '#666',
+            border: '1px solid #ddd',
+            padding: '0.5rem 1rem',
+            borderRadius: '4px',
+            fontSize: '0.9rem'
           }}
-            e.target.style.transform = 'translateY(0)';
-            e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
-          }}
-        >
-          <span style={{ fontSize: '1.1rem' }}>📚</span>
-          About this tool
-        </button>
+        />

-        <
+        <Button
+          label='🚀 Add your model (soon)'
+          className='p-button-text'
           onClick={() => setContributeVisible(true)}
+          tooltip='This feature is on our roadmap and will be available soon.'
+          tooltipOptions={{ position: 'bottom' }}
           style={{
-            fontSize: '0.95rem',
-            fontWeight: '500',
-            cursor: 'pointer',
-            display: 'flex',
-            alignItems: 'center',
-            gap: '0.5rem',
-            boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
-            transition: 'all 0.3s ease',
-            position: 'relative',
-            overflow: 'hidden'
+            color: '#666',
+            border: '1px solid #ddd',
+            padding: '0.5rem 1rem',
+            borderRadius: '4px',
+            fontSize: '0.9rem'
          }}
-            e.target.style.transform = 'translateY(-2px)';
-            e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
-          }}
-          onMouseLeave={(e) => {
-            e.target.style.transform = 'translateY(0)';
-            e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
-          }}
-        >
-          <span style={{ fontSize: '1.1rem' }}>🚀</span>
-          Add your model
-          <span style={{
-            fontSize: '0.75rem',
-            backgroundColor: 'rgba(107, 70, 193, 0.15)',
-            padding: '0.2rem 0.5rem',
-            borderRadius: '6px',
-            marginLeft: '0.5rem',
-            fontWeight: '600'
-          }}>
-            soon
-          </span>
-        </button>
+        />
       </div>

       {data && (

@@ -347,7 +220,6 @@ function App () {
             data={data.model_table}
             selectedLanguages={selectedLanguages}
             allLanguages={data.language_table || []}
-            machineTranslatedMetrics={machineTranslatedMetrics}
           />
           <LanguageTable
             data={data.language_table}

@@ -376,18 +248,20 @@ function App () {
               color: '#666'
             }}
           />
+          <Carousel
+            value={[
+              <WorldMap data={data.countries} />,
+              <LanguagePlot data={data} />,
+              <SpeakerPlot data={data} />,
+              <HistoryPlot data={data} />,
+              <CostPlot data={data} />
+            ]}
+            numScroll={1}
+            numVisible={1}
+            itemTemplate={item => item}
+            circular
+            style={{ width: '100%', minHeight: '650px' }}
+          />
         </div>
       </>
     )}

@@ -535,16 +409,36 @@ function App () {
         modal
         header={null}
       >
-        {
+        {data && (
         <div style={{ width: '100%', height: '100%' }}>
           <Carousel
-
+            value={[
+              <WorldMap
+                data={data.countries}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <LanguagePlot
+                data={data}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <SpeakerPlot
+                data={data}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <HistoryPlot
+                data={data}
+                width={windowWidth * 0.7}
+                height={windowHeight * 0.6}
+              />,
+              <CostPlot data={data} />
+            ]}
             numScroll={1}
             numVisible={1}
             itemTemplate={item => item}
-            circular
-            activeIndex={0}
+            circular
             style={{ width: '100%', height: 'calc(90vh - 120px)' }}
           />
         </div>

@@ -555,4 +449,4 @@ function App () {
   )
 }

-export default App
+export default App
frontend/src/components/HistoryPlot.js
CHANGED
|
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 50 |
...models.filter(d => d.newRecord),
|
| 51 |
{
|
| 52 |
creation_date: new Date(),
|
| 53 |
-
maxAverage: models[models.length - 1]
|
| 54 |
}
|
| 55 |
],
|
| 56 |
{
|
| 57 |
x: d => d.creation_date,
|
| 58 |
-
y: d => d.maxAverage
|
| 59 |
curve: 'step-after',
|
| 60 |
strokeOpacity: 0.3
|
| 61 |
}
|
|
|
|
| 50 |
...models.filter(d => d.newRecord),
|
| 51 |
{
|
| 52 |
creation_date: new Date(),
|
| 53 |
+
maxAverage: models[models.length - 1].maxAverage
|
| 54 |
}
|
| 55 |
],
|
| 56 |
{
|
| 57 |
x: d => d.creation_date,
|
| 58 |
+
y: d => d.maxAverage,
|
| 59 |
curve: 'step-after',
|
| 60 |
strokeOpacity: 0.3
|
| 61 |
}
|
frontend/src/components/LanguageTable.js
CHANGED
|
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
|
|
| 172 |
filterElement={familyRowFilterTemplate}
|
| 173 |
style={{ minWidth: '10rem' }}
|
| 174 |
/>
|
| 175 |
-
{ScoreColumns
|
| 176 |
</DataTable>
|
| 177 |
)
|
| 178 |
}
|
|
|
|
| 172 |
filterElement={familyRowFilterTemplate}
|
| 173 |
style={{ minWidth: '10rem' }}
|
| 174 |
/>
|
| 175 |
+
{ScoreColumns}
|
| 176 |
</DataTable>
|
| 177 |
)
|
| 178 |
}
|
frontend/src/components/ModelTable.js
CHANGED
|
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
|
|
| 6 |
import Medal from './Medal'
|
| 7 |
import { Slider } from 'primereact/slider'
|
| 8 |
import ScoreColumns from './ScoreColumns'
|
| 9 |
-
const ModelTable = ({ data, selectedLanguages = [], allLanguages = []
|
| 10 |
const [filters, setFilters] = useState({
|
| 11 |
type: { value: null, matchMode: FilterMatchMode.IN },
|
| 12 |
size: { value: null, matchMode: FilterMatchMode.BETWEEN },
|
|
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 50 |
}
|
| 51 |
|
| 52 |
const SliderWithLabel = ({ value, onChange, min, max }) => {
|
| 53 |
-
const p = 10
|
| 54 |
-
const start = value === null
|
| 55 |
-
const stop = value === null
|
| 56 |
-
const [_value, _setValue] = useState([start, stop])
|
| 57 |
useEffect(() => {
|
| 58 |
const timer = setTimeout(() => {
|
| 59 |
onChange({
|
|
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 61 |
// set to "no filter" when (almost) the whole range is selected
|
| 62 |
_value[0] <= min + 0.1 && _value[1] >= max - 0.1
|
| 63 |
? null
|
| 64 |
-
: [p ** _value[0], p ** _value[1]]
|
| 65 |
-
})
|
| 66 |
-
}, 1000)
|
| 67 |
-
return () => clearTimeout(timer)
|
| 68 |
-
}, [_value, onChange, min, max])
|
| 69 |
return (
|
| 70 |
<div style={{ minWidth: '20rem' }}>
|
| 71 |
<div>{formatSize(p ** _value[0])}</div>
|
|
@@ -147,35 +147,21 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 147 |
}
|
| 148 |
|
| 149 |
const costBodyTemplate = rowData => {
|
| 150 |
-
return (
|
| 151 |
-
<div style={{ textAlign: 'center' }}>
|
| 152 |
-
{rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
|
| 153 |
-
</div>
|
| 154 |
-
)
|
| 155 |
}
|
| 156 |
|
| 157 |
const getHeaderText = () => {
|
| 158 |
-
// Count languages that have
|
| 159 |
-
const evaluatedLanguagesCount = allLanguages.filter(lang =>
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
'translation_from_bleu',
|
| 163 |
-
'translation_to_bleu',
|
| 164 |
-
'classification_accuracy',
|
| 165 |
-
'mmlu_accuracy',
|
| 166 |
-
'arc_accuracy',
|
| 167 |
-
'truthfulqa_accuracy',
|
| 168 |
-
'mgsm_accuracy'
|
| 169 |
-
].some(metric => lang[metric] !== null && lang[metric] !== undefined)
|
| 170 |
-
return hasAnyScores
|
| 171 |
-
}).length
|
| 172 |
|
| 173 |
if (selectedLanguages.length === 0) {
|
| 174 |
return (
|
| 175 |
<span>
|
| 176 |
<span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
|
| 177 |
<span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
|
| 178 |
-
|
| 179 |
</span>
|
| 180 |
</span>
|
| 181 |
)
|
|
@@ -259,7 +245,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 259 |
body={costBodyTemplate}
|
| 260 |
style={{ minWidth: '5rem' }}
|
| 261 |
/>
|
| 262 |
-
{ScoreColumns
|
| 263 |
</DataTable>
|
| 264 |
)
|
| 265 |
}
|
|
|
|
| 6 |
import Medal from './Medal'
|
| 7 |
import { Slider } from 'primereact/slider'
|
| 8 |
import ScoreColumns from './ScoreColumns'
|
| 9 |
+
const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
|
| 10 |
const [filters, setFilters] = useState({
|
| 11 |
type: { value: null, matchMode: FilterMatchMode.IN },
|
| 12 |
size: { value: null, matchMode: FilterMatchMode.BETWEEN },
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
const SliderWithLabel = ({ value, onChange, min, max }) => {
|
| 53 |
+
const p = 10
|
| 54 |
+
const start = value === null ? min : Math.log(value[0]) / Math.log(p)
|
| 55 |
+
const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
|
| 56 |
+
const [_value, _setValue] = useState([start, stop])
|
| 57 |
useEffect(() => {
|
| 58 |
const timer = setTimeout(() => {
|
| 59 |
onChange({
|
|
|
|
| 61 |
// set to "no filter" when (almost) the whole range is selected
|
| 62 |
_value[0] <= min + 0.1 && _value[1] >= max - 0.1
|
| 63 |
? null
|
| 64 |
+
: [p ** _value[0], p ** _value[1]]
|
| 65 |
+
})
|
| 66 |
+
}, 1000)
|
| 67 |
+
return () => clearTimeout(timer)
|
| 68 |
+
}, [_value, onChange, min, max])
|
| 69 |
return (
|
| 70 |
<div style={{ minWidth: '20rem' }}>
|
| 71 |
<div>{formatSize(p ** _value[0])}</div>
|
|
|
|
| 147 |
}
|
| 148 |
|
| 149 |
const costBodyTemplate = rowData => {
|
| 150 |
+
return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
|
|
|
|
|
|
| 151 |
}
|
| 152 |
|
| 153 |
const getHeaderText = () => {
|
| 154 |
+
// Count languages that have evaluation data (average score available)
|
| 155 |
+
const evaluatedLanguagesCount = allLanguages.filter(lang =>
|
| 156 |
+
lang.average !== null && lang.average !== undefined
|
| 157 |
+
).length
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
if (selectedLanguages.length === 0) {
|
| 160 |
return (
|
| 161 |
<span>
|
| 162 |
<span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
|
| 163 |
<span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
|
| 164 |
+
Average performance across {evaluatedLanguagesCount} evaluated languages
|
| 165 |
</span>
|
| 166 |
</span>
|
| 167 |
)
|
|
|
|
| 245 |
body={costBodyTemplate}
|
| 246 |
style={{ minWidth: '5rem' }}
|
| 247 |
/>
|
| 248 |
+
{ScoreColumns}
|
| 249 |
</DataTable>
|
| 250 |
)
|
| 251 |
}
|
frontend/src/components/ScoreColumns.js
CHANGED
|
@@ -2,28 +2,21 @@ import { Column } from 'primereact/column'
|
|
| 2 |
import ScoreField from './ScoreField'
|
| 3 |
|
| 4 |
const scoreBodyTemplate = (field, options = {}) => {
|
| 5 |
-
const { minScore = 0, maxScore = 1
|
| 6 |
|
| 7 |
return rowData => {
|
| 8 |
const score = rowData[field]
|
| 9 |
-
|
| 10 |
-
// otherwise fall back to global list
|
| 11 |
-
const rowFlagKey = `${field}_is_machine`
|
| 12 |
-
const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
|
| 13 |
-
const isMachineTranslated = hasRowFlag
|
| 14 |
-
? !!rowData[rowFlagKey]
|
| 15 |
-
: machineTranslatedMetrics.includes(field)
|
| 16 |
-
return ScoreField(score, minScore, maxScore, isMachineTranslated)
|
| 17 |
}
|
| 18 |
}
|
| 19 |
|
| 20 |
-
const ScoreColumns =
|
| 21 |
<Column
|
| 22 |
field='average'
|
| 23 |
header='Proficiency'
|
| 24 |
headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
|
| 25 |
sortable
|
| 26 |
-
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5
|
| 27 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 28 |
/>,
|
| 29 |
<Column
|
|
@@ -33,8 +26,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 33 |
sortable
|
| 34 |
body={scoreBodyTemplate('translation_from_bleu', {
|
| 35 |
minScore: 0,
|
| 36 |
-
maxScore: 0.5
|
| 37 |
-
machineTranslatedMetrics
|
| 38 |
})}
|
| 39 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 40 |
/>,
|
|
@@ -45,8 +37,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 45 |
sortable
|
| 46 |
body={scoreBodyTemplate('translation_to_bleu', {
|
| 47 |
minScore: 0,
|
| 48 |
-
maxScore: 0.5
|
| 49 |
-
machineTranslatedMetrics
|
| 50 |
})}
|
| 51 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 52 |
/>,
|
|
@@ -57,8 +48,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 57 |
sortable
|
| 58 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 59 |
minScore: 0,
|
| 60 |
-
maxScore: 0.5
|
| 61 |
-
machineTranslatedMetrics
|
| 62 |
})}
|
| 63 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 64 |
/>,
|
|
@@ -79,8 +69,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 79 |
sortable
|
| 80 |
body={scoreBodyTemplate('mmlu_accuracy', {
|
| 81 |
minScore: 0,
|
| 82 |
-
maxScore: 1
|
| 83 |
-
machineTranslatedMetrics
|
| 84 |
})}
|
| 85 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 86 |
/>,
|
|
@@ -91,8 +80,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 91 |
sortable
|
| 92 |
body={scoreBodyTemplate('arc_accuracy', {
|
| 93 |
minScore: 0,
|
| 94 |
-
maxScore: 1
|
| 95 |
-
machineTranslatedMetrics
|
| 96 |
})}
|
| 97 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 98 |
/>,
|
|
@@ -103,8 +91,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 103 |
sortable
|
| 104 |
body={scoreBodyTemplate('mgsm_accuracy', {
|
| 105 |
minScore: 0,
|
| 106 |
-
maxScore: 1
|
| 107 |
-
machineTranslatedMetrics
|
| 108 |
})}
|
| 109 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 110 |
/>,
|
|
|
|
| 2 |
import ScoreField from './ScoreField'
|
| 3 |
|
| 4 |
const scoreBodyTemplate = (field, options = {}) => {
|
| 5 |
+
const { minScore = 0, maxScore = 1 } = options
|
| 6 |
|
| 7 |
return rowData => {
|
| 8 |
const score = rowData[field]
|
| 9 |
+
return ScoreField(score, minScore, maxScore)
|
|
|
|
|
|
| 10 |
}
|
| 11 |
}
|
| 12 |
|
| 13 |
+
const ScoreColumns = [
|
| 14 |
<Column
|
| 15 |
field='average'
|
| 16 |
header='Proficiency'
|
| 17 |
headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
|
| 18 |
sortable
|
| 19 |
+
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
| 20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 21 |
/>,
|
| 22 |
<Column
|
|
|
|
| 26 |
sortable
|
| 27 |
body={scoreBodyTemplate('translation_from_bleu', {
|
| 28 |
minScore: 0,
|
| 29 |
+
maxScore: 0.5
|
|
|
|
| 30 |
})}
|
| 31 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 32 |
/>,
|
|
|
|
| 37 |
sortable
|
| 38 |
body={scoreBodyTemplate('translation_to_bleu', {
|
| 39 |
minScore: 0,
|
| 40 |
+
maxScore: 0.5
|
|
|
|
| 41 |
})}
|
| 42 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 43 |
/>,
|
|
|
|
| 48 |
sortable
|
| 49 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 50 |
minScore: 0,
|
| 51 |
+
maxScore: 0.5
|
|
|
|
| 52 |
})}
|
| 53 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 54 |
/>,
|
|
|
|
| 69 |
sortable
|
| 70 |
body={scoreBodyTemplate('mmlu_accuracy', {
|
| 71 |
minScore: 0,
|
| 72 |
+
maxScore: 1
|
|
|
|
| 73 |
})}
|
| 74 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 75 |
/>,
|
|
|
|
| 80 |
sortable
|
| 81 |
body={scoreBodyTemplate('arc_accuracy', {
|
| 82 |
minScore: 0,
|
| 83 |
+
maxScore: 1
|
|
|
|
| 84 |
})}
|
| 85 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 86 |
/>,
|
|
|
|
| 91 |
sortable
|
| 92 |
body={scoreBodyTemplate('mgsm_accuracy', {
|
| 93 |
minScore: 0,
|
| 94 |
+
maxScore: 1
|
|
|
|
| 95 |
})}
|
| 96 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 97 |
/>,
|
frontend/src/components/ScoreField.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
const ScoreField = (score, minScore, maxScore
|
| 2 |
let percentage = 100
|
| 3 |
let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
|
| 4 |
if (score !== null) {
|
|
@@ -50,7 +50,6 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
|
|
| 50 |
}}
|
| 51 |
>
|
| 52 |
{score !== null ? (score * 100).toFixed(1)+"%" : '–'}
|
| 53 |
-
{isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
|
| 54 |
</span>
|
| 55 |
</div>
|
| 56 |
)
|
|
|
|
| 1 |
+
const ScoreField = (score, minScore, maxScore) => {
|
| 2 |
let percentage = 100
|
| 3 |
let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
|
| 4 |
if (score !== null) {
|
|
|
|
| 50 |
}}
|
| 51 |
>
|
| 52 |
{score !== null ? (score * 100).toFixed(1)+"%" : '–'}
|
|
|
|
| 53 |
</span>
|
| 54 |
</div>
|
| 55 |
)
|
frontend/src/components/SpeakerPlot.js
CHANGED
|
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
|
|
| 73 |
textStrokeOpacity: 0,
|
| 74 |
textFillOpacity: 0
|
| 75 |
}),
|
| 76 |
-
|
| 77 |
x: 40,
|
| 78 |
y: languages[39].cumSpeakers / 1e6
|
| 79 |
-
})
|
| 80 |
]
|
| 81 |
})
|
| 82 |
containerRef.current.append(plot)
|
|
|
|
| 73 |
textStrokeOpacity: 0,
|
| 74 |
textFillOpacity: 0
|
| 75 |
}),
|
| 76 |
+
Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
|
| 77 |
x: 40,
|
| 78 |
y: languages[39].cumSpeakers / 1e6
|
| 79 |
+
})
|
| 80 |
]
|
| 81 |
})
|
| 82 |
containerRef.current.append(plot)
|
frontend/src/components/WorldMap.js
CHANGED
|
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
|
|
| 26 |
a =>
|
| 27 |
`${smoothProgressBar(a.population / pop)} ${
|
| 28 |
a.name
|
| 29 |
-
} – ${a.score
|
| 30 |
)
|
| 31 |
.join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
|
| 32 |
-
return `${d.properties.ADMIN} – ${cData?.score
|
| 33 |
}
|
| 34 |
|
| 35 |
-
const WorldMap = ({ data, width = 750, height = 500
|
| 36 |
const containerRef = useRef()
|
| 37 |
const [mapData, setMapData] = useState()
|
| 38 |
|
|
@@ -48,22 +48,8 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
|
|
| 48 |
acc[country.iso2] = country
|
| 49 |
return acc
|
| 50 |
}, {})
|
| 51 |
-
// Count languages that have any evaluation data
|
| 52 |
-
const evaluatedLanguagesCount = allLanguages.filter(lang => {
|
| 53 |
-
const hasAnyScores = [
|
| 54 |
-
'translation_from_bleu',
|
| 55 |
-
'translation_to_bleu',
|
| 56 |
-
'classification_accuracy',
|
| 57 |
-
'mmlu_accuracy',
|
| 58 |
-
'arc_accuracy',
|
| 59 |
-
'truthfulqa_accuracy',
|
| 60 |
-
'mgsm_accuracy'
|
| 61 |
-
].some(metric => lang[metric] !== null && lang[metric] !== undefined)
|
| 62 |
-
return hasAnyScores
|
| 63 |
-
}).length
|
| 64 |
-
|
| 65 |
const plot = Plot.plot({
|
| 66 |
-
subtitle:
|
| 67 |
width: width,
|
| 68 |
height: height,
|
| 69 |
projection: 'equal-earth',
|
|
@@ -75,12 +61,11 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
|
|
| 75 |
})
|
| 76 |
],
|
| 77 |
color: {
|
| 78 |
-
scheme: '
|
| 79 |
-
unknown: '
|
| 80 |
label: 'Score',
|
| 81 |
legend: true,
|
| 82 |
-
domain: [0, 1]
|
| 83 |
-
pivot: 0.5
|
| 84 |
},
|
| 85 |
style: {
|
| 86 |
fontFamily: 'monospace'
|
|
|
|
| 26 |
a =>
|
| 27 |
`${smoothProgressBar(a.population / pop)} ${
|
| 28 |
a.name
|
| 29 |
+
} – ${a.score.toFixed(2)}`
|
| 30 |
)
|
| 31 |
.join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
|
| 32 |
+
return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
|
| 33 |
}
|
| 34 |
|
| 35 |
+
const WorldMap = ({ data, width = 750, height = 500 }) => {
|
| 36 |
const containerRef = useRef()
|
| 37 |
const [mapData, setMapData] = useState()
|
| 38 |
|
|
|
|
| 48 |
acc[country.iso2] = country
|
| 49 |
return acc
|
| 50 |
}, {})
|
|
|
|
|
|
|
|
| 51 |
const plot = Plot.plot({
|
| 52 |
+
subtitle: 'Language Proficiency Score by Country',
|
| 53 |
width: width,
|
| 54 |
height: height,
|
| 55 |
projection: 'equal-earth',
|
|
|
|
| 61 |
})
|
| 62 |
],
|
| 63 |
color: {
|
| 64 |
+
scheme: 'Greens',
|
| 65 |
+
unknown: 'gray',
|
| 66 |
label: 'Score',
|
| 67 |
legend: true,
|
| 68 |
+
domain: [0, 1]
|
|
|
|
| 69 |
},
|
| 70 |
style: {
|
| 71 |
fontFamily: 'monospace'
|
languages.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
-
"commonvoice_hours":
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
@@ -32,7 +32,7 @@
|
|
| 32 |
"flores_path":"hin_Deva",
|
| 33 |
"fleurs_tag":"hi_in",
|
| 34 |
"commonvoice_hours":16.0,
|
| 35 |
-
"commonvoice_locale":"hi",
|
| 36 |
"in_benchmark":true
|
| 37 |
},
|
| 38 |
{
|
|
@@ -43,7 +43,7 @@
|
|
| 43 |
"family":"Indo-European",
|
| 44 |
"flores_path":"spa_Latn",
|
| 45 |
"fleurs_tag":"es_419",
|
| 46 |
-
"commonvoice_hours":
|
| 47 |
"commonvoice_locale":"es",
|
| 48 |
"in_benchmark":true
|
| 49 |
},
|
|
@@ -79,7 +79,7 @@
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
-
"commonvoice_hours":
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
@@ -103,7 +103,7 @@
|
|
| 103 |
"family":"Indo-European",
|
| 104 |
"flores_path":"por_Latn",
|
| 105 |
"fleurs_tag":"pt_br",
|
| 106 |
-
"commonvoice_hours":
|
| 107 |
"commonvoice_locale":"pt",
|
| 108 |
"in_benchmark":true
|
| 109 |
},
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"family":"Indo-European",
|
| 116 |
"flores_path":"pan_Guru",
|
| 117 |
"fleurs_tag":"pa_in",
|
| 118 |
-
"commonvoice_hours":2.
|
| 119 |
"commonvoice_locale":"pa-IN",
|
| 120 |
"in_benchmark":true
|
| 121 |
},
|
|
@@ -127,7 +127,7 @@
|
|
| 127 |
"family":"Indo-European",
|
| 128 |
"flores_path":"rus_Cyrl",
|
| 129 |
"fleurs_tag":"ru_ru",
|
| 130 |
-
"commonvoice_hours":
|
| 131 |
"commonvoice_locale":"ru",
|
| 132 |
"in_benchmark":true
|
| 133 |
},
|
|
@@ -139,7 +139,7 @@
|
|
| 139 |
"family":"Atlantic-Congo",
|
| 140 |
"flores_path":"swh_Latn",
|
| 141 |
"fleurs_tag":"sw_ke",
|
| 142 |
-
"commonvoice_hours":
|
| 143 |
"commonvoice_locale":"sw",
|
| 144 |
"in_benchmark":true
|
| 145 |
},
|
|
@@ -151,7 +151,7 @@
|
|
| 151 |
"family":"Austronesian",
|
| 152 |
"flores_path":"ind_Latn",
|
| 153 |
"fleurs_tag":"id_id",
|
| 154 |
-
"commonvoice_hours":
|
| 155 |
"commonvoice_locale":"id",
|
| 156 |
"in_benchmark":true
|
| 157 |
},
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
-
"commonvoice_hours":
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169 |
},
|
|
@@ -379,7 +379,7 @@
|
|
| 379 |
"family":"Indo-European",
|
| 380 |
"flores_path":null,
|
| 381 |
"fleurs_tag":"ps_af",
|
| 382 |
-
"commonvoice_hours":
|
| 383 |
"commonvoice_locale":"ps",
|
| 384 |
"in_benchmark":false
|
| 385 |
},
|
|
@@ -439,7 +439,7 @@
|
|
| 439 |
"family":"Indo-European",
|
| 440 |
"flores_path":"pol_Latn",
|
| 441 |
"fleurs_tag":"pl_pl",
|
| 442 |
-
"commonvoice_hours":
|
| 443 |
"commonvoice_locale":"pl",
|
| 444 |
"in_benchmark":true
|
| 445 |
},
|
|
@@ -619,7 +619,7 @@
|
|
| 619 |
"family":"Indo-European",
|
| 620 |
"flores_path":"nld_Latn",
|
| 621 |
"fleurs_tag":"nl_nl",
|
| 622 |
-
"commonvoice_hours":
|
| 623 |
"commonvoice_locale":"nl",
|
| 624 |
"in_benchmark":true
|
| 625 |
},
|
|
@@ -655,7 +655,7 @@
|
|
| 655 |
"family":"Atlantic-Congo",
|
| 656 |
"flores_path":"yor_Latn",
|
| 657 |
"fleurs_tag":"yo_ng",
|
| 658 |
-
"commonvoice_hours":6.
|
| 659 |
"commonvoice_locale":"yo",
|
| 660 |
"in_benchmark":true
|
| 661 |
},
|
|
@@ -979,7 +979,7 @@
|
|
| 979 |
"family":"Turkic",
|
| 980 |
"flores_path":"kaz_Cyrl",
|
| 981 |
"fleurs_tag":"kk_kz",
|
| 982 |
-
"commonvoice_hours":2.
|
| 983 |
"commonvoice_locale":"kk",
|
| 984 |
"in_benchmark":true
|
| 985 |
},
|
|
@@ -1027,7 +1027,7 @@
|
|
| 1027 |
"family":"Uralic",
|
| 1028 |
"flores_path":"hun_Latn",
|
| 1029 |
"fleurs_tag":"hu_hu",
|
| 1030 |
-
"commonvoice_hours":
|
| 1031 |
"commonvoice_locale":"hu",
|
| 1032 |
"in_benchmark":true
|
| 1033 |
},
|
|
@@ -1099,7 +1099,7 @@
|
|
| 1099 |
"family":"Indo-European",
|
| 1100 |
"flores_path":"ckb_Arab",
|
| 1101 |
"fleurs_tag":"ckb_iq",
|
| 1102 |
-
"commonvoice_hours":
|
| 1103 |
"commonvoice_locale":"ckb",
|
| 1104 |
"in_benchmark":true
|
| 1105 |
},
|
|
@@ -1183,7 +1183,7 @@
|
|
| 1183 |
"family":"Indo-European",
|
| 1184 |
"flores_path":"bel_Cyrl",
|
| 1185 |
"fleurs_tag":"be_by",
|
| 1186 |
-
"commonvoice_hours":
|
| 1187 |
"commonvoice_locale":"be",
|
| 1188 |
"in_benchmark":true
|
| 1189 |
},
|
|
@@ -1207,7 +1207,7 @@
|
|
| 1207 |
"family":"Indo-European",
|
| 1208 |
"flores_path":"tgk_Cyrl",
|
| 1209 |
"fleurs_tag":"tg_tj",
|
| 1210 |
-
"commonvoice_hours":0.
|
| 1211 |
"commonvoice_locale":"tg",
|
| 1212 |
"in_benchmark":true
|
| 1213 |
},
|
|
@@ -1243,7 +1243,7 @@
|
|
| 1243 |
"family":"Indo-European",
|
| 1244 |
"flores_path":"afr_Latn",
|
| 1245 |
"fleurs_tag":"af_za",
|
| 1246 |
-
"commonvoice_hours":0.
|
| 1247 |
"commonvoice_locale":"af",
|
| 1248 |
"in_benchmark":true
|
| 1249 |
},
|
|
@@ -1291,7 +1291,7 @@
|
|
| 1291 |
"family":"Indo-European",
|
| 1292 |
"flores_path":"cat_Latn",
|
| 1293 |
"fleurs_tag":"ca_es",
|
| 1294 |
-
"commonvoice_hours":
|
| 1295 |
"commonvoice_locale":"ca",
|
| 1296 |
"in_benchmark":true
|
| 1297 |
},
|
|
@@ -1303,7 +1303,7 @@
|
|
| 1303 |
"family":"Afro-Asiatic",
|
| 1304 |
"flores_path":"heb_Hebr",
|
| 1305 |
"fleurs_tag":"he_il",
|
| 1306 |
-
"commonvoice_hours":
|
| 1307 |
"commonvoice_locale":"he",
|
| 1308 |
"in_benchmark":true
|
| 1309 |
},
|
|
@@ -1375,7 +1375,7 @@
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
-
"commonvoice_hours":
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
@@ -1519,7 +1519,7 @@
|
|
| 1519 |
"family":"Indo-European",
|
| 1520 |
"flores_path":"kmr_Latn",
|
| 1521 |
"fleurs_tag":null,
|
| 1522 |
-
"commonvoice_hours":
|
| 1523 |
"commonvoice_locale":"kmr",
|
| 1524 |
"in_benchmark":true
|
| 1525 |
},
|
|
@@ -1555,7 +1555,7 @@
|
|
| 1555 |
"family":"Indo-European",
|
| 1556 |
"flores_path":"slk_Latn",
|
| 1557 |
"fleurs_tag":"sk_sk",
|
| 1558 |
-
"commonvoice_hours":
|
| 1559 |
"commonvoice_locale":"sk",
|
| 1560 |
"in_benchmark":true
|
| 1561 |
},
|
|
@@ -1675,7 +1675,7 @@
|
|
| 1675 |
"family":"Tupian",
|
| 1676 |
"flores_path":"gug_Latn",
|
| 1677 |
"fleurs_tag":null,
|
| 1678 |
-
"commonvoice_hours":4.
|
| 1679 |
"commonvoice_locale":"gn",
|
| 1680 |
"in_benchmark":true
|
| 1681 |
},
|
|
@@ -1747,7 +1747,7 @@
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
-
"commonvoice_hours":
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
@@ -2155,7 +2155,7 @@
|
|
| 2155 |
"family":"Kartvelian",
|
| 2156 |
"flores_path":"kat_Geor",
|
| 2157 |
"fleurs_tag":"ka_ge",
|
| 2158 |
-
"commonvoice_hours":
|
| 2159 |
"commonvoice_locale":"ka",
|
| 2160 |
"in_benchmark":true
|
| 2161 |
},
|
|
@@ -2167,7 +2167,7 @@
|
|
| 2167 |
"family":"Indo-European",
|
| 2168 |
"flores_path":"glg_Latn",
|
| 2169 |
"fleurs_tag":"gl_es",
|
| 2170 |
-
"commonvoice_hours":
|
| 2171 |
"commonvoice_locale":"gl",
|
| 2172 |
"in_benchmark":true
|
| 2173 |
},
|
|
@@ -2323,7 +2323,7 @@
|
|
| 2323 |
"family":"Dravidian",
|
| 2324 |
"flores_path":null,
|
| 2325 |
"fleurs_tag":null,
|
| 2326 |
-
"commonvoice_hours":
|
| 2327 |
"commonvoice_locale":"brh",
|
| 2328 |
"in_benchmark":false
|
| 2329 |
},
|
|
@@ -2623,7 +2623,7 @@
|
|
| 2623 |
"family":"Indo-European",
|
| 2624 |
"flores_path":null,
|
| 2625 |
"fleurs_tag":null,
|
| 2626 |
-
"commonvoice_hours":
|
| 2627 |
"commonvoice_locale":"haz",
|
| 2628 |
"in_benchmark":false
|
| 2629 |
},
|
|
@@ -2695,7 +2695,7 @@
|
|
| 2695 |
"family":"Indo-European",
|
| 2696 |
"flores_path":"oci_Latn",
|
| 2697 |
"fleurs_tag":"oc_fr",
|
| 2698 |
-
"commonvoice_hours":1.
|
| 2699 |
"commonvoice_locale":"oc",
|
| 2700 |
"in_benchmark":true
|
| 2701 |
},
|
|
@@ -3175,8 +3175,8 @@
|
|
| 3175 |
"family":"Atlantic-Congo",
|
| 3176 |
"flores_path":null,
|
| 3177 |
"fleurs_tag":null,
|
| 3178 |
-
"commonvoice_hours":
|
| 3179 |
-
"commonvoice_locale":
|
| 3180 |
"in_benchmark":false
|
| 3181 |
},
|
| 3182 |
{
|
|
@@ -3319,8 +3319,8 @@
|
|
| 3319 |
"family":"Indo-European",
|
| 3320 |
"flores_path":null,
|
| 3321 |
"fleurs_tag":null,
|
| 3322 |
-
"commonvoice_hours":
|
| 3323 |
-
"commonvoice_locale":
|
| 3324 |
"in_benchmark":false
|
| 3325 |
},
|
| 3326 |
{
|
|
@@ -3331,7 +3331,7 @@
|
|
| 3331 |
"family":"Indo-European",
|
| 3332 |
"flores_path":"gle_Latn",
|
| 3333 |
"fleurs_tag":"ga_ie",
|
| 3334 |
-
"commonvoice_hours":
|
| 3335 |
"commonvoice_locale":"ga-IE",
|
| 3336 |
"in_benchmark":true
|
| 3337 |
},
|
|
@@ -3487,7 +3487,7 @@
|
|
| 3487 |
"family":"Indo-European",
|
| 3488 |
"flores_path":"lvs_Latn",
|
| 3489 |
"fleurs_tag":"lv_lv",
|
| 3490 |
-
"commonvoice_hours":
|
| 3491 |
"commonvoice_locale":"lv",
|
| 3492 |
"in_benchmark":true
|
| 3493 |
},
|
|
@@ -3535,7 +3535,7 @@
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
-
"commonvoice_hours":
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
@@ -3559,7 +3559,7 @@
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
-
"commonvoice_hours":
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
@@ -3679,7 +3679,7 @@
|
|
| 3679 |
"family":"Indo-European",
|
| 3680 |
"flores_path":"ydd_Hebr",
|
| 3681 |
"fleurs_tag":null,
|
| 3682 |
-
"commonvoice_hours":
|
| 3683 |
"commonvoice_locale":"yi",
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
|
@@ -3991,8 +3991,8 @@
|
|
| 3991 |
"family":"Atlantic-Congo",
|
| 3992 |
"flores_path":null,
|
| 3993 |
"fleurs_tag":null,
|
| 3994 |
-
"commonvoice_hours":
|
| 3995 |
-
"commonvoice_locale":
|
| 3996 |
"in_benchmark":false
|
| 3997 |
},
|
| 3998 |
{
|
|
@@ -4099,8 +4099,8 @@
|
|
| 4099 |
"family":"Indo-European",
|
| 4100 |
"flores_path":null,
|
| 4101 |
"fleurs_tag":null,
|
| 4102 |
-
"commonvoice_hours":
|
| 4103 |
-
"commonvoice_locale":
|
| 4104 |
"in_benchmark":false
|
| 4105 |
},
|
| 4106 |
{
|
|
@@ -4351,7 +4351,7 @@
|
|
| 4351 |
"family":"Indo-European",
|
| 4352 |
"flores_path":null,
|
| 4353 |
"fleurs_tag":null,
|
| 4354 |
-
"commonvoice_hours":
|
| 4355 |
"commonvoice_locale":"br",
|
| 4356 |
"in_benchmark":false
|
| 4357 |
},
|
|
@@ -4651,7 +4651,7 @@
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
-
"commonvoice_hours":
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
|
@@ -5011,7 +5011,7 @@
|
|
| 5011 |
"family":"Nakh-Daghestanian",
|
| 5012 |
"flores_path":"dar_Cyrl",
|
| 5013 |
"fleurs_tag":null,
|
| 5014 |
-
"commonvoice_hours":
|
| 5015 |
"commonvoice_locale":"dar",
|
| 5016 |
"in_benchmark":true
|
| 5017 |
},
|
|
@@ -7879,7 +7879,7 @@
|
|
| 7879 |
"family":"Artificial Language",
|
| 7880 |
"flores_path":"epo_Latn",
|
| 7881 |
"fleurs_tag":null,
|
| 7882 |
-
"commonvoice_hours":
|
| 7883 |
"commonvoice_locale":"eo",
|
| 7884 |
"in_benchmark":true
|
| 7885 |
},
|
|
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
+
"commonvoice_hours":2674.0,
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
|
|
| 32 |
"flores_path":"hin_Deva",
|
| 33 |
"fleurs_tag":"hi_in",
|
| 34 |
"commonvoice_hours":16.0,
|
| 35 |
+
"commonvoice_locale":"hi-IN",
|
| 36 |
"in_benchmark":true
|
| 37 |
},
|
| 38 |
{
|
|
|
|
| 43 |
"family":"Indo-European",
|
| 44 |
"flores_path":"spa_Latn",
|
| 45 |
"fleurs_tag":"es_419",
|
| 46 |
+
"commonvoice_hours":448.0,
|
| 47 |
"commonvoice_locale":"es",
|
| 48 |
"in_benchmark":true
|
| 49 |
},
|
|
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
+
"commonvoice_hours":1065.0,
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
|
|
| 103 |
"family":"Indo-European",
|
| 104 |
"flores_path":"por_Latn",
|
| 105 |
"fleurs_tag":"pt_br",
|
| 106 |
+
"commonvoice_hours":180.0,
|
| 107 |
"commonvoice_locale":"pt",
|
| 108 |
"in_benchmark":true
|
| 109 |
},
|
|
|
|
| 115 |
"family":"Indo-European",
|
| 116 |
"flores_path":"pan_Guru",
|
| 117 |
"fleurs_tag":"pa_in",
|
| 118 |
+
"commonvoice_hours":2.3,
|
| 119 |
"commonvoice_locale":"pa-IN",
|
| 120 |
"in_benchmark":true
|
| 121 |
},
|
|
|
|
| 127 |
"family":"Indo-European",
|
| 128 |
"flores_path":"rus_Cyrl",
|
| 129 |
"fleurs_tag":"ru_ru",
|
| 130 |
+
"commonvoice_hours":245.0,
|
| 131 |
"commonvoice_locale":"ru",
|
| 132 |
"in_benchmark":true
|
| 133 |
},
|
|
|
|
| 139 |
"family":"Atlantic-Congo",
|
| 140 |
"flores_path":"swh_Latn",
|
| 141 |
"fleurs_tag":"sw_ke",
|
| 142 |
+
"commonvoice_hours":411.0,
|
| 143 |
"commonvoice_locale":"sw",
|
| 144 |
"in_benchmark":true
|
| 145 |
},
|
|
|
|
| 151 |
"family":"Austronesian",
|
| 152 |
"flores_path":"ind_Latn",
|
| 153 |
"fleurs_tag":"id_id",
|
| 154 |
+
"commonvoice_hours":33.0,
|
| 155 |
"commonvoice_locale":"id",
|
| 156 |
"in_benchmark":true
|
| 157 |
},
|
|
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
+
"commonvoice_hours":1369.0,
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169 |
},
|
|
|
|
| 379 |
"family":"Indo-European",
|
| 380 |
"flores_path":null,
|
| 381 |
"fleurs_tag":"ps_af",
|
| 382 |
+
"commonvoice_hours":81.0,
|
| 383 |
"commonvoice_locale":"ps",
|
| 384 |
"in_benchmark":false
|
| 385 |
},
|
|
|
|
| 439 |
"family":"Indo-European",
|
| 440 |
"flores_path":"pol_Latn",
|
| 441 |
"fleurs_tag":"pl_pl",
|
| 442 |
+
"commonvoice_hours":175.0,
|
| 443 |
"commonvoice_locale":"pl",
|
| 444 |
"in_benchmark":true
|
| 445 |
},
|
|
|
|
| 619 |
"family":"Indo-European",
|
| 620 |
"flores_path":"nld_Latn",
|
| 621 |
"fleurs_tag":"nl_nl",
|
| 622 |
+
"commonvoice_hours":120.0,
|
| 623 |
"commonvoice_locale":"nl",
|
| 624 |
"in_benchmark":true
|
| 625 |
},
|
|
|
|
| 655 |
"family":"Atlantic-Congo",
|
| 656 |
"flores_path":"yor_Latn",
|
| 657 |
"fleurs_tag":"yo_ng",
|
| 658 |
+
"commonvoice_hours":6.3,
|
| 659 |
"commonvoice_locale":"yo",
|
| 660 |
"in_benchmark":true
|
| 661 |
},
|
|
|
|
| 979 |
"family":"Turkic",
|
| 980 |
"flores_path":"kaz_Cyrl",
|
| 981 |
"fleurs_tag":"kk_kz",
|
| 982 |
+
"commonvoice_hours":2.2,
|
| 983 |
"commonvoice_locale":"kk",
|
| 984 |
"in_benchmark":true
|
| 985 |
},
|
|
|
|
| 1027 |
"family":"Uralic",
|
| 1028 |
"flores_path":"hun_Latn",
|
| 1029 |
"fleurs_tag":"hu_hu",
|
| 1030 |
+
"commonvoice_hours":93.0,
|
| 1031 |
"commonvoice_locale":"hu",
|
| 1032 |
"in_benchmark":true
|
| 1033 |
},
|
|
|
|
| 1099 |
"family":"Indo-European",
|
| 1100 |
"flores_path":"ckb_Arab",
|
| 1101 |
"fleurs_tag":"ckb_iq",
|
| 1102 |
+
"commonvoice_hours":135.0,
|
| 1103 |
"commonvoice_locale":"ckb",
|
| 1104 |
"in_benchmark":true
|
| 1105 |
},
|
|
|
|
| 1183 |
"family":"Indo-European",
|
| 1184 |
"flores_path":"bel_Cyrl",
|
| 1185 |
"fleurs_tag":"be_by",
|
| 1186 |
+
"commonvoice_hours":1810.0,
|
| 1187 |
"commonvoice_locale":"be",
|
| 1188 |
"in_benchmark":true
|
| 1189 |
},
|
|
|
|
| 1207 |
"family":"Indo-European",
|
| 1208 |
"flores_path":"tgk_Cyrl",
|
| 1209 |
"fleurs_tag":"tg_tj",
|
| 1210 |
+
"commonvoice_hours":0.4,
|
| 1211 |
"commonvoice_locale":"tg",
|
| 1212 |
"in_benchmark":true
|
| 1213 |
},
|
|
|
|
| 1243 |
"family":"Indo-European",
|
| 1244 |
"flores_path":"afr_Latn",
|
| 1245 |
"fleurs_tag":"af_za",
|
| 1246 |
+
"commonvoice_hours":0.5,
|
| 1247 |
"commonvoice_locale":"af",
|
| 1248 |
"in_benchmark":true
|
| 1249 |
},
|
|
|
|
| 1291 |
"family":"Indo-European",
|
| 1292 |
"flores_path":"cat_Latn",
|
| 1293 |
"fleurs_tag":"ca_es",
|
| 1294 |
+
"commonvoice_hours":2863.0,
|
| 1295 |
"commonvoice_locale":"ca",
|
| 1296 |
"in_benchmark":true
|
| 1297 |
},
|
|
|
|
| 1303 |
"family":"Afro-Asiatic",
|
| 1304 |
"flores_path":"heb_Hebr",
|
| 1305 |
"fleurs_tag":"he_il",
|
| 1306 |
+
"commonvoice_hours":1.4,
|
| 1307 |
"commonvoice_locale":"he",
|
| 1308 |
"in_benchmark":true
|
| 1309 |
},
|
|
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
+
"commonvoice_hours":411.0,
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
|
|
| 1519 |
"family":"Indo-European",
|
| 1520 |
"flores_path":"kmr_Latn",
|
| 1521 |
"fleurs_tag":null,
|
| 1522 |
+
"commonvoice_hours":69.0,
|
| 1523 |
"commonvoice_locale":"kmr",
|
| 1524 |
"in_benchmark":true
|
| 1525 |
},
|
|
|
|
| 1555 |
"family":"Indo-European",
|
| 1556 |
"flores_path":"slk_Latn",
|
| 1557 |
"fleurs_tag":"sk_sk",
|
| 1558 |
+
"commonvoice_hours":51.0,
|
| 1559 |
"commonvoice_locale":"sk",
|
| 1560 |
"in_benchmark":true
|
| 1561 |
},
|
|
|
|
| 1675 |
"family":"Tupian",
|
| 1676 |
"flores_path":"gug_Latn",
|
| 1677 |
"fleurs_tag":null,
|
| 1678 |
+
"commonvoice_hours":4.0,
|
| 1679 |
"commonvoice_locale":"gn",
|
| 1680 |
"in_benchmark":true
|
| 1681 |
},
|
|
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
+
"commonvoice_hours":0.5,
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
|
|
| 2155 |
"family":"Kartvelian",
|
| 2156 |
"flores_path":"kat_Geor",
|
| 2157 |
"fleurs_tag":"ka_ge",
|
| 2158 |
+
"commonvoice_hours":166.0,
|
| 2159 |
"commonvoice_locale":"ka",
|
| 2160 |
"in_benchmark":true
|
| 2161 |
},
|
|
|
|
| 2167 |
"family":"Indo-European",
|
| 2168 |
"flores_path":"glg_Latn",
|
| 2169 |
"fleurs_tag":"gl_es",
|
| 2170 |
+
"commonvoice_hours":117.0,
|
| 2171 |
"commonvoice_locale":"gl",
|
| 2172 |
"in_benchmark":true
|
| 2173 |
},
|
|
|
|
| 2323 |
"family":"Dravidian",
|
| 2324 |
"flores_path":null,
|
| 2325 |
"fleurs_tag":null,
|
| 2326 |
+
"commonvoice_hours":1.2,
|
| 2327 |
"commonvoice_locale":"brh",
|
| 2328 |
"in_benchmark":false
|
| 2329 |
},
|
|
|
|
| 2623 |
"family":"Indo-European",
|
| 2624 |
"flores_path":null,
|
| 2625 |
"fleurs_tag":null,
|
| 2626 |
+
"commonvoice_hours":0.9,
|
| 2627 |
"commonvoice_locale":"haz",
|
| 2628 |
"in_benchmark":false
|
| 2629 |
},
|
|
|
|
| 2695 |
"family":"Indo-European",
|
| 2696 |
"flores_path":"oci_Latn",
|
| 2697 |
"fleurs_tag":"oc_fr",
|
| 2698 |
+
"commonvoice_hours":1.8,
|
| 2699 |
"commonvoice_locale":"oc",
|
| 2700 |
"in_benchmark":true
|
| 2701 |
},
|
|
|
|
| 3175 |
"family":"Atlantic-Congo",
|
| 3176 |
"flores_path":null,
|
| 3177 |
"fleurs_tag":null,
|
| 3178 |
+
"commonvoice_hours":null,
|
| 3179 |
+
"commonvoice_locale":null,
|
| 3180 |
"in_benchmark":false
|
| 3181 |
},
|
| 3182 |
{
|
|
|
|
| 3319 |
"family":"Indo-European",
|
| 3320 |
"flores_path":null,
|
| 3321 |
"fleurs_tag":null,
|
| 3322 |
+
"commonvoice_hours":null,
|
| 3323 |
+
"commonvoice_locale":null,
|
| 3324 |
"in_benchmark":false
|
| 3325 |
},
|
| 3326 |
{
|
|
|
|
| 3331 |
"family":"Indo-European",
|
| 3332 |
"flores_path":"gle_Latn",
|
| 3333 |
"fleurs_tag":"ga_ie",
|
| 3334 |
+
"commonvoice_hours":8.3,
|
| 3335 |
"commonvoice_locale":"ga-IE",
|
| 3336 |
"in_benchmark":true
|
| 3337 |
},
|
|
|
|
| 3487 |
"family":"Indo-European",
|
| 3488 |
"flores_path":"lvs_Latn",
|
| 3489 |
"fleurs_tag":"lv_lv",
|
| 3490 |
+
"commonvoice_hours":262.0,
|
| 3491 |
"commonvoice_locale":"lv",
|
| 3492 |
"in_benchmark":true
|
| 3493 |
},
|
|
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
+
"commonvoice_hours":440.0,
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
+
"commonvoice_hours":83.0,
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
|
|
| 3679 |
"family":"Indo-European",
|
| 3680 |
"flores_path":"ydd_Hebr",
|
| 3681 |
"fleurs_tag":null,
|
| 3682 |
+
"commonvoice_hours":0.7,
|
| 3683 |
"commonvoice_locale":"yi",
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
|
|
|
| 3991 |
"family":"Atlantic-Congo",
|
| 3992 |
"flores_path":null,
|
| 3993 |
"fleurs_tag":null,
|
| 3994 |
+
"commonvoice_hours":null,
|
| 3995 |
+
"commonvoice_locale":null,
|
| 3996 |
"in_benchmark":false
|
| 3997 |
},
|
| 3998 |
{
|
|
|
|
| 4099 |
"family":"Indo-European",
|
| 4100 |
"flores_path":null,
|
| 4101 |
"fleurs_tag":null,
|
| 4102 |
+
"commonvoice_hours":null,
|
| 4103 |
+
"commonvoice_locale":null,
|
| 4104 |
"in_benchmark":false
|
| 4105 |
},
|
| 4106 |
{
|
|
|
|
| 4351 |
"family":"Indo-European",
|
| 4352 |
"flores_path":null,
|
| 4353 |
"fleurs_tag":null,
|
| 4354 |
+
"commonvoice_hours":29.0,
|
| 4355 |
"commonvoice_locale":"br",
|
| 4356 |
"in_benchmark":false
|
| 4357 |
},
|
|
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
+
"commonvoice_hours":30.0,
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
|
|
|
| 5011 |
"family":"Nakh-Daghestanian",
|
| 5012 |
"flores_path":"dar_Cyrl",
|
| 5013 |
"fleurs_tag":null,
|
| 5014 |
+
"commonvoice_hours":0.0,
|
| 5015 |
"commonvoice_locale":"dar",
|
| 5016 |
"in_benchmark":true
|
| 5017 |
},
|
|
|
|
| 7879 |
"family":"Artificial Language",
|
| 7880 |
"flores_path":"epo_Latn",
|
| 7881 |
"fleurs_tag":null,
|
| 7882 |
+
"commonvoice_hours":1436.0,
|
| 7883 |
"commonvoice_locale":"eo",
|
| 7884 |
"in_benchmark":true
|
| 7885 |
},
|
models.json
CHANGED
|
@@ -20,15 +20,15 @@
|
|
| 20 |
]
|
| 21 |
},
|
| 22 |
{
|
| 23 |
-
"id":"anthropic\/claude-3.
|
| 24 |
-
"name":"Claude 3.
|
| 25 |
"provider_name":"Anthropic",
|
| 26 |
"cost":15.0,
|
| 27 |
"hf_id":null,
|
| 28 |
"size":null,
|
| 29 |
"type":"closed-source",
|
| 30 |
"license":null,
|
| 31 |
-
"creation_date":
|
| 32 |
"tasks":[
|
| 33 |
"translation_from",
|
| 34 |
"translation_to",
|
|
@@ -40,15 +40,15 @@
|
|
| 40 |
]
|
| 41 |
},
|
| 42 |
{
|
| 43 |
-
"id":"anthropic\/claude-sonnet
|
| 44 |
-
"name":"Claude Sonnet
|
| 45 |
"provider_name":"Anthropic",
|
| 46 |
"cost":15.0,
|
| 47 |
"hf_id":null,
|
| 48 |
"size":null,
|
| 49 |
"type":"closed-source",
|
| 50 |
"license":null,
|
| 51 |
-
"creation_date":
|
| 52 |
"tasks":[
|
| 53 |
"translation_from",
|
| 54 |
"translation_to",
|
|
@@ -60,15 +60,15 @@
|
|
| 60 |
]
|
| 61 |
},
|
| 62 |
{
|
| 63 |
-
"id":"
|
| 64 |
-
"name":"
|
| 65 |
-
"provider_name":"
|
| 66 |
"cost":15.0,
|
| 67 |
"hf_id":null,
|
| 68 |
"size":null,
|
| 69 |
"type":"closed-source",
|
| 70 |
"license":null,
|
| 71 |
-
"creation_date":
|
| 72 |
"tasks":[
|
| 73 |
"translation_from",
|
| 74 |
"translation_to",
|
|
@@ -83,7 +83,7 @@
|
|
| 83 |
"id":"deepseek\/deepseek-chat",
|
| 84 |
"name":"DeepSeek V3",
|
| 85 |
"provider_name":"DeepSeek",
|
| 86 |
-
"cost":0.
|
| 87 |
"hf_id":"deepseek-ai\/DeepSeek-V3",
|
| 88 |
"size":684531386000.0,
|
| 89 |
"type":"open-source",
|
|
@@ -120,15 +120,35 @@
|
|
| 120 |
]
|
| 121 |
},
|
| 122 |
{
|
| 123 |
-
"id":"deepseek\/deepseek-
|
| 124 |
-
"name":"
|
| 125 |
"provider_name":"DeepSeek",
|
| 126 |
"cost":0.0,
|
| 127 |
-
"hf_id":"deepseek-ai\/DeepSeek-
|
| 128 |
"size":684531386000.0,
|
| 129 |
"type":"open-source",
|
| 130 |
"license":"Mit",
|
| 131 |
-
"creation_date":
|
|
|
|
|
|
|
|
|
|
| 132 |
"tasks":[
|
| 133 |
"translation_from",
|
| 134 |
"translation_to",
|
|
@@ -200,15 +220,145 @@
|
|
| 200 |
]
|
| 201 |
},
|
| 202 |
{
|
| 203 |
-
"id":"google\/
|
| 204 |
-
"name":"
|
| 205 |
"provider_name":"Google",
|
| 206 |
-
"cost":0.
|
| 207 |
-
"hf_id":
|
| 208 |
-
"size":
|
| 209 |
-
"type":"
|
| 210 |
-
"license":
|
| 211 |
-
"creation_date":
|
|
|
|
|
|
|
|
|
|
|
| 212 |
"tasks":[
|
| 213 |
"translation_from",
|
| 214 |
"translation_to",
|
|
@@ -240,15 +390,30 @@
|
|
| 240 |
]
|
| 241 |
},
|
| 242 |
{
|
| 243 |
-
"id":"
|
| 244 |
-
"name":"
|
| 245 |
-
"provider_name":"
|
| 246 |
-
"cost":0
|
| 247 |
-
"hf_id":
|
| 248 |
-
"size":
|
|
|
|
|
|
|
|
|
| 249 |
"type":"open-source",
|
| 250 |
-
"license":"
|
| 251 |
-
"creation_date":
|
| 252 |
"tasks":[
|
| 253 |
"translation_from",
|
| 254 |
"translation_to",
|
|
@@ -260,12 +425,12 @@
|
|
| 260 |
]
|
| 261 |
},
|
| 262 |
{
|
| 263 |
-
"id":"meta-llama\/llama-3-
|
| 264 |
-
"name":"Llama 3
|
| 265 |
"provider_name":"Meta",
|
| 266 |
-
"cost":0.
|
| 267 |
-
"hf_id":"meta-llama\/Meta-Llama-3-
|
| 268 |
-
"size":
|
| 269 |
"type":"open-source",
|
| 270 |
"license":"Llama3",
|
| 271 |
"creation_date":1713312000000,
|
|
@@ -299,6 +464,30 @@
|
|
| 299 |
"mgsm"
|
| 300 |
]
|
| 301 |
},
|
|
|
|
|
|
|
|
| 302 |
{
|
| 303 |
"id":"meta-llama\/llama-3.3-70b-instruct",
|
| 304 |
"name":"Llama 3.3 70B Instruct",
|
|
@@ -339,26 +528,6 @@
|
|
| 339 |
"mgsm"
|
| 340 |
]
|
| 341 |
},
|
| 342 |
-
{
|
| 343 |
-
"id":"meta-llama\/llama-guard-3-8b",
|
| 344 |
-
"name":"Llama Guard 3 8B",
|
| 345 |
-
"provider_name":"Llama Guard 3 8B",
|
| 346 |
-
"cost":0.06,
|
| 347 |
-
"hf_id":"meta-llama\/Llama-Guard-3-8B",
|
| 348 |
-
"size":8030261248.0,
|
| 349 |
-
"type":"open-source",
|
| 350 |
-
"license":"Llama3.1",
|
| 351 |
-
"creation_date":1721606400000,
|
| 352 |
-
"tasks":[
|
| 353 |
-
"translation_from",
|
| 354 |
-
"translation_to",
|
| 355 |
-
"classification",
|
| 356 |
-
"mmlu",
|
| 357 |
-
"arc",
|
| 358 |
-
"truthfulqa",
|
| 359 |
-
"mgsm"
|
| 360 |
-
]
|
| 361 |
-
},
|
| 362 |
{
|
| 363 |
"id":"microsoft\/phi-4",
|
| 364 |
"name":"Phi 4",
|
|
@@ -399,26 +568,6 @@
|
|
| 399 |
"mgsm"
|
| 400 |
]
|
| 401 |
},
|
| 402 |
-
{
|
| 403 |
-
"id":"microsoft\/wizardlm-2-8x22b",
|
| 404 |
-
"name":"WizardLM-2 8x22B",
|
| 405 |
-
"provider_name":"WizardLM-2 8x22B",
|
| 406 |
-
"cost":0.48,
|
| 407 |
-
"hf_id":null,
|
| 408 |
-
"size":null,
|
| 409 |
-
"type":"closed-source",
|
| 410 |
-
"license":null,
|
| 411 |
-
"creation_date":1713225600000,
|
| 412 |
-
"tasks":[
|
| 413 |
-
"translation_from",
|
| 414 |
-
"translation_to",
|
| 415 |
-
"classification",
|
| 416 |
-
"mmlu",
|
| 417 |
-
"arc",
|
| 418 |
-
"truthfulqa",
|
| 419 |
-
"mgsm"
|
| 420 |
-
]
|
| 421 |
-
},
|
| 422 |
{
|
| 423 |
"id":"mistralai\/mistral-nemo",
|
| 424 |
"name":"Mistral Nemo",
|
|
@@ -459,26 +608,6 @@
|
|
| 459 |
"mgsm"
|
| 460 |
]
|
| 461 |
},
|
| 462 |
-
{
|
| 463 |
-
"id":"mistralai\/mistral-small-24b-instruct-2501",
|
| 464 |
-
"name":"Mistral Small 3",
|
| 465 |
-
"provider_name":"Mistral",
|
| 466 |
-
"cost":0.0,
|
| 467 |
-
"hf_id":"mistralai\/Mistral-Small-24B-Instruct-2501",
|
| 468 |
-
"size":23572403200.0,
|
| 469 |
-
"type":"open-source",
|
| 470 |
-
"license":"Apache 2.0",
|
| 471 |
-
"creation_date":1738022400000,
|
| 472 |
-
"tasks":[
|
| 473 |
-
"translation_from",
|
| 474 |
-
"translation_to",
|
| 475 |
-
"classification",
|
| 476 |
-
"mmlu",
|
| 477 |
-
"arc",
|
| 478 |
-
"truthfulqa",
|
| 479 |
-
"mgsm"
|
| 480 |
-
]
|
| 481 |
-
},
|
| 482 |
{
|
| 483 |
"id":"mistralai\/mistral-small-3.1-24b-instruct",
|
| 484 |
"name":"Mistral Small 3.1 24B",
|
|
@@ -499,106 +628,6 @@
|
|
| 499 |
"mgsm"
|
| 500 |
]
|
| 501 |
},
|
| 502 |
-
{
|
| 503 |
-
"id":"moonshotai\/kimi-k2",
|
| 504 |
-
"name":"Kimi K2",
|
| 505 |
-
"provider_name":"MoonshotAI",
|
| 506 |
-
"cost":0.0,
|
| 507 |
-
"hf_id":"moonshotai\/Kimi-K2-Instruct",
|
| 508 |
-
"size":null,
|
| 509 |
-
"type":"open-source",
|
| 510 |
-
"license":"Other",
|
| 511 |
-
"creation_date":1752192000000,
|
| 512 |
-
"tasks":[
|
| 513 |
-
"translation_from",
|
| 514 |
-
"translation_to",
|
| 515 |
-
"classification",
|
| 516 |
-
"mmlu",
|
| 517 |
-
"arc",
|
| 518 |
-
"truthfulqa",
|
| 519 |
-
"mgsm"
|
| 520 |
-
]
|
| 521 |
-
},
|
| 522 |
-
{
|
| 523 |
-
"id":"nousresearch\/deephermes-3-llama-3-8b-preview",
|
| 524 |
-
"name":"DeepHermes 3 Llama 3 8B Preview",
|
| 525 |
-
"provider_name":"Nous",
|
| 526 |
-
"cost":0.0,
|
| 527 |
-
"hf_id":"NousResearch\/DeepHermes-3-Llama-3-8B-Preview",
|
| 528 |
-
"size":8030261248.0,
|
| 529 |
-
"type":"open-source",
|
| 530 |
-
"license":"Llama3",
|
| 531 |
-
"creation_date":1739318400000,
|
| 532 |
-
"tasks":[
|
| 533 |
-
"translation_from",
|
| 534 |
-
"translation_to",
|
| 535 |
-
"classification",
|
| 536 |
-
"mmlu",
|
| 537 |
-
"arc",
|
| 538 |
-
"truthfulqa",
|
| 539 |
-
"mgsm"
|
| 540 |
-
]
|
| 541 |
-
},
|
| 542 |
-
{
|
| 543 |
-
"id":"nousresearch\/hermes-2-pro-llama-3-8b",
|
| 544 |
-
"name":"Hermes 2 Pro - Llama-3 8B",
|
| 545 |
-
"provider_name":"NousResearch",
|
| 546 |
-
"cost":0.04,
|
| 547 |
-
"hf_id":"NousResearch\/Hermes-2-Pro-Llama-3-8B",
|
| 548 |
-
"size":8030523392.0,
|
| 549 |
-
"type":"open-source",
|
| 550 |
-
"license":"Llama3",
|
| 551 |
-
"creation_date":1714435200000,
|
| 552 |
-
"tasks":[
|
| 553 |
-
"translation_from",
|
| 554 |
-
"translation_to",
|
| 555 |
-
"classification",
|
| 556 |
-
"mmlu",
|
| 557 |
-
"arc",
|
| 558 |
-
"truthfulqa",
|
| 559 |
-
"mgsm"
|
| 560 |
-
]
|
| 561 |
-
},
|
| 562 |
-
{
|
| 563 |
-
"id":"nousresearch\/hermes-3-llama-3.1-405b",
|
| 564 |
-
"name":"Hermes 3 405B Instruct",
|
| 565 |
-
"provider_name":"Nous",
|
| 566 |
-
"cost":0.8,
|
| 567 |
-
"hf_id":"NousResearch\/Hermes-3-Llama-3.1-405B",
|
| 568 |
-
"size":405853388800.0,
|
| 569 |
-
"type":"open-source",
|
| 570 |
-
"license":"Llama3",
|
| 571 |
-
"creation_date":1723507200000,
|
| 572 |
-
"tasks":[
|
| 573 |
-
"translation_from",
|
| 574 |
-
"translation_to",
|
| 575 |
-
"classification",
|
| 576 |
-
"mmlu",
|
| 577 |
-
"arc",
|
| 578 |
-
"truthfulqa",
|
| 579 |
-
"mgsm"
|
| 580 |
-
]
|
| 581 |
-
},
|
| 582 |
-
{
|
| 583 |
-
"id":"nousresearch\/hermes-3-llama-3.1-70b",
|
| 584 |
-
"name":"Hermes 3 70B Instruct",
|
| 585 |
-
"provider_name":"Nous",
|
| 586 |
-
"cost":0.28,
|
| 587 |
-
"hf_id":"NousResearch\/Hermes-3-Llama-3.1-70B",
|
| 588 |
-
"size":70553706496.0,
|
| 589 |
-
"type":"open-source",
|
| 590 |
-
"license":"Llama3",
|
| 591 |
-
"creation_date":1722211200000,
|
| 592 |
-
"tasks":[
|
| 593 |
-
"translation_from",
|
| 594 |
-
"translation_to",
|
| 595 |
-
"classification",
|
| 596 |
-
"mmlu",
|
| 597 |
-
"arc",
|
| 598 |
-
"truthfulqa",
|
| 599 |
-
"mgsm"
|
| 600 |
-
]
|
| 601 |
-
},
|
| 602 |
{
|
| 603 |
"id":"openai\/gpt-3.5-turbo-0613",
|
| 604 |
"name":"GPT-3.5 Turbo (older v0613)",
|
|
@@ -679,26 +708,6 @@
|
|
| 679 |
"mgsm"
|
| 680 |
]
|
| 681 |
},
|
| 682 |
-
{
|
| 683 |
-
"id":"openai\/gpt-4o-2024-11-20",
|
| 684 |
-
"name":"GPT-4o (2024-11-20)",
|
| 685 |
-
"provider_name":"OpenAI",
|
| 686 |
-
"cost":10.0,
|
| 687 |
-
"hf_id":null,
|
| 688 |
-
"size":null,
|
| 689 |
-
"type":"closed-source",
|
| 690 |
-
"license":null,
|
| 691 |
-
"creation_date":1732060800000,
|
| 692 |
-
"tasks":[
|
| 693 |
-
"translation_from",
|
| 694 |
-
"translation_to",
|
| 695 |
-
"classification",
|
| 696 |
-
"mmlu",
|
| 697 |
-
"arc",
|
| 698 |
-
"truthfulqa",
|
| 699 |
-
"mgsm"
|
| 700 |
-
]
|
| 701 |
-
},
|
| 702 |
{
|
| 703 |
"id":"openai\/gpt-4o-mini",
|
| 704 |
"name":"GPT-4o-mini",
|
|
@@ -719,86 +728,6 @@
|
|
| 719 |
"mgsm"
|
| 720 |
]
|
| 721 |
},
|
| 722 |
-
{
|
| 723 |
-
"id":"openai\/gpt-5",
|
| 724 |
-
"name":"GPT-5",
|
| 725 |
-
"provider_name":"OpenAI",
|
| 726 |
-
"cost":10.0,
|
| 727 |
-
"hf_id":null,
|
| 728 |
-
"size":null,
|
| 729 |
-
"type":"closed-source",
|
| 730 |
-
"license":null,
|
| 731 |
-
"creation_date":1754524800000,
|
| 732 |
-
"tasks":[
|
| 733 |
-
"translation_from",
|
| 734 |
-
"translation_to",
|
| 735 |
-
"classification",
|
| 736 |
-
"mmlu",
|
| 737 |
-
"arc",
|
| 738 |
-
"truthfulqa",
|
| 739 |
-
"mgsm"
|
| 740 |
-
]
|
| 741 |
-
},
|
| 742 |
-
{
|
| 743 |
-
"id":"openai\/gpt-5-nano",
|
| 744 |
-
"name":"GPT-5 Nano",
|
| 745 |
-
"provider_name":"OpenAI",
|
| 746 |
-
"cost":0.4,
|
| 747 |
-
"hf_id":null,
|
| 748 |
-
"size":null,
|
| 749 |
-
"type":"closed-source",
|
| 750 |
-
"license":null,
|
| 751 |
-
"creation_date":1754524800000,
|
| 752 |
-
"tasks":[
|
| 753 |
-
"translation_from",
|
| 754 |
-
"translation_to",
|
| 755 |
-
"classification",
|
| 756 |
-
"mmlu",
|
| 757 |
-
"arc",
|
| 758 |
-
"truthfulqa",
|
| 759 |
-
"mgsm"
|
| 760 |
-
]
|
| 761 |
-
},
|
| 762 |
-
{
|
| 763 |
-
"id":"openai\/gpt-oss-120b",
|
| 764 |
-
"name":"gpt-oss-120b",
|
| 765 |
-
"provider_name":"OpenAI",
|
| 766 |
-
"cost":0.0,
|
| 767 |
-
"hf_id":"openai\/gpt-oss-120b",
|
| 768 |
-
"size":120412337472.0,
|
| 769 |
-
"type":"open-source",
|
| 770 |
-
"license":"Apache 2.0",
|
| 771 |
-
"creation_date":1754265600000,
|
| 772 |
-
"tasks":[
|
| 773 |
-
"translation_from",
|
| 774 |
-
"translation_to",
|
| 775 |
-
"classification",
|
| 776 |
-
"mmlu",
|
| 777 |
-
"arc",
|
| 778 |
-
"truthfulqa",
|
| 779 |
-
"mgsm"
|
| 780 |
-
]
|
| 781 |
-
},
|
| 782 |
-
{
|
| 783 |
-
"id":"openai\/gpt-oss-20b",
|
| 784 |
-
"name":"gpt-oss-20b",
|
| 785 |
-
"provider_name":"OpenAI",
|
| 786 |
-
"cost":0.0,
|
| 787 |
-
"hf_id":"openai\/gpt-oss-20b",
|
| 788 |
-
"size":21511953984.0,
|
| 789 |
-
"type":"open-source",
|
| 790 |
-
"license":"Apache 2.0",
|
| 791 |
-
"creation_date":1754265600000,
|
| 792 |
-
"tasks":[
|
| 793 |
-
"translation_from",
|
| 794 |
-
"translation_to",
|
| 795 |
-
"classification",
|
| 796 |
-
"mmlu",
|
| 797 |
-
"arc",
|
| 798 |
-
"truthfulqa",
|
| 799 |
-
"mgsm"
|
| 800 |
-
]
|
| 801 |
-
},
|
| 802 |
{
|
| 803 |
"id":"qwen\/qwen3-235b-a22b",
|
| 804 |
"name":"Qwen3 235B A22B",
|
|
@@ -843,7 +772,7 @@
|
|
| 843 |
"id":"qwen\/qwen3-32b",
|
| 844 |
"name":"Qwen3 32B",
|
| 845 |
"provider_name":"Qwen",
|
| 846 |
-
"cost":0.
|
| 847 |
"hf_id":"Qwen\/Qwen3-32B",
|
| 848 |
"size":32762123264.0,
|
| 849 |
"type":"open-source",
|
|
@@ -858,140 +787,5 @@
|
|
| 858 |
"truthfulqa",
|
| 859 |
"mgsm"
|
| 860 |
]
|
| 861 |
-
},
|
| 862 |
-
{
|
| 863 |
-
"id":"sao10k\/l3-lunaris-8b",
|
| 864 |
-
"name":"Llama 3 8B Lunaris",
|
| 865 |
-
"provider_name":"Sao10K",
|
| 866 |
-
"cost":0.05,
|
| 867 |
-
"hf_id":"Sao10K\/L3-8B-Lunaris-v1",
|
| 868 |
-
"size":8030261248.0,
|
| 869 |
-
"type":"open-source",
|
| 870 |
-
"license":"Llama3",
|
| 871 |
-
"creation_date":1719360000000,
|
| 872 |
-
"tasks":[
|
| 873 |
-
"translation_from",
|
| 874 |
-
"translation_to",
|
| 875 |
-
"classification",
|
| 876 |
-
"mmlu",
|
| 877 |
-
"arc",
|
| 878 |
-
"truthfulqa",
|
| 879 |
-
"mgsm"
|
| 880 |
-
]
|
| 881 |
-
},
|
| 882 |
-
{
|
| 883 |
-
"id":"scb10x\/llama3.1-typhoon2-70b-instruct",
|
| 884 |
-
"name":"Typhoon2 70B Instruct",
|
| 885 |
-
"provider_name":"Typhoon2 70B Instruct",
|
| 886 |
-
"cost":0.88,
|
| 887 |
-
"hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
|
| 888 |
-
"size":70553706496.0,
|
| 889 |
-
"type":"open-source",
|
| 890 |
-
"license":"Llama3.1",
|
| 891 |
-
"creation_date":1734220800000,
|
| 892 |
-
"tasks":[
|
| 893 |
-
"translation_from",
|
| 894 |
-
"translation_to",
|
| 895 |
-
"classification",
|
| 896 |
-
"mmlu",
|
| 897 |
-
"arc",
|
| 898 |
-
"truthfulqa",
|
| 899 |
-
"mgsm"
|
| 900 |
-
]
|
| 901 |
-
},
|
| 902 |
-
{
|
| 903 |
-
"id":"shisa-ai\/shisa-v2-llama3.3-70b",
|
| 904 |
-
"name":"Shisa V2 Llama 3.3 70B ",
|
| 905 |
-
"provider_name":"Shisa AI",
|
| 906 |
-
"cost":0.0,
|
| 907 |
-
"hf_id":"shisa-ai\/shisa-v2-llama3.3-70b",
|
| 908 |
-
"size":70553706496.0,
|
| 909 |
-
"type":"open-source",
|
| 910 |
-
"license":"Llama3.3",
|
| 911 |
-
"creation_date":1744502400000,
|
| 912 |
-
"tasks":[
|
| 913 |
-
"translation_from",
|
| 914 |
-
"translation_to",
|
| 915 |
-
"classification",
|
| 916 |
-
"mmlu",
|
| 917 |
-
"arc",
|
| 918 |
-
"truthfulqa",
|
| 919 |
-
"mgsm"
|
| 920 |
-
]
|
| 921 |
-
},
|
| 922 |
-
{
|
| 923 |
-
"id":"x-ai\/grok-2-vision-1212",
|
| 924 |
-
"name":"Grok 2 Vision 1212",
|
| 925 |
-
"provider_name":"xAI",
|
| 926 |
-
"cost":10.0,
|
| 927 | - "hf_id":null,
| 928 | - "size":null,
| 929 | - "type":"closed-source",
| 930 | - "license":null,
| 931 | - "creation_date":1734220800000,
| 932 | - "tasks":[
| 933 | - "translation_from",
| 934 | - "translation_to",
| 935 | - "classification",
| 936 | - "mmlu",
| 937 | - "arc",
| 938 | - "truthfulqa",
| 939 | - "mgsm"
| 940 | - ]
| 941 | - },
| 942 | - {
| 943 | - "id":"x-ai\/grok-4",
| 944 | - "name":"Grok 4",
| 945 | - "provider_name":"xAI",
| 946 | - "cost":15.0,
| 947 | - "hf_id":null,
| 948 | - "size":null,
| 949 | - "type":"closed-source",
| 950 | - "license":null,
| 951 | - "creation_date":1752019200000,
| 952 | - "tasks":[
| 953 | - "translation_from",
| 954 | - "translation_to",
| 955 | - "classification",
| 956 | - "mmlu",
| 957 | - "arc",
| 958 | - "truthfulqa",
| 959 | - "mgsm"
| 960 | - ]
| 961 | - },
| 962 | - {
| 963 | - "id":"z-ai\/glm-4.5",
| 964 | - "name":"GLM 4.5",
| 965 | - "provider_name":"Z.AI",
| 966 | - "cost":1.32,
| 967 | - "hf_id":"zai-org\/GLM-4.5",
| 968 | - "size":358337791296.0,
| 969 | - "type":"open-source",
| 970 | - "license":"Mit",
| 971 | - "creation_date":1752969600000,
| 972 | - "tasks":[
| 973 | - "translation_from",
| 974 | - "translation_to",
| 975 | - "classification",
| 976 | - "mmlu",
| 977 | - "arc",
| 978 | - "truthfulqa",
| 979 | - "mgsm"
| 980 | - ]
| 981 | - },
| 982 | - {
| 983 | - "id":"google\/translate-v2",
| 984 | - "name":"Google Translate",
| 985 | - "provider_name":"Google",
| 986 | - "cost":20.0,
| 987 | - "hf_id":null,
| 988 | - "size":null,
| 989 | - "type":"closed-source",
| 990 | - "license":null,
| 991 | - "creation_date":null,
| 992 | - "tasks":[
| 993 | - "translation_from",
| 994 | - "translation_to"
| 995 | - ]
| 996 |   }
| 997 |   ]

| 20 |   ]
| 21 |   },
| 22 |   {
| 23 | + "id":"anthropic\/claude-3.5-sonnet",
| 24 | + "name":"Claude 3.5 Sonnet",
| 25 |   "provider_name":"Anthropic",
| 26 |   "cost":15.0,
| 27 |   "hf_id":null,
| 28 |   "size":null,
| 29 |   "type":"closed-source",
| 30 |   "license":null,
| 31 | + "creation_date":1729555200000,
| 32 |   "tasks":[
| 33 |   "translation_from",
| 34 |   "translation_to",

| 40 |   ]
| 41 |   },
| 42 |   {
| 43 | + "id":"anthropic\/claude-3.7-sonnet",
| 44 | + "name":"Claude 3.7 Sonnet",
| 45 |   "provider_name":"Anthropic",
| 46 |   "cost":15.0,
| 47 |   "hf_id":null,
| 48 |   "size":null,
| 49 |   "type":"closed-source",
| 50 |   "license":null,
| 51 | + "creation_date":1740355200000,
| 52 |   "tasks":[
| 53 |   "translation_from",
| 54 |   "translation_to",

| 60 |   ]
| 61 |   },
| 62 |   {
| 63 | + "id":"anthropic\/claude-sonnet-4",
| 64 | + "name":"Claude Sonnet 4",
| 65 | + "provider_name":"Anthropic",
| 66 |   "cost":15.0,
| 67 |   "hf_id":null,
| 68 |   "size":null,
| 69 |   "type":"closed-source",
| 70 |   "license":null,
| 71 | + "creation_date":1747872000000,
| 72 |   "tasks":[
| 73 |   "translation_from",
| 74 |   "translation_to",

| 83 |   "id":"deepseek\/deepseek-chat",
| 84 |   "name":"DeepSeek V3",
| 85 |   "provider_name":"DeepSeek",
| 86 | + "cost":0.0,
| 87 |   "hf_id":"deepseek-ai\/DeepSeek-V3",
| 88 |   "size":684531386000.0,
| 89 |   "type":"open-source",

| 120 |   ]
| 121 |   },
| 122 |   {
| 123 | + "id":"deepseek\/deepseek-r1",
| 124 | + "name":"R1",
| 125 |   "provider_name":"DeepSeek",
| 126 |   "cost":0.0,
| 127 | + "hf_id":"deepseek-ai\/DeepSeek-R1",
| 128 |   "size":684531386000.0,
| 129 |   "type":"open-source",
| 130 |   "license":"Mit",
| 131 | + "creation_date":1737331200000,
| 132 | + "tasks":[
| 133 | + "translation_from",
| 134 | + "translation_to",
| 135 | + "classification",
| 136 | + "mmlu",
| 137 | + "arc",
| 138 | + "truthfulqa",
| 139 | + "mgsm"
| 140 | + ]
| 141 | + },
| 142 | + {
| 143 | + "id":"deepseek\/deepseek-r1-0528",
| 144 | + "name":"R1 0528",
| 145 | + "provider_name":"DeepSeek",
| 146 | + "cost":0.0,
| 147 | + "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
| 148 | + "size":684531386000.0,
| 149 | + "type":"open-source",
| 150 | + "license":"Mit",
| 151 | + "creation_date":1748390400000.0,
| 152 |   "tasks":[
| 153 |   "translation_from",
| 154 |   "translation_to",

| 220 |   ]
| 221 |   },
| 222 |   {
| 223 | + "id":"google\/gemini-2.5-flash-lite-preview-06-17",
| 224 | + "name":"Gemini 2.5 Flash Lite Preview 06-17",
| 225 |   "provider_name":"Google",
| 226 | + "cost":0.4,
| 227 | + "hf_id":null,
| 228 | + "size":null,
| 229 | + "type":"closed-source",
| 230 | + "license":null,
| 231 | + "creation_date":1750118400000.0,
| 232 | + "tasks":[
| 233 | + "translation_from",
| 234 | + "translation_to",
| 235 | + "classification",
| 236 | + "mmlu",
| 237 | + "mgsm"
| 238 | + ]
| 239 | + },
| 240 | + {
| 241 | + "id":"google\/gemini-2.5-flash-preview",
| 242 | + "name":"Gemini 2.5 Flash Preview 04-17",
| 243 | + "provider_name":"Google",
| 244 | + "cost":0.6,
| 245 | + "hf_id":null,
| 246 | + "size":null,
| 247 | + "type":"closed-source",
| 248 | + "license":null,
| 249 | + "creation_date":1744848000000.0,
| 250 | + "tasks":[
| 251 | + "translation_from",
| 252 | + "translation_to",
| 253 | + "classification",
| 254 | + "mmlu",
| 255 | + "mgsm"
| 256 | + ]
| 257 | + },
| 258 | + {
| 259 | + "id":"google\/gemini-2.5-flash-preview-05-20",
| 260 | + "name":"Gemini 2.5 Flash Preview 05-20",
| 261 | + "provider_name":"Google",
| 262 | + "cost":0.6,
| 263 | + "hf_id":null,
| 264 | + "size":null,
| 265 | + "type":"closed-source",
| 266 | + "license":null,
| 267 | + "creation_date":1747699200000.0,
| 268 | + "tasks":[
| 269 | + "translation_from",
| 270 | + "translation_to",
| 271 | + "classification",
| 272 | + "mmlu",
| 273 | + "mgsm"
| 274 | + ]
| 275 | + },
| 276 | + {
| 277 | + "id":"google\/gemini-2.5-pro",
| 278 | + "name":"Gemini 2.5 Pro",
| 279 | + "provider_name":"Google",
| 280 | + "cost":10.0,
| 281 | + "hf_id":null,
| 282 | + "size":null,
| 283 | + "type":"closed-source",
| 284 | + "license":null,
| 285 | + "creation_date":1750118400000,
| 286 | + "tasks":[
| 287 | + "translation_from",
| 288 | + "translation_to",
| 289 | + "classification",
| 290 | + "mmlu",
| 291 | + "arc",
| 292 | + "truthfulqa",
| 293 | + "mgsm"
| 294 | + ]
| 295 | + },
| 296 | + {
| 297 | + "id":"google\/gemini-2.5-pro-preview",
| 298 | + "name":"Gemini 2.5 Pro Preview 06-05",
| 299 | + "provider_name":"Google",
| 300 | + "cost":10.0,
| 301 | + "hf_id":null,
| 302 | + "size":null,
| 303 | + "type":"closed-source",
| 304 | + "license":null,
| 305 | + "creation_date":1749081600000.0,
| 306 | + "tasks":[
| 307 | + "translation_from",
| 308 | + "translation_to",
| 309 | + "classification",
| 310 | + "mmlu",
| 311 | + "mgsm"
| 312 | + ]
| 313 | + },
| 314 | + {
| 315 | + "id":"google\/gemini-2.5-pro-preview-05-06",
| 316 | + "name":"Gemini 2.5 Pro Preview 05-06",
| 317 | + "provider_name":"Google",
| 318 | + "cost":10.0,
| 319 | + "hf_id":null,
| 320 | + "size":null,
| 321 | + "type":"closed-source",
| 322 | + "license":null,
| 323 | + "creation_date":1746576000000.0,
| 324 | + "tasks":[
| 325 | + "translation_from",
| 326 | + "translation_to",
| 327 | + "classification",
| 328 | + "mmlu",
| 329 | + "mgsm"
| 330 | + ]
| 331 | + },
| 332 | + {
| 333 | + "id":"google\/gemini-flash-1.5",
| 334 | + "name":"Gemini 1.5 Flash ",
| 335 | + "provider_name":"Google",
| 336 | + "cost":0.3,
| 337 | + "hf_id":null,
| 338 | + "size":null,
| 339 | + "type":"closed-source",
| 340 | + "license":null,
| 341 | + "creation_date":1715644800000,
| 342 | + "tasks":[
| 343 | + "translation_from",
| 344 | + "translation_to",
| 345 | + "classification",
| 346 | + "mmlu",
| 347 | + "arc",
| 348 | + "truthfulqa",
| 349 | + "mgsm"
| 350 | + ]
| 351 | + },
| 352 | + {
| 353 | + "id":"google\/gemini-flash-1.5-8b",
| 354 | + "name":"Gemini 1.5 Flash 8B",
| 355 | + "provider_name":"Google",
| 356 | + "cost":0.15,
| 357 | + "hf_id":null,
| 358 | + "size":null,
| 359 | + "type":"closed-source",
| 360 | + "license":null,
| 361 | + "creation_date":1727913600000,
| 362 |   "tasks":[
| 363 |   "translation_from",
| 364 |   "translation_to",

| 390 |   ]
| 391 |   },
| 392 |   {
| 393 | + "id":"google\/translate-v2",
| 394 | + "name":"Google Translate",
| 395 | + "provider_name":"Google",
| 396 | + "cost":20.0,
| 397 | + "hf_id":null,
| 398 | + "size":null,
| 399 | + "type":"closed-source",
| 400 | + "license":null,
| 401 | + "creation_date":null,
| 402 | + "tasks":[
| 403 | + "translation_from",
| 404 | + "translation_to"
| 405 | + ]
| 406 | + },
| 407 | + {
| 408 | + "id":"gryphe\/mythomax-l2-13b",
| 409 | + "name":"MythoMax 13B",
| 410 | + "provider_name":"MythoMax 13B",
| 411 | + "cost":0.07,
| 412 | + "hf_id":"Gryphe\/MythoMax-L2-13b",
| 413 | + "size":null,
| 414 |   "type":"open-source",
| 415 | + "license":"Other",
| 416 | + "creation_date":1691625600000,
| 417 |   "tasks":[
| 418 |   "translation_from",
| 419 |   "translation_to",

| 425 |   ]
| 426 |   },
| 427 |   {
| 428 | + "id":"meta-llama\/llama-3-70b-instruct",
| 429 | + "name":"Llama 3 70B Instruct",
| 430 |   "provider_name":"Meta",
| 431 | + "cost":0.4,
| 432 | + "hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
| 433 | + "size":70553706496.0,
| 434 |   "type":"open-source",
| 435 |   "license":"Llama3",
| 436 |   "creation_date":1713312000000,

| 464 |   "mgsm"
| 465 |   ]
| 466 |   },
| 467 | + {
| 468 | + "id":"meta-llama\/llama-3.1-8b-instruct",
| 469 | + "name":"Llama 3.1 8B Instruct",
| 470 | + "provider_name":"Meta",
| 471 | + "cost":0.0,
| 472 | + "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
| 473 | + "size":8030261248.0,
| 474 | + "type":"open-source",
| 475 | + "license":"Llama3.1",
| 476 | + "creation_date":1721260800000.0,
| 477 | + "tasks":null
| 478 | + },
| 479 | + {
| 480 | + "id":"meta-llama\/llama-3.2-1b-instruct",
| 481 | + "name":"Llama 3.2 1B Instruct",
| 482 | + "provider_name":"Meta",
| 483 | + "cost":0.0,
| 484 | + "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
| 485 | + "size":1235814400.0,
| 486 | + "type":"open-source",
| 487 | + "license":"Llama3.2",
| 488 | + "creation_date":1726617600000.0,
| 489 | + "tasks":null
| 490 | + },
| 491 |   {
| 492 |   "id":"meta-llama\/llama-3.3-70b-instruct",
| 493 |   "name":"Llama 3.3 70B Instruct",

| 528 |   "mgsm"
| 529 |   ]
| 530 |   },

| 531 |   {
| 532 |   "id":"microsoft\/phi-4",
| 533 |   "name":"Phi 4",

| 568 |   "mgsm"
| 569 |   ]
| 570 |   },

| 571 |   {
| 572 |   "id":"mistralai\/mistral-nemo",
| 573 |   "name":"Mistral Nemo",

| 608 |   "mgsm"
| 609 |   ]
| 610 |   },

| 611 |   {
| 612 |   "id":"mistralai\/mistral-small-3.1-24b-instruct",
| 613 |   "name":"Mistral Small 3.1 24B",

| 628 |   "mgsm"
| 629 |   ]
| 630 |   },

| 631 |   {
| 632 |   "id":"openai\/gpt-3.5-turbo-0613",
| 633 |   "name":"GPT-3.5 Turbo (older v0613)",

| 708 |   "mgsm"
| 709 |   ]
| 710 |   },

| 711 |   {
| 712 |   "id":"openai\/gpt-4o-mini",
| 713 |   "name":"GPT-4o-mini",

| 728 |   "mgsm"
| 729 |   ]
| 730 |   },

| 731 |   {
| 732 |   "id":"qwen\/qwen3-235b-a22b",
| 733 |   "name":"Qwen3 235B A22B",

| 772 |   "id":"qwen\/qwen3-32b",
| 773 |   "name":"Qwen3 32B",
| 774 |   "provider_name":"Qwen",
| 775 | + "cost":0.0,
| 776 |   "hf_id":"Qwen\/Qwen3-32B",
| 777 |   "size":32762123264.0,
| 778 |   "type":"open-source",

| 787 |   "truthfulqa",
| 788 |   "mgsm"
| 789 |   ]

| 790 |   }
| 791 |   ]
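Every entry in the rewritten models.json carries the same fields shown in the hunks above (`id`, `name`, `provider_name`, `cost`, `hf_id`, `size`, `type`, `license`, `creation_date`, `tasks`, where `tasks` may be null). A minimal sketch of how the file can be consumed, assuming it is read from the repository root; the helper name `models_supporting` is illustrative and not part of this commit:

```python
import json

def models_supporting(task: str, path: str = "models.json") -> list[dict]:
    """Return model entries whose "tasks" list includes the given task."""
    with open(path, encoding="utf-8") as f:
        models = json.load(f)
    # "tasks" can be null (None) for entries that have not been evaluated yet.
    return [m for m in models if m.get("tasks") and task in m["tasks"]]

if __name__ == "__main__":
    # e.g. list the open-source models that report MGSM results
    print([m["id"] for m in models_supporting("mgsm") if m["type"] == "open-source"])
```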
pyproject.toml
CHANGED
@@ -36,9 +36,6 @@ dev = [
| 36 |   "tqdm>=4.67.1",
| 37 |   "transformers>=4.51.3",
| 38 |   ]
| 39 | - cloud = [
| 40 | - "google-cloud-storage>=3.2.0",
| 41 | - ]
| 42 |
| 43 |   [dependency-groups]
| 44 |   dev = [
@@ -47,10 +44,3 @@ dev = [
| 47 |   "scipy>=1.16.0",
| 48 |   "seaborn>=0.13.2",
| 49 |   ]
| 50 | -
| 51 | - [build-system]
| 52 | - requires = ["hatchling"]
| 53 | - build-backend = "hatchling.build"
| 54 | -
| 55 | - [tool.hatch.build.targets.wheel]
| 56 | - packages = ["evals"]
results.json
CHANGED
@@ -1,3 +1,3 @@
| 1 |   version https://git-lfs.github.com/spec/v1
| 2 | - oid sha256:
| 2 | + oid sha256:8dbe020a1941a0e49c05f81aeee40ba37d3e2f9f3d83303fcfe1b5711676d1d8
| 3 | - size
| 3 | + size 2978273
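The before/after stanzas above are Git LFS pointer files: the tracked results.json stores only a `version` line, the `oid` (SHA-256 of the actual payload), and its `size` in bytes (2978273 after this commit). A small sketch for reading those fields, assuming the pointer is checked out at the repository root; `parse_lfs_pointer` is an illustrative helper, not part of the repo:

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict[str, str]:
    """Split each "key value" line of a Git LFS pointer file into a dict."""
    fields = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        key, _, value = line.partition(" ")
        if key:
            fields[key] = value
    return fields

if __name__ == "__main__":
    pointer = parse_lfs_pointer("results.json")
    print(pointer["oid"], pointer["size"])  # e.g. sha256:8dbe02... 2978273
```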
uv.lock
CHANGED
The diff for this file is too large to render.
See raw diff