Upload from GitHub Actions: Merge pull request #9 from datenlabor-bmz/jn-dev
Files changed:

- .gitattributes +1 -0
- .gitignore +3 -0
- Dockerfile.eval +71 -0
- README.md +135 -0
- cloudbuild.yaml +5 -0
- deploy_eval.sh +29 -0
- evals/README.md +82 -0
- evals/backend.py +53 -36
- evals/datasets_/__init__.py +1 -0
- evals/datasets_/arc.py +33 -17
- evals/datasets_/mgsm.py +14 -9
- evals/datasets_/mmlu.py +49 -9
- evals/datasets_/truthfulqa.py +43 -6
- evals/datasets_/util.py +7 -0
- evals/main.py +137 -35
- evals/main_gcs.py +213 -0
- evals/models.py +68 -13
- evals/tasks.py +95 -79
- frontend/src/App.js +5 -1
- frontend/src/components/ModelTable.js +17 -7
- frontend/src/components/ScoreColumns.js +17 -10
- frontend/src/components/ScoreField.js +2 -1
- frontend/src/components/WorldMap.js +16 -2
- languages.json +17 -17
- models.json +1085 -85
- pyproject.toml +10 -0
- results.json +0 -0
- system_architecture_diagram.md +90 -56
- uv.lock +0 -0
.gitattributes
CHANGED

```diff
@@ -1 +1,2 @@
 evals/data_flow_architecture.png filter=lfs diff=lfs merge=lfs -text
+results.json filter=lfs diff=lfs merge=lfs -text
```
.gitignore
CHANGED

```diff
@@ -20,3 +20,6 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
+
+# Project-specific files
+.dockerignore.eval
```
Dockerfile.eval
ADDED
@@ -0,0 +1,71 @@

```dockerfile
FROM python:3.12-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY pyproject.toml uv.lock ./
RUN pip install uv && uv sync --frozen

# Copy application code
COPY . .

# Verify dependencies are installed
RUN .venv/bin/python -c "import pandas, datasets, evaluate, fastapi, uvicorn, google.cloud.storage, google.cloud.translate, dotenv, elevenlabs, huggingface_hub, joblib, language_data, openai, requests, scipy, aiolimiter, sentencepiece, langcodes, rich, tqdm; print('✅ All dependencies verified')"

# Set environment variables with conservative limits
ENV N_SENTENCES=20
ENV MAX_LANGUAGES=150
ENV COST_LIMIT_USD=20

# Create a startup script with cost monitoring and HTTP server
RUN echo '#!/bin/bash\n\
\n\
# Force immediate log flushing for Cloud Run visibility\n\
export PYTHONUNBUFFERED=1\n\
export PYTHONIOENCODING=utf-8\n\
\n\
echo "🚀 Starting AI Language Evaluation..."\n\
echo "📊 Configuration: $N_SENTENCES sentences, $MAX_LANGUAGES languages"\n\
echo "💰 Cost limit: $COST_LIMIT_USD USD"\n\
echo "🛡️ Cost protection enabled"\n\
echo "🔧 Logging: Unbuffered Python output enabled"\n\
\n\
# Start a simple HTTP server to satisfy Cloud Run requirements\n\
python -m http.server 8080 &\n\
HTTP_SERVER_PID=$!\n\
\n\
# Start cost monitoring in background\n\
(\n\
start_time=$(date +%s)\n\
while true; do\n\
current_time=$(date +%s)\n\
elapsed_hours=$(( (current_time - start_time) / 3600 ))\n\
if [ $elapsed_hours -ge 24 ]; then\n\
echo "⚠️ MAX RUNTIME REACHED! Stopping evaluation..."\n\
pkill -f "python evals/main_gcs.py"\n\
break\n\
fi\n\
sleep 300  # Check every 5 minutes\n\
done\n\
) &\n\
\n\
# Run the evaluation with forced log flushing\n\
cd /app && .venv/bin/python -u evals/main_gcs.py\n\
\n\
# Stop the HTTP server\n\
kill $HTTP_SERVER_PID\n\
\n\
echo "✅ Evaluation completed!"\n\
' > /app/start.sh && chmod +x /app/start.sh

# Expose port (for Cloud Run requirements)
EXPOSE 8080

# Run the evaluation with resource limits
CMD ["/app/start.sh"]
```
README.md
CHANGED

````diff
@@ -43,12 +43,147 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 _Tracking language proficiency of AI models for every language_
 
+## System Architecture
+
+The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
+
+```mermaid
+flowchart TD
+    %% Model Sources
+    A1["important_models<br/>Static Curated List"] --> D[load_models]
+    A2["get_historical_popular_models<br/>Web Scraping - Top 20"] --> D
+    A3["get_current_popular_models<br/>Web Scraping - Top 10"] --> D
+    A4["blocklist<br/>Exclusions"] --> D
+
+    %% Model Processing
+    D --> |"Combine & Dedupe"| E["Dynamic Model List<br/>~40-50 models"]
+    E --> |get_or_metadata| F["OpenRouter API<br/>Model Metadata"]
+    F --> |get_hf_metadata| G["HuggingFace API<br/>Model Details"]
+    G --> H["Enriched Model DataFrame"]
+    H --> |Save| I[models.json]
+
+    %% Model Validation & Cost Filtering
+    H --> |"Validate Models<br/>Check API Availability"| H1["Valid Models Only<br/>Cost ≤ $20/1M tokens"]
+    H1 --> |"Timeout Protection<br/>120s for Large Models"| H2["Robust Model List"]
+
+    %% Language Data
+    J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]
+
+    %% Task Registry with Unified Prompting
+    L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions<br/>Unified English Zero-Shot"]
+    M --> M1["translation_from/to<br/>BLEU + ChrF"]
+    M --> M2["classification<br/>Accuracy"]
+    M --> M3["mmlu<br/>Accuracy"]
+    M --> M4["arc<br/>Accuracy"]
+    M --> M5["truthfulqa<br/>Accuracy"]
+    M --> M6["mgsm<br/>Accuracy"]
+
+    %% On-the-fly Translation with Origin Tagging
+    subgraph OTF [On-the-fly Dataset Translation]
+        direction LR
+        DS_raw["Raw English Dataset<br/>(e.g., MMLU)"] --> Google_Translate["Google Translate API"]
+        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., German MMLU)<br/>Origin: 'machine'"]
+        DS_native["Native Dataset<br/>(e.g., German MMLU)<br/>Origin: 'human'"]
+    end
+
+    %% Evaluation Pipeline
+    H2 --> |"models ID"| N["main.py / main_gcs.py<br/>evaluate"]
+    K --> |"languages bcp_47"| N
+    L --> |"tasks.items"| N
+    N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model × Language × Task"]
+    O --> |"10 samples each"| P["Evaluation Execution<br/>Batch Processing"]
+
+    %% Task Execution with Origin Tracking
+    P --> Q1[translate_and_evaluate<br/>Origin: 'human']
+    P --> Q2[classify_and_evaluate<br/>Origin: 'human']
+    P --> Q3[mmlu_and_evaluate<br/>Origin: 'human'/'machine']
+    P --> Q4[arc_and_evaluate<br/>Origin: 'human'/'machine']
+    P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human'/'machine']
+    P --> Q6[mgsm_and_evaluate<br/>Origin: 'human'/'machine']
+
+    %% API Calls with Error Handling
+    Q1 --> |"complete() API<br/>Rate Limiting"| R["OpenRouter<br/>Model Inference"]
+    Q2 --> |"complete() API<br/>Rate Limiting"| R
+    Q3 --> |"complete() API<br/>Rate Limiting"| R
+    Q4 --> |"complete() API<br/>Rate Limiting"| R
+    Q5 --> |"complete() API<br/>Rate Limiting"| R
+    Q6 --> |"complete() API<br/>Rate Limiting"| R
+
+    %% Results Processing with Origin Aggregation
+    R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task+origin"]
+    S --> |Save| T[results.json]
+
+    %% Backend & Frontend with Origin-Specific Metrics
+    T --> |Read| U[backend.py]
+    I --> |Read| U
+    U --> |make_model_table| V["Model Rankings<br/>Origin-Specific Metrics"]
+    U --> |make_country_table| W["Country Aggregation"]
+    U --> |"API Endpoint"| X["FastAPI /api/data<br/>arc_accuracy_human<br/>arc_accuracy_machine"]
+    X --> |"JSON Response"| Y["Frontend React App"]
+
+    %% UI Components
+    Y --> Z1["WorldMap.js<br/>Country Visualization"]
+    Y --> Z2["ModelTable.js<br/>Model Rankings"]
+    Y --> Z3["LanguageTable.js<br/>Language Coverage"]
+    Y --> Z4["DatasetTable.js<br/>Task Performance"]
+
+    %% Data Sources with Origin Information
+    subgraph DS ["Data Sources"]
+        DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
+        DS2["MMLU/AfriMMLU<br/>Knowledge QA<br/>Origin: 'human'"]
+        DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
+        DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
+        DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
+    end
+
+    DS1 --> Q1
+    DS2 --> Q3
+    DS3 --> Q4
+    DS4 --> Q5
+    DS5 --> Q6
+
+    DS_translated --> Q3
+    DS_translated --> Q4
+    DS_translated --> Q5
+
+    DS_native --> Q3
+    DS_native --> Q4
+    DS_native --> Q5
+
+    %% Styling - Neutral colors that work in both dark and light modes
+    classDef modelSource fill:#f8f9fa,stroke:#6c757d,color:#212529
+    classDef evaluation fill:#e9ecef,stroke:#495057,color:#212529
+    classDef api fill:#dee2e6,stroke:#6c757d,color:#212529
+    classDef storage fill:#d1ecf1,stroke:#0c5460,color:#0c5460
+    classDef frontend fill:#f8d7da,stroke:#721c24,color:#721c24
+    classDef translation fill:#d4edda,stroke:#155724,color:#155724
+
+    class A1,A2,A3,A4 modelSource
+    class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
+    class R,F,G,X api
+    class T,I storage
+    class Y,Z1,Z2,Z3,Z4 frontend
+    class Google_Translate,DS_translated,DS_native translation
+```
+
+**Key Features:**
+- **Model Discovery**: Combines curated models with real-time trending models via web scraping
+- **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
+- **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
+- **Real-time Visualization**: Interactive web interface with country-level insights
+
 ## Evaluate
 
+### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
 
+### Google Cloud Deployment
+```bash
+uv run --extra dev evals/main_gcs.py
+```
+
 ## Explore
 
 ```bash
````
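As a quick illustration of the "Combine & Dedupe" step in the diagram above, here is a minimal, self-contained sketch. The model IDs, list names, and the `load_models` signature are illustrative stand-ins, not the repository's actual API:

```python
# Hypothetical stand-ins for the curated list, scraped trending list, and blocklist.
important_models = ["openai/gpt-4o", "meta-llama/llama-3.1-70b-instruct"]
trending_models = ["meta-llama/llama-3.1-70b-instruct", "mistralai/mistral-large"]
blocklist = {"mistralai/mistral-large"}

def load_models(curated, trending, blocked):
    # keep insertion order while removing duplicates, then drop blocked IDs
    seen = dict.fromkeys(curated + trending)
    return [m for m in seen if m not in blocked]

print(load_models(important_models, trending_models, blocklist))
# ['openai/gpt-4o', 'meta-llama/llama-3.1-70b-instruct']
```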
cloudbuild.yaml
ADDED
@@ -0,0 +1,5 @@

```yaml
steps:
  - name: 'gcr.io/cloud-builders/docker'
    args: ['build', '-f', 'Dockerfile.eval', '-t', 'gcr.io/$PROJECT_ID/ai-language-eval', '.']
images:
  - 'gcr.io/$PROJECT_ID/ai-language-eval'
```
deploy_eval.sh
ADDED
@@ -0,0 +1,29 @@

```bash
#!/bin/bash

echo "Deploying AI Language Evaluation to Google Cloud Run"
echo "Cost limit: $20 USD"
echo "No runtime limit - will run to completion"

# Build the Docker image first
echo "🔨 Building Docker image..."
gcloud builds submit --config cloudbuild.yaml .

# Deploy the built image
echo "🚀 Deploying to Cloud Run..."
gcloud run deploy ai-language-eval \
    --image gcr.io/ai-language-eval-1754052060/ai-language-eval \
    --region us-central1 \
    --platform managed \
    --memory 2Gi \
    --cpu 1 \
    --max-instances 1 \
    --timeout 3600 \
    --concurrency 1 \
    --no-allow-unauthenticated \
    --set-env-vars="N_SENTENCES=20,MAX_LANGUAGES=150,COST_LIMIT_USD=20,PYTHONUNBUFFERED=1,PYTHONIOENCODING=utf-8" \
    --quiet

echo "✅ Deployment completed!"
echo "🌐 Service URL: $(gcloud run services describe ai-language-eval --region=us-central1 --format='value(status.url)')"
echo "📊 Monitor costs: https://console.cloud.google.com/billing/linkedaccount?project=ai-language-eval-1754052060"
echo "💾 Results will be saved to: gs://ai-language-eval-results/"
```
evals/README.md
ADDED
@@ -0,0 +1,82 @@

````markdown
# Evaluation Framework Documentation

This document outlines the current methodology used for evaluating multilingual language models in this project. The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.

## Core Philosophy: English Zero-Shot Prompting

The core of our evaluation methodology is a **unified English zero-shot prompting strategy**. This means:

1. **Instructions are in English**: All models receive their instructions in clear, standardized English. This removes the quality of prompt translation as a variable, ensuring a fair comparison.
2. **Content is in the Target Language**: The actual content to be evaluated (e.g., a question for a QA task, a sentence for translation) is always presented in the target language. This directly tests the model's ability to understand instructions in one language and apply them to content in another.
3. **Zero-Shot (with a Twist)**: We do not provide in-context examples from the test datasets. However, for Question Answering tasks, we provide a static, English-based "scratchpad" example. This doesn't teach the model the answer, but rather the *format* for its reasoning and final output, which is crucial for reliable response parsing.

---

## Task-Specific Prompting Strategies

Below is a breakdown of the prompt structure for each of the active evaluation tasks.

### 1. Translation (`translation`)

- **Objective**: To evaluate the model's ability to translate text both to and from a target language.
- **Prompt Structure**: A direct, zero-shot English instruction.

```
Translate the following text to the {target_language_name} language; use the {script} script; reply only with the translation:

{original_sentence}
```

### 2. Classification (`classification`)

- **Objective**: To evaluate the model's ability to classify a paragraph of text into one of five topics.
- **Prompt Structure**: A direct, zero-shot English instruction providing the available topics.

```
Classify the following text into one of these topics: {topic1}, {topic2}, {topic3}, {topic4}, {topic5}.
Reply with only the topic name.

Text:
{paragraph_in_target_language}
```

### 3. Question Answering (`mmlu`, `arc`, `truthfulqa`)

- **Objective**: To evaluate the model's knowledge and reasoning abilities on multiple-choice questions.
- **Prompt Structure**: A zero-shot English instruction combined with a "reasoning scratchpad" format.

```
Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.

Response format: <reasoning> #### <letter>

---

{question_and_choices_in_target_language}
```

### 4. Math Word Problems (`mgsm`)

- **Objective**: To evaluate the model's ability to solve mathematical reasoning problems.
- **Prompt Structure**: Similar to the QA tasks, this uses a zero-shot English instruction with a reasoning scratchpad, but asks for a number as the final answer.

```
Solve the following math problem. Reason step-by-step and then write the final answer as a number.

Response format: <reasoning> #### <number>

---

{math_problem_in_target_language}
```

---

## Advantages and Disadvantages of this Methodology

### Advantages

- **Fairness and Control**: By using standardized English prompts, we eliminate the quality of prompt translation as a confounding variable, leading to a fairer comparison between models.
- **Robustness**: This approach directly tests a model's cross-lingual instruction-following capabilities, which is a key measure of its multilingual prowess.
- **Simplicity and Maintainability**: The zero-shot approach significantly simplifies the codebase, making it easier to maintain and extend.

### Disadvantages

- **Brittleness of Response Parsing**: The evaluation of QA and Math tasks is highly dependent on the model's ability to perfectly adhere to the `#### <answer>` format. Models that produce correct reasoning but fail to follow the format will be unfairly penalized.
- **Potential for Cross-Lingual Confusion**: Less capable models may struggle with instructions in one language and content in another, which could impact their performance.
````
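Since scoring hinges on the `#### <answer>` convention described above, a permissive parser reduces the brittleness. This is a minimal sketch of such a parser, not the project's actual parsing code; it assumes the answer is whatever follows the last `####` marker:

```python
import re

def parse_final_answer(response: str) -> str | None:
    """Extract the final answer from '<reasoning> #### <answer>' responses.

    Splits on the last '####' so that stray '####' inside the reasoning
    does not break parsing; returns None if the marker is missing.
    """
    if "####" not in response:
        return None
    tail = response.rsplit("####", 1)[1].strip()
    # keep only the first token, e.g. 'B' from 'B.' or '42' from '42 apples'
    match = re.match(r"[A-Za-z0-9.,-]+", tail)
    return match.group(0).rstrip(".,") if match else None

assert parse_final_answer("Step 1 ... Step 2 ... #### B.") == "B"
assert parse_final_answer("The total is 6*7=42. #### 42") == "42"
assert parse_final_answer("no marker here") is None
```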
evals/backend.py
CHANGED

```diff
@@ -26,7 +26,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-
+    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
 
@@ -46,65 +46,73 @@ def compute_normalized_average(df, metrics):
 
 
 def make_model_table(df, models):
-
-
-
-
-    )
+    # Create a combined task_metric for origin
+    df["task_metric_origin"] = df["task"] + "_" + df["metric"] + "_" + df["origin"]
+
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = df.pivot_table(index="model", columns="task_metric_origin", values="score", aggfunc="mean")
+
+    # Create the regular task_metric for the main average calculation
     df["task_metric"] = df["task"] + "_" + df["metric"]
-
-
+    main_pivot = df.pivot_table(index="model", columns="task_metric", values="score", aggfunc="mean")
+
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
+
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+
     df["average"] = compute_normalized_average(df, task_metrics)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
+
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
+
     df = df[
         [
-            "rank",
-            "
-
-            "provider_name",
-            "hf_id",
-            "creation_date",
-            "size",
-            "type",
-            "license",
-            "cost",
-            "average",
-            *task_metrics,
+            "rank", "model", "name", "provider_name", "hf_id", "creation_date",
+            "size", "type", "license", "cost", "average",
+            *sorted(list(set(metric_cols)))
         ]
     ]
     return df
 
 
 def make_language_table(df, languages):
-
-
-
-
-    )
+    # Create a combined task_metric for origin
+    df["task_metric_origin"] = df["task"] + "_" + df["metric"] + "_" + df["origin"]
+
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = df.pivot_table(index="bcp_47", columns="task_metric_origin", values="score", aggfunc="mean")
+
+    # Create the regular task_metric for the main average calculation
     df["task_metric"] = df["task"] + "_" + df["metric"]
-
-
+    main_pivot = df.pivot_table(index="bcp_47", columns="task_metric", values="score", aggfunc="mean")
+
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
+
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+
     df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
+
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
+
     df = df[
         [
-            "bcp_47",
-            "
-
-            "speakers",
-            "family",
-            "average",
-            "in_benchmark",
-            *task_metrics,
+            "bcp_47", "language_name", "autonym", "speakers", "family",
+            "average", "in_benchmark",
+            *sorted(list(set(metric_cols)))
         ]
     ]
     return df
 
@@ -125,10 +133,18 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
+    df = scores.groupby(["model", "bcp_47", "task", "metric", "origin"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
+
+    # Identify which metrics have machine translations available
+    machine_translated_metrics = set()
+    for _, row in df.iterrows():
+        if row["origin"] == "machine":
+            metric_name = f"{row['task']}_{row['metric']}"
+            machine_translated_metrics.add(metric_name)
+
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -143,6 +159,7 @@ async def data(request: Request):
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
+        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
```
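The effect of the origin-aware pivoting can be seen on a toy frame. This standalone sketch mirrors the `make_model_table` logic above with made-up scores; it is illustrative, not the backend itself:

```python
import pandas as pd

scores = pd.DataFrame({
    "model":  ["m1", "m1", "m1"],
    "task":   ["arc", "arc", "mmlu"],
    "metric": ["accuracy", "accuracy", "accuracy"],
    "origin": ["human", "machine", "human"],
    "score":  [0.8, 0.6, 0.7],
})

# Origin-specific columns, e.g. arc_accuracy_human / arc_accuracy_machine
scores["task_metric_origin"] = scores["task"] + "_" + scores["metric"] + "_" + scores["origin"]
by_origin = scores.pivot_table(index="model", columns="task_metric_origin",
                               values="score", aggfunc="mean")

# Origin-agnostic columns used for the overall average
scores["task_metric"] = scores["task"] + "_" + scores["metric"]
overall = scores.pivot_table(index="model", columns="task_metric",
                             values="score", aggfunc="mean")

# merging on the shared index level name works since pandas supports index-level joins
print(pd.merge(overall, by_origin, on="model", how="outer"))
# arc_accuracy = 0.7 (mean over both origins), alongside
# arc_accuracy_human = 0.8, arc_accuracy_machine = 0.6, mmlu_accuracy_human = 0.7
```

This is why the frontend can show a combined score while still flagging which metrics are backed by machine-translated data.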
evals/datasets_/__init__.py
ADDED
@@ -0,0 +1 @@

```python
# This file makes datasets_ a Python package
```
evals/datasets_/arc.py
CHANGED

```diff
@@ -3,9 +3,9 @@ from collections import Counter, defaultdict
 
 from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google,
+from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
-from datasets import
+from datasets import load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -14,27 +14,33 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 
 
 random.seed(42)
-id_sets_train = [
+id_sets_train = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
+id_sets_test = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
 
-
-
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -45,27 +51,37 @@ def load_uhura_arc_easy(language_bcp_47, nr):
         ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
         ds = ds.map(add_choices)
         ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "masakhane/uhura-arc-easy",
+        return "masakhane/uhura-arc-easy", task, "human"
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
+        ds = _load_dataset(
+            slug_uhura_arc_easy_translated,
+            tags_uhura_arc_easy_translated[language_bcp_47],
+        )
         ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
-        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated",
+        return "fair-forward/arc-easy-autotranslated", task, "machine"
     else:
+        # ARC does not support on-the-fly translation currently
        return None, None, None
 
+
+def load_uhura_arc_challenge(language_bcp_47, nr):
+    ds_name = "jlahd/uhura_arc_challenge"
+    if language_bcp_47 in _get_dataset_config_names(ds_name):
+        ds = _load_dataset(ds_name, language_bcp_47)
+        task = ds["test"][nr]
+        return ds_name, task
+    else:
+        return None, None, None
+
+
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     n_samples = 10
     train_ids = common_ids_train[:n_samples+3]
```
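The intersection-and-seeded-shuffle idiom above guarantees that every language config serves the same items in the same order. A toy version of the same pattern, independent of the datasets involved (the IDs here are hypothetical):

```python
import random

# Hypothetical per-language item IDs; only IDs present in every config are usable
ids_by_config = {
    "eng": {"q1", "q2", "q3", "q4"},
    "swa": {"q2", "q3", "q4", "q5"},
    "yor": {"q1", "q2", "q3", "q4"},
}

# sorted() before shuffle makes the order deterministic regardless of set iteration order
common_ids = list(sorted(set.intersection(*ids_by_config.values())))
random.seed(42)          # same seed everywhere -> same shuffled order everywhere
random.shuffle(common_ids)

# Every process running this picks the same nr-th item for each language,
# so item nr is comparable across languages and across machines.
nr = 0
print(common_ids[nr])
```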
evals/datasets_/mgsm.py
CHANGED

```diff
@@ -1,10 +1,12 @@
 import asyncio
 import os
+import random
 
 from datasets import Dataset, load_dataset
 from datasets_.util import _get_dataset_config_names, _load_dataset
-from langcodes import standardize_tag
-from models import
+from langcodes import Language, standardize_tag
+from models import get_google_supported_languages, translate_google
+from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
 
@@ -38,19 +40,22 @@ def parse_number(i):
 
 
 def load_mgsm(language_bcp_47, nr):
+    print(f"Loading MGSM data for {language_bcp_47}...")
     if language_bcp_47 in tags_mgsm.keys():
         ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
-        return slug_mgsm, ds[nr]
+        return slug_mgsm, ds[nr], "human"
     elif language_bcp_47 in tags_afrimgsm.keys():
         ds = _load_dataset(
             slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
         )
-        return slug_afrimgsm, ds[nr]
+        return slug_afrimgsm, ds[nr], "human"
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
         ds = _load_dataset(
-            slug_gsm_autotranslated,
+            slug_gsm_autotranslated,
+            subset=tags_gsm_autotranslated[language_bcp_47],
+            split="test",
         )
-        return slug_gsm_autotranslated, ds[nr]
+        return slug_gsm_autotranslated, ds[nr], "machine"
     elif language_bcp_47 in tags_gsm8kx.keys():
         row = _load_dataset(
             slug_gsm8kx,
@@ -59,9 +64,9 @@ def load_mgsm(language_bcp_47, nr):
             trust_remote_code=True,
         )[nr]
         row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row
+        return slug_gsm8kx, row, "human"  # Assuming Eurolingua is human-translated
     else:
-        return None, None
+        return None, None, None
 
 
 def translate_mgsm(languages):
@@ -69,7 +74,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
```
evals/datasets_/mmlu.py
CHANGED

```diff
@@ -6,7 +6,7 @@ from collections import Counter, defaultdict
 from datasets import Dataset, load_dataset
 from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import
+from models import get_google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -150,26 +150,66 @@ categories = sorted(
 )
 
 
-def load_mmlu(language_bcp_47, nr):
+async def load_mmlu(language_bcp_47, nr):
+    print(f"Loading MMLU data for {language_bcp_47}...")
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
         ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
         ds = ds.map(parse_choices)
-        examples = ds["dev"].filter(lambda x: x["subject"] == category)
         task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "masakhane/afrimmlu",
+        return "masakhane/afrimmlu", task, "human"
     elif language_bcp_47 in tags_global_mmlu.keys():
         ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
         ds = ds.map(add_choices)
-        examples = ds["dev"].filter(lambda x: x["subject"] == category)
         task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "CohereForAI/Global-MMLU",
+        return "CohereForAI/Global-MMLU", task, "human"
     elif language_bcp_47 in tags_mmlu_autotranslated:
         ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
-        examples = ds["dev"].filter(lambda x: x["subject"] == category)
         task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "fair-forward/mmlu-autotranslated",
+        return "fair-forward/mmlu-autotranslated", task, "machine"
     else:
+        # Try on-the-fly translation for missing languages
+        return await load_mmlu_translated(language_bcp_47, nr)
+
+
+async def load_mmlu_translated(language_bcp_47, nr):
+    """
+    Load MMLU data with on-the-fly Google translation for languages
+    without native MMLU translations.
+    """
+    # Check if Google Translate supports this language
+    supported_languages = get_google_supported_languages()
+    if language_bcp_47 not in supported_languages:
+        return None, None, None
+
+    print(f"🌐 Translating MMLU data to {language_bcp_47} on-the-fly...")
+
+    try:
+        # Load English MMLU data
+        category = categories[nr % len(categories)]
+        ds = _load_dataset("masakhane/afrimmlu", "eng")
+        ds = ds.map(parse_choices)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+
+        # Translate question and choices
+        question_translated = await translate_google(task["question"], "en", language_bcp_47)
+        choices_translated = []
+        for choice in task["choices"]:
+            choice_translated = await translate_google(choice, "en", language_bcp_47)
+            choices_translated.append(choice_translated)
+
+        # Create translated task
+        translated_task = {
+            "question": question_translated,
+            "choices": choices_translated,
+            "answer": task["answer"],  # Keep original answer index
+            "subject": task["subject"]
+        }
+
+        return f"mmlu-translated-{language_bcp_47}", translated_task, "machine"
+
+    except Exception as e:
+        print(f"❌ Translation failed for {language_bcp_47}: {e}")
         return None, None, None
 
 
@@ -178,7 +218,7 @@ def translate_mmlu(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     n_samples = 10
```
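After this change every loader returns a `(dataset_slug, task, origin)` triple and `load_mmlu` is a coroutine. A minimal consumer sketch follows; the import path and function names other than `load_mmlu` are hypothetical stand-ins for illustration:

```python
import asyncio

from datasets_.mmlu import load_mmlu  # hypothetical import path into the repo

async def show_mmlu_item(language_bcp_47: str, nr: int):
    # Loaders return (dataset_slug, task, origin); origin is 'human' or
    # 'machine' and is carried through to scoring for per-origin aggregation.
    slug, task, origin = await load_mmlu(language_bcp_47, nr)
    if task is None:
        print(f"{language_bcp_47}: unsupported (no native set, no Google Translate)")
        return
    print(f"{slug} [{origin}]: {task['question'][:60]}...")

# Example (requires the repo's environment): asyncio.run(show_mmlu_item("de", 0))
```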
evals/datasets_/truthfulqa.py
CHANGED

```diff
@@ -9,7 +9,7 @@ from tqdm.asyncio import tqdm_asyncio
 import os
 
 from datasets import Dataset, load_dataset
-from models import translate_google,
+from models import translate_google, get_google_supported_languages
 
 from datasets_.util import _get_dataset_config_names, _load_dataset
 
@@ -26,14 +26,51 @@ def add_choices(row):
     return row
 
 
-def load_truthfulqa(language_bcp_47, nr):
+async def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
+        ds = _load_dataset(
+            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
+        )
         ds = ds.map(add_choices)
-        examples = ds["train"]
         task = ds["test"][nr]
-        return "masakhane/uhura-truthfulqa",
+        return "masakhane/uhura-truthfulqa", task, "human"
     else:
+        # Fallback to on-the-fly translation
+        return await load_truthfulqa_translated(language_bcp_47, nr)
+
+async def load_truthfulqa_translated(language_bcp_47, nr):
+    """
+    Load TruthfulQA data with on-the-fly Google translation.
+    """
+    supported_languages = get_google_supported_languages()
+    if language_bcp_47 not in supported_languages:
+        return None, None, None
+
+    print(f"🌐 Translating TruthfulQA data to {language_bcp_47} on-the-fly...")
+
+    try:
+        # Load English TruthfulQA data
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
+        ds = ds.map(add_choices)
+        task = ds["test"][nr]
+
+        # Translate question and choices
+        question_translated = await translate_google(task["question"], "en", language_bcp_47)
+        choices_translated = []
+        for choice in task["choices"]:
+            choice_translated = await translate_google(choice, "en", language_bcp_47)
+            choices_translated.append(choice_translated)
+
+        translated_task = {
+            "question": question_translated,
+            "choices": choices_translated,
+            "labels": task["labels"],  # Keep original labels
+        }
+
+        return f"truthfulqa-translated-{language_bcp_47}", translated_task, "machine"
+
+    except Exception as e:
+        print(f"❌ Translation failed for {language_bcp_47}: {e}")
         return None, None, None
 
 
@@ -43,7 +80,7 @@ def translate_truthfulqa(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     n_samples = 10
```
evals/datasets_/util.py
CHANGED

```diff
@@ -12,3 +12,10 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
+
+# Cache individual dataset items to avoid reloading entire datasets
+@cache
+def _get_dataset_item(dataset, subset, split, index, **kwargs):
+    """Load a single item from a dataset efficiently"""
+    ds = load_dataset(dataset, subset, split=split, **kwargs)
+    return ds[index] if index < len(ds) else None
```
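Because `functools.cache` keys on the exact argument tuple, repeated calls to `_get_dataset_item` with the same `(dataset, subset, split, index)` are served from memory instead of re-reading the dataset. A self-contained sketch of the same pattern, with an in-memory dict standing in for the expensive `load_dataset` call:

```python
from functools import cache

calls = 0

@cache
def get_item(dataset: str, subset: str, split: str, index: int):
    global calls
    calls += 1                      # stands in for an expensive load_dataset() call
    data = {"test": ["a", "b", "c"]}
    ds = data[split]
    return ds[index] if index < len(ds) else None

get_item("demo", "en", "test", 0)
get_item("demo", "en", "test", 0)   # second call is served from the cache
print(calls)                        # 1
```

Note the trade-off: the cache is per argument tuple, so each distinct index still triggers a fresh load the first time; the win is on repeated access to the same item.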
evals/main.py
CHANGED
|
@@ -1,62 +1,164 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
|
| 3 |
import pandas as pd
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
| 5 |
from models import models
|
| 6 |
from tasks import tasks
|
| 7 |
-
from
|
| 8 |
-
|
| 9 |
-
# ===== config =====
|
| 10 |
-
|
| 11 |
-
n_sentences = 10
|
| 12 |
-
|
| 13 |
-
# ===== run evaluation and aggregate results =====
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
async def evaluate():
|
| 17 |
# FIXME we should not need this for-loop, but it helps
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
print(f"running evaluations for {n_languages} languages")
|
| 20 |
old_results = pd.read_json("results.json")
|
|
|
|
|
|
|
| 21 |
old_models = pd.read_json("models.json")
|
| 22 |
# get all combinations of model, language and task
|
| 23 |
combis = [
|
| 24 |
(model, lang.bcp_47, task_name)
|
| 25 |
-
for model in
|
| 26 |
-
for lang in
|
| 27 |
for task_name, task in tasks.items()
|
| 28 |
-
if task_name in
|
| 29 |
]
|
| 30 |
# filter out combinations that have already been evaluated
|
| 31 |
combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
|
| 32 |
combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
|
| 33 |
combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
|
| 34 |
-
# run evaluations
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
for
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
all_models = pd.concat([pd.DataFrame(models), old_models])
|
| 57 |
all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
|
| 58 |
all_models.to_json("models.json", **args)
|
| 59 |
pd.DataFrame(languages).to_json("languages.json", **args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
+
import time
|
| 4 |
+
import os
|
| 5 |
+
from datetime import datetime, timedelta
|
| 6 |
+
from tqdm.asyncio import tqdm_asyncio
|
| 7 |
from models import models
|
| 8 |
from tasks import tasks
|
| 9 |
+
from languages import languages
|
| 10 |
+
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
results = pd.DataFrame()
|
| 13 |
|
| 14 |
async def evaluate():
|
| 15 |
# FIXME we should not need this for-loop, but it helps
|
| 16 |
+
n_sentences = int(os.environ.get("N_SENTENCES", 15)) # Default 1 for quick testing
|
| 17 |
+
|
| 18 |
+
# Load models and languages
|
| 19 |
+
models_df = pd.DataFrame(models)
|
| 20 |
+
languages_df = pd.DataFrame(languages)
|
| 21 |
+
|
| 22 |
+
print(f"π Running full evaluation with {len(models_df)} models.")
|
| 23 |
+
start_time = time.time()
|
| 24 |
+
print(f"π Starting full evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 25 |
+
print(f"π Evaluating {n_sentences} sentences per task")
|
| 26 |
+
|
| 27 |
+
# Evaluate top languages by speakers (configurable via MAX_LANGUAGES env var)
|
| 28 |
+
max_languages = int(os.environ.get("MAX_LANGUAGES", 2)) # Default 2 for quick testing
|
| 29 |
+
top_languages = languages.head(max_languages) # Top N by population
|
| 30 |
+
print(f"π Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
|
| 31 |
+
|
| 32 |
+
# For testing, just use all available languages up to max_languages
|
| 33 |
+
for n_languages in [min(max_languages, len(top_languages))]:
|
| 34 |
print(f"running evaluations for {n_languages} languages")
|
| 35 |
old_results = pd.read_json("results.json")
|
| 36 |
+
if old_results.empty:
|
| 37 |
+
old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
|
| 38 |
old_models = pd.read_json("models.json")
|
| 39 |
# get all combinations of model, language and task
|
| 40 |
combis = [
|
| 41 |
(model, lang.bcp_47, task_name)
|
| 42 |
+
for model in models_df["id"]
|
| 43 |
+
for lang in top_languages.iloc[:n_languages].itertuples()
|
| 44 |
for task_name, task in tasks.items()
|
| 45 |
+
if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
|
| 46 |
]
|
| 47 |
# filter out combinations that have already been evaluated
|
| 48 |
combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
|
| 49 |
combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
|
| 50 |
combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
|
+        # run evaluations in batches to prevent HTTP pool exhaustion
+        all_tasks = []
+        for i in range(n_sentences):
+            for model, bcp_47, task_name in combis.itertuples(index=False):
+                # All tasks now use the same signature
+                all_tasks.append((tasks[task_name], model, bcp_47, i))
+
+        print(f"⏳ Processing {len(all_tasks)} evaluation tasks in batches...")
+
+        batch_size = 50  # Process 50 tasks at a time
+        all_results = []
+
+        for i in range(0, len(all_tasks), batch_size):
+            batch = all_tasks[i:i+batch_size]
+            print(f"📦 Processing batch {i//batch_size + 1}/{(len(all_tasks) + batch_size - 1)//batch_size} ({len(batch)} tasks)")
+
+            # Show what's being evaluated in this batch
+            batch_summary = {}
+            for task_data in batch:
+                task_func, model, bcp_47, sentence_nr = task_data
+                # Extract task name from function - handle both partial functions and regular functions
+                if hasattr(task_func, 'func'):
+                    task_name = task_func.func.__name__.replace('_and_evaluate', '')
+                else:
+                    task_name = task_func.__name__.replace('_and_evaluate', '')
+
+                if task_name not in batch_summary:
+                    batch_summary[task_name] = set()
+                batch_summary[task_name].add(bcp_47)
+
+            for task_name, languages_set in batch_summary.items():
+                lang_list = ', '.join(sorted(languages_set))
+                print(f"  📋 {task_name}: {lang_list}")
+
+            batch_coroutines = []
+            for task_data in batch:
+                task_func, model, bcp_47, sentence_nr = task_data
+                batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
+            batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
+            all_results.extend(batch_results)
+
+            # Small delay between batches to avoid overwhelming the API
+            await asyncio.sleep(1)
+
+        results = all_results
+        # Filter out exceptions and flatten results
+        valid_results = []
+        exception_count = 0
+        for r in results:
+            if isinstance(r, Exception):
+                exception_count += 1
+                continue
+            if isinstance(r, list):
+                valid_results.extend(r)
+            else:
+                valid_results.append(r)
+
+        print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
+        print(f"✅ Successfully processed {len(valid_results)} evaluations")
+
+        # Save partial results even if some failed
+        if valid_results:
+            results = valid_results
+            args = dict(orient="records", indent=2, force_ascii=False)
+
+            # Aggregate results like main branch
+            results_df = pd.DataFrame(results)
+            if len(results_df) > 0:
+                results_df = (
+                    results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+                    .agg({"score": "mean"})
+                    .reset_index()
+                )
+                # Merge with old results
+                old_results = pd.read_json("results.json")
+                results_df = pd.concat([old_results, results_df])
+                results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+                results_df.to_json("results.json", **args)
+                print(f"💾 Saved {len(results_df)} aggregated results to results.json")
+            else:
+                print("⚠️ No valid results to aggregate")
+        else:
+            print("⚠️ No valid results to save - all API calls failed")
+
+        # Save up-to-date info on models and languages (like main branch)
         all_models = pd.concat([pd.DataFrame(models), old_models])
         all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
         all_models.to_json("models.json", **args)
         pd.DataFrame(languages).to_json("languages.json", **args)
+
+        # Continue with next batch even if this one had errors
+
+    # Time estimation
+    elapsed = time.time() - start_time
+    elapsed_str = str(timedelta(seconds=int(elapsed)))
+    if n_languages < max_languages:
+        remaining_batches = (max_languages - n_languages) // 10
+        batch_count = max(1, n_languages // 10)  # Avoid division by zero
+        estimated_remaining = elapsed * remaining_batches / batch_count
+        eta = datetime.now() + timedelta(seconds=estimated_remaining)
+        print(f"⏱️ Batch completed in {elapsed_str}. ETA for full run: {eta.strftime('%H:%M:%S')}")
+    else:
+        print(f"✅ Full evaluation completed in {elapsed_str}")
+    print(f"🏁 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Save results locally
+    with open("results.json", "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"💾 Results saved to results.json")
+
+    return results


 if __name__ == "__main__":
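For reference, here is a minimal sketch of the batching pattern the loop above relies on: chunked `asyncio.gather` with `return_exceptions=True`, so a single failed call comes back as a value instead of aborting the whole batch. The coroutine, error rate, and batch size below are illustrative, not taken from the project:

import asyncio

async def fake_eval(i):
    # Illustrative stand-in for a single evaluation call.
    if i % 7 == 0:
        raise RuntimeError(f"simulated API error for task {i}")
    await asyncio.sleep(0.01)
    return {"task": i, "score": 1.0}

async def run_in_batches(n_tasks=20, batch_size=5):
    results = []
    for start in range(0, n_tasks, batch_size):
        batch = [fake_eval(i) for i in range(start, min(start + batch_size, n_tasks))]
        # Exceptions are returned as values instead of propagating.
        results.extend(await asyncio.gather(*batch, return_exceptions=True))
        await asyncio.sleep(0.1)  # brief pause between batches
    return [r for r in results if not isinstance(r, Exception)]

print(asyncio.run(run_in_batches()))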
evals/main_gcs.py
ADDED
@@ -0,0 +1,213 @@
+import asyncio
+import pandas as pd
+import time
+import os
+from datetime import datetime, timedelta
+from tqdm.asyncio import tqdm_asyncio
+from models import models
+from tasks import tasks
+from languages import languages
+import json
+
+# Google Cloud Storage imports
+try:
+    from google.cloud import storage
+    GCS_AVAILABLE = True
+    print("✅ Google Cloud Storage available")
+except ImportError:
+    GCS_AVAILABLE = False
+    print("❌ Google Cloud Storage not available - install with: pip install google-cloud-storage")
+
+async def save_results_to_gcs(results, bucket_name="ai-language-eval-results"):
+    """Save results to Google Cloud Storage"""
+    if not GCS_AVAILABLE:
+        print("❌ Google Cloud Storage not available")
+        return
+
+    try:
+        storage_client = storage.Client()
+        bucket = storage_client.bucket(bucket_name)
+
+        # Create bucket if it doesn't exist
+        if not bucket.exists():
+            bucket = storage_client.create_bucket(bucket_name, location="us-central1")
+            print(f"📦 Created bucket: {bucket_name}")
+
+        # Save results with timestamp
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        blob_name = f"results_{timestamp}.json"
+        blob = bucket.blob(blob_name)
+
+        # Convert results to JSON and upload
+        results_json = json.dumps(results, indent=2)
+        blob.upload_from_string(results_json, content_type='application/json')
+
+        print(f"💾 Results saved to GCS: gs://{bucket_name}/{blob_name}")
+        print(f"📥 Download with: gsutil cp gs://{bucket_name}/{blob_name} ./")
+
+        # Also save latest results
+        latest_blob = bucket.blob("results_latest.json")
+        latest_blob.upload_from_string(results_json, content_type='application/json')
+        print(f"💾 Latest results: gs://{bucket_name}/results_latest.json")
+
+    except Exception as e:
+        print(f"❌ Failed to save to GCS: {e}")
+        print("💾 Results saved locally to results.json")
+
+results = pd.DataFrame()
+
+async def evaluate():
+    # FIXME we should not need this for-loop, but it helps
+    n_sentences = int(os.environ.get("N_SENTENCES", 1))  # Default 1 for quick testing
+
+    # Load models and languages
+    models_df = pd.DataFrame(models)
+    languages_df = pd.DataFrame(languages)
+
+    print(f"🚀 Running full evaluation with {len(models_df)} models.")
+    start_time = time.time()
+    print(f"🚀 Starting full evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"📋 Evaluating {n_sentences} sentences per task")
+
+    # Evaluate top languages by speakers (configurable via MAX_LANGUAGES env var)
+    max_languages = int(os.environ.get("MAX_LANGUAGES", 2))  # Default 2 for quick testing
+    top_languages = languages.head(max_languages)  # Top N by population
+    print(f"🌍 Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
+
+    # For testing, just use all available languages up to max_languages
+    for n_languages in [min(max_languages, len(top_languages))]:
+        print(f"running evaluations for {n_languages} languages")
+        old_results = pd.read_json("results.json")
+        if old_results.empty:
+            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for model in models_df["id"]
+            for lang in top_languages.iloc[:n_languages].itertuples()
+            for task_name, task in tasks.items()
+            if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations in batches to prevent HTTP pool exhaustion
+        all_tasks = []
+        for i in range(n_sentences):
+            for model, bcp_47, task_name in combis.itertuples(index=False):
+                # All tasks now use the same signature
+                all_tasks.append((tasks[task_name], model, bcp_47, i))
+
+        print(f"⏳ Processing {len(all_tasks)} evaluation tasks in batches...")
+
+        batch_size = 50  # Process 50 tasks at a time
+        all_results = []
+
+        for i in range(0, len(all_tasks), batch_size):
+            batch = all_tasks[i:i+batch_size]
+            print(f"📦 Processing batch {i//batch_size + 1}/{(len(all_tasks) + batch_size - 1)//batch_size} ({len(batch)} tasks)")
+
+            # Show what's being evaluated in this batch
+            batch_summary = {}
+            for task_data in batch:
+                task_func, model, bcp_47, sentence_nr = task_data
+                # Extract task name from function - handle both partial functions and regular functions
+                if hasattr(task_func, 'func'):
+                    task_name = task_func.func.__name__.replace('_and_evaluate', '')
+                else:
+                    task_name = task_func.__name__.replace('_and_evaluate', '')
+
+                if task_name not in batch_summary:
+                    batch_summary[task_name] = set()
+                batch_summary[task_name].add(bcp_47)
+
+            for task_name, languages_set in batch_summary.items():
+                lang_list = ', '.join(sorted(languages_set))
+                print(f"  📋 {task_name}: {lang_list}")
+
+            batch_coroutines = []
+            for task_data in batch:
+                task_func, model, bcp_47, sentence_nr = task_data
+                batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
+            batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
+            all_results.extend(batch_results)
+
+            # Small delay between batches to avoid overwhelming the API
+            await asyncio.sleep(1)
+
+        results = all_results
+        # Filter out exceptions and flatten results
+        valid_results = []
+        exception_count = 0
+        for r in results:
+            if isinstance(r, Exception):
+                exception_count += 1
+                continue
+            if isinstance(r, list):
+                valid_results.extend(r)
+            else:
+                valid_results.append(r)
+
+        print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
+        print(f"✅ Successfully processed {len(valid_results)} evaluations")
+
+        # Save partial results even if some failed
+        if valid_results:
+            results = valid_results
+            args = dict(orient="records", indent=2, force_ascii=False)
+
+            # Aggregate results like main branch
+            results_df = pd.DataFrame(results)
+            if len(results_df) > 0:
+                results_df = (
+                    results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+                    .agg({"score": "mean"})
+                    .reset_index()
+                )
+                # Merge with old results
+                old_results = pd.read_json("results.json")
+                results_df = pd.concat([old_results, results_df])
+                results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+                results_df.to_json("results.json", **args)
+                print(f"💾 Saved {len(results_df)} aggregated results to results.json")
+            else:
+                print("⚠️ No valid results to aggregate")
+        else:
+            print("⚠️ No valid results to save - all API calls failed")
+
+        # Save up-to-date info on models and languages (like main branch)
+        all_models = pd.concat([pd.DataFrame(models), old_models])
+        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+        all_models.to_json("models.json", **args)
+        pd.DataFrame(languages).to_json("languages.json", **args)
+
+        # Continue with next batch even if this one had errors
+
+    # Time estimation
+    elapsed = time.time() - start_time
+    elapsed_str = str(timedelta(seconds=int(elapsed)))
+    if n_languages < max_languages:
+        remaining_batches = (max_languages - n_languages) // 10
+        batch_count = max(1, n_languages // 10)  # Avoid division by zero
+        estimated_remaining = elapsed * remaining_batches / batch_count
+        eta = datetime.now() + timedelta(seconds=estimated_remaining)
+        print(f"⏱️ Batch completed in {elapsed_str}. ETA for full run: {eta.strftime('%H:%M:%S')}")
+    else:
+        print(f"✅ Full evaluation completed in {elapsed_str}")
+    print(f"🏁 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Save results locally
+    with open("results.json", "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"💾 Results saved to results.json")
+
+    # Save to Google Cloud Storage
+    await save_results_to_gcs(results)
+
+    return results
+
+
+if __name__ == "__main__":
+    results = asyncio.run(evaluate())
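A minimal sketch of the upload half of `save_results_to_gcs`, assuming Application Default Credentials are configured and the bucket already exists; the bucket and blob names here are placeholders, not the project's real ones:

import json
from google.cloud import storage  # pip install google-cloud-storage

def upload_json(data, bucket_name="my-eval-results", blob_name="results_latest.json"):
    client = storage.Client()  # uses Application Default Credentials
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_string(json.dumps(data, indent=2), content_type="application/json")
    return f"gs://{bucket_name}/{blob_name}"

# print(upload_json({"model": "m", "score": 0.5}))  # requires GCP credentials to run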
evals/models.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import json
 import re
 from collections import defaultdict
@@ -211,26 +212,55 @@ google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)

 @cache
 async def complete(**kwargs) -> str | None:
+    # Add longer timeout for slower, premium, or reasoning-focused models
+    model_id = kwargs.get('model', '')
+    slow_model_keywords = [
+        'claude-3.5', 'claude-3.7', 'claude-4', 'sonnet-4',  # Claude
+        'gpt-4', 'o1', 'o3',  # OpenAI
+        'gemini-2.5', 'gemini-pro',  # Google
+        'llama-4',  # Meta
+        'reasoning', 'thinking'  # General
+    ]
+    timeout = 120 if any(keyword in model_id for keyword in slow_model_keywords) else 60
+
     async with openrouter_rate_limit:
         try:
-            response = await client.chat.completions.create(**kwargs)
+            response = await asyncio.wait_for(
+                client.chat.completions.create(**kwargs),
+                timeout=timeout
+            )
         except BadRequestError as e:
             if "filtered" in e.message:
                 return None
             raise e
+        except asyncio.TimeoutError:
+            print(f"⏰ Timeout after {timeout}s for model {model_id}")
+            return None
         if not response.choices:
             raise Exception(response)
         return response.choices[0].message.content.strip()


-translate_client = translate.Client()
+translate_client = None
+
+
+def get_google_translate_client():
+    global translate_client
+    if translate_client is None:
+        translate_client = translate.Client()
+    return translate_client
+
+
+def get_google_supported_languages():
+    client = get_google_translate_client()
+    return [l["language"] for l in client.get_languages()]


 @cache
 async def translate_google(text, source_language, target_language):
+    client = get_google_translate_client()
     async with google_rate_limit:
-        response = translate_client.translate(
+        response = client.translate(
             text, source_language=source_language, target_language=target_language
         )
         return response["translatedText"]
@@ -294,12 +324,14 @@ def get_hf_metadata(row):
         return empty
     try:
         info = api.model_info(id)
+        license = ""
+        if info.card_data and hasattr(info.card_data, 'license') and info.card_data.license:
+            license = (
+                info.card_data.license
+                .replace("-", " ")
+                .replace("mit", "MIT")
+                .title()
+            )
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,
@@ -329,8 +361,30 @@ def load_models(date: date):
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
+    all_model_candidates = set(important_models + popular_models) - set(blocklist)
+
+    # Validate models exist on OpenRouter before including them
+    print(f"🔍 Validating {len(all_model_candidates)} model candidates...")
+    valid_models = []
+    invalid_models = []
+
+    for model_id in all_model_candidates:
+        metadata = get_or_metadata(model_id)
+        if metadata is not None:
+            valid_models.append(model_id)
+        else:
+            invalid_models.append(model_id)
+
+    if invalid_models:
+        print(f"⚠️ Excluded {len(invalid_models)} invalid models:")
+        for model in sorted(invalid_models)[:5]:  # Show first 5
+            print(f"  - {model}")
+        if len(invalid_models) > 5:
+            print(f"  ... and {len(invalid_models) - 5} more")
+
+    print(f"✅ Using {len(valid_models)} valid models for evaluation")
+
+    models = pd.DataFrame(sorted(valid_models), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
@@ -350,7 +404,8 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
+    # Filter out expensive models to keep costs reasonable
+    models = models[models["cost"] <= 20.0].reset_index(drop=True)
     models["tasks"] = [
         ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
     ] * len(models)
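The `complete()` change above wraps the OpenRouter request in `asyncio.wait_for`. A self-contained sketch of that timeout pattern, with a sleeping coroutine standing in for the real API call:

import asyncio

async def slow_api_call():
    await asyncio.sleep(5)  # stand-in for a slow completion request
    return "response"

async def complete_with_timeout(timeout=2):
    try:
        return await asyncio.wait_for(slow_api_call(), timeout=timeout)
    except asyncio.TimeoutError:
        print(f"⏰ Timeout after {timeout}s")
        return None  # degrade gracefully instead of propagating

print(asyncio.run(complete_with_timeout()))  # -> None after 2 seconds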
evals/tasks.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import random
 from functools import partial
 from textwrap import dedent
@@ -13,7 +14,7 @@ from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe, translate_google
+from models import complete, transcribe, translate_google, get_google_supported_languages

 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
@@ -27,9 +28,6 @@ target_languages = languages[languages["in_benchmark"]].sample(
     frac=1, weights="speakers", replace=True, random_state=42
 )

-translate_client = translate.Client()
-supported_languages = [l["language"] for l in translate_client.get_languages()]
-

 async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
@@ -48,6 +46,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
     if model == "google/translate-v2":
+        supported_languages = get_google_supported_languages()
         original_language = closest_supported_match(
             original_language, supported_languages
         )
@@ -91,6 +90,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
+            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
         }
         for metric, score in (
@@ -112,38 +112,21 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    examples = pd.concat(
-        [
-            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
-            for t in top_topics
-        ]
-    ).sample(frac=1, random_state=nr)
-    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
-        frac=1, random_state=42
-    )
-    test_paragraph = test_paragraphs.iloc[nr]
+    test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]

-    messages = []
-    for example in examples.itertuples():
-        messages += [
-            {"role": "user", "content": format_prompt(example.text)},
-            {"role": "assistant", "content": example.topic},
-        ]
+    prompt = f"""Classify the following text into one of these topics: {', '.join(top_topics)}.
+Reply with only the topic name.
+
+Text:
+{test_paragraph.text}
+"""
     # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
     # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
     try:
         pred = await complete(
             model=model,
-            messages=[
-                *messages,
-                {
-                    "role": "user",
-                    "content": format_prompt(test_paragraph.text),
-                },
-            ],
+            messages=[{"role": "user", "content": prompt}],
             temperature=0,
             max_tokens=30,
         )
@@ -170,6 +153,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
+            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": nr,
         }
     ]
@@ -234,30 +218,36 @@ def format_multiple_choice(item):
 C: {item["choices"][2]}
 D: {item["choices"][3]}

+Answer with the letter of the correct answer."""


 async def mmlu_and_evaluate(model, language_bcp_47, nr):
+    ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
     if not task:
         return []
+
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
+
+Response format: <reasoning> #### <letter>
+
+---
+
+{format_multiple_choice(task)}""",
+        },
+    ]
     try:
         response = await complete(
             model=model,
             messages=messages,
             temperature=0,
+            max_tokens=1024,
         )
+        if response and "####" in response:
+            answer = response.split("####")[-1].strip()
+            acc = int(answer[:1] == task["answer"])
         else:
             acc = 0
     except Exception as e:
@@ -265,6 +255,7 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             acc = 0
         else:
             raise e
+
     return [
         {
             "model": model,
@@ -272,32 +263,39 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,  # Add origin tag to results
             "sentence_nr": nr,
         }
     ]


 async def arc_and_evaluate(model, language_bcp_47, nr):
+    ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []

+    messages = [
+        {
+            "role": "user",
+            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
+
+Response format: <reasoning> #### <letter>
+
+---
+
+{format_multiple_choice(task)}""",
+        },
+    ]
     try:
         response = await complete(
             model=model,
             messages=messages,
             temperature=0,
+            max_tokens=1024,
         )
+        if response and "####" in response:
+            answer = response.split("####")[-1].strip()
+            acc = int(answer[:1] == task["answer"])
         else:
             acc = 0
     except Exception as e:
@@ -312,6 +310,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
         }
     ]
@@ -337,28 +336,40 @@ def format_multiple_choice_truthfulqa(item):


 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
+    ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
+
+    # Find the correct answer
+    try:
+        correct_choice_index = task["labels"].index(1)
+        answer = letters[correct_choice_index]
+    except (ValueError, IndexError):
+        # Handle cases where there is no correct answer or labels are malformed
+        return []
+
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
+
+Response format: <reasoning> #### <letter>
+
+---
+
+{format_multiple_choice_truthfulqa(task)}""",
+        },
+    ]
     try:
         response = await complete(
             model=model,
             messages=messages,
             temperature=0,
+            max_tokens=1024,  # Increased for reasoning
         )
+        if response and "####" in response:
+            pred_answer = response.split("####")[-1].strip()
+            acc = int(pred_answer[:1].upper() == answer)
         else:
             acc = 0
     except Exception as e:
@@ -373,30 +384,36 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
         }
     ]


 async def mgsm_and_evaluate(model, language_bcp_47, nr):
-    system_prompt = """
-    Solve the math problem. Use reasoning, and finally give the answer as a number.
-    Response format: <reasoning> #### <number>
-    """
-    system_prompt = dedent(system_prompt).strip()
-    ds_slug, question = load_mgsm(language_bcp_47, nr)
+    ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
+
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
+
+Response format: <reasoning> #### <number>
+
+---
+
+{question["question"]}""",
+        },
+    ]
     response = await complete(
         model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": question["question"]},
-        ],
+        messages=messages,
         temperature=0,
         max_tokens=1024,
     )
+    if response and "####" in response:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
@@ -409,6 +426,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
+            "origin": origin,
             "sentence_nr": nr,
         }
     ]
@@ -449,10 +467,8 @@ tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
-    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
-    # "asr": transcribe_and_evaluate,
 }
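All of the reworked tasks share the `<reasoning> #### <answer>` response convention and grade only the text after the `####` marker. A small sketch of that parsing step (the helper name is illustrative, not from the codebase):

def extract_answer(response: str) -> str | None:
    """Return the text after the final '####' marker, or None if absent."""
    if not response or "####" not in response:
        return None
    return response.split("####")[-1].strip()

assert extract_answer("Let's think step by step... #### B") == "B"
assert extract_answer("no marker here") is None
# Multiple-choice grading then keeps only the first character, e.g. answer[:1] == task["answer"].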
frontend/src/App.js
CHANGED
@@ -19,6 +19,7 @@ function App () {
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
+  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
   const [dialogVisible, setDialogVisible] = useState(false)
   const [aboutVisible, setAboutVisible] = useState(false)
   const [contributeVisible, setContributeVisible] = useState(false)
@@ -36,6 +37,7 @@ function App () {
       })
       .then(jsonData => {
         setData(jsonData)
+        setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
         setLoading(false)
       })
       .catch(err => {
@@ -235,6 +237,7 @@ function App () {
             data={data.model_table}
             selectedLanguages={selectedLanguages}
             allLanguages={data.language_table || []}
+            machineTranslatedMetrics={machineTranslatedMetrics}
           />
           <LanguageTable
             data={data.language_table}
@@ -265,7 +268,7 @@ function App () {
           />
           <Carousel
             value={[
-              <WorldMap data={data.countries} />,
+              <WorldMap data={data.countries} allLanguages={data.language_table} />,
               <LanguagePlot data={data} />,
               <SpeakerPlot data={data} />,
               <HistoryPlot data={data} />,
@@ -430,6 +433,7 @@ function App () {
             value={[
               <WorldMap
                 data={data.countries}
+                allLanguages={data.language_table}
                 width={windowWidth * 0.7}
                 height={windowHeight * 0.6}
               />,
frontend/src/components/ModelTable.js
CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
 import ScoreColumns from './ScoreColumns'
-const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
+const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
   const [filters, setFilters] = useState({
     type: { value: null, matchMode: FilterMatchMode.IN },
     size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -155,17 +155,27 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }

   const getHeaderText = () => {
+    // Count languages that have any evaluation data (any task scores available)
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      // Check if language has any task scores (not just average)
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length

     if (selectedLanguages.length === 0) {
       return (
         <span>
           <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
           <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
+            Performance across {evaluatedLanguagesCount} evaluated languages
           </span>
         </span>
       )
@@ -249,7 +259,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
         body={costBodyTemplate}
         style={{ minWidth: '5rem' }}
       />
-      {ScoreColumns}
+      {ScoreColumns(machineTranslatedMetrics)}
     </DataTable>
   )
 }
frontend/src/components/ScoreColumns.js
CHANGED
@@ -2,21 +2,22 @@ import { Column } from 'primereact/column'
 import ScoreField from './ScoreField'

 const scoreBodyTemplate = (field, options = {}) => {
-  const { minScore = 0, maxScore = 1 } = options
+  const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options

   return rowData => {
     const score = rowData[field]
-    return ScoreField(score, minScore, maxScore)
+    const isMachineTranslated = machineTranslatedMetrics.includes(field)
+    return ScoreField(score, minScore, maxScore, isMachineTranslated)
   }
 }

-const ScoreColumns = [
+const ScoreColumns = (machineTranslatedMetrics = []) => [
   <Column
     field='average'
     header='Proficiency'
     headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
-    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
+    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
@@ -26,7 +27,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -37,7 +39,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -48,7 +51,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -69,7 +73,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -80,7 +85,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('arc_accuracy', {
      minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
    })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -91,7 +97,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('mgsm_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
frontend/src/components/ScoreField.js
CHANGED
@@ -1,4 +1,4 @@
-const ScoreField = (score, minScore, maxScore) => {
+const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
   let percentage = 100
   let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
   if (score !== null) {
@@ -50,6 +50,7 @@ const ScoreField = (score, minScore, maxScore) => {
       }}
     >
       {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
+      {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
     </span>
   </div>
 )
frontend/src/components/WorldMap.js
CHANGED
@@ -32,7 +32,7 @@ const makeTitle = data => d => {
   return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
 }

-const WorldMap = ({ data, width = 750, height = 500 }) => {
+const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()

@@ -48,8 +48,22 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
       acc[country.iso2] = country
       return acc
     }, {})
+    // Count languages that have any evaluation data
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length
+
     const plot = Plot.plot({
+      subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
       width: width,
       height: height,
       projection: 'equal-earth',
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
-
"commonvoice_hours":
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
@@ -79,7 +79,7 @@
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
-
"commonvoice_hours":
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
@@ -151,7 +151,7 @@
|
|
| 151 |
"family":"Austronesian",
|
| 152 |
"flores_path":"ind_Latn",
|
| 153 |
"fleurs_tag":"id_id",
|
| 154 |
-
"commonvoice_hours":
|
| 155 |
"commonvoice_locale":"id",
|
| 156 |
"in_benchmark":true
|
| 157 |
},
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
-
"commonvoice_hours":
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169 |
},
|
|
@@ -439,7 +439,7 @@
|
|
| 439 |
"family":"Indo-European",
|
| 440 |
"flores_path":"pol_Latn",
|
| 441 |
"fleurs_tag":"pl_pl",
|
| 442 |
-
"commonvoice_hours":
|
| 443 |
"commonvoice_locale":"pl",
|
| 444 |
"in_benchmark":true
|
| 445 |
},
|
|
@@ -619,7 +619,7 @@
|
|
| 619 |
"family":"Indo-European",
|
| 620 |
"flores_path":"nld_Latn",
|
| 621 |
"fleurs_tag":"nl_nl",
|
| 622 |
-
"commonvoice_hours":
|
| 623 |
"commonvoice_locale":"nl",
|
| 624 |
"in_benchmark":true
|
| 625 |
},
|
|
@@ -1291,7 +1291,7 @@
|
|
| 1291 |
"family":"Indo-European",
|
| 1292 |
"flores_path":"cat_Latn",
|
| 1293 |
"fleurs_tag":"ca_es",
|
| 1294 |
-
"commonvoice_hours":
|
| 1295 |
"commonvoice_locale":"ca",
|
| 1296 |
"in_benchmark":true
|
| 1297 |
},
|
|
@@ -1303,7 +1303,7 @@
|
|
| 1303 |
"family":"Afro-Asiatic",
|
| 1304 |
"flores_path":"heb_Hebr",
|
| 1305 |
"fleurs_tag":"he_il",
|
| 1306 |
-
"commonvoice_hours":1.
|
| 1307 |
"commonvoice_locale":"he",
|
| 1308 |
"in_benchmark":true
|
| 1309 |
},
|
|
@@ -1375,7 +1375,7 @@
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
-
"commonvoice_hours":
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
@@ -1675,7 +1675,7 @@
|
|
| 1675 |
"family":"Tupian",
|
| 1676 |
"flores_path":"gug_Latn",
|
| 1677 |
"fleurs_tag":null,
|
| 1678 |
-
"commonvoice_hours":4.
|
| 1679 |
"commonvoice_locale":"gn",
|
| 1680 |
"in_benchmark":true
|
| 1681 |
},
|
|
@@ -1747,7 +1747,7 @@
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
-
"commonvoice_hours":
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
@@ -2155,7 +2155,7 @@
|
|
| 2155 |
"family":"Kartvelian",
|
| 2156 |
"flores_path":"kat_Geor",
|
| 2157 |
"fleurs_tag":"ka_ge",
|
| 2158 |
-
"commonvoice_hours":
|
| 2159 |
"commonvoice_locale":"ka",
|
| 2160 |
"in_benchmark":true
|
| 2161 |
},
|
|
@@ -2167,7 +2167,7 @@
|
|
| 2167 |
"family":"Indo-European",
|
| 2168 |
"flores_path":"glg_Latn",
|
| 2169 |
"fleurs_tag":"gl_es",
|
| 2170 |
-
"commonvoice_hours":
|
| 2171 |
"commonvoice_locale":"gl",
|
| 2172 |
"in_benchmark":true
|
| 2173 |
},
|
|
@@ -3331,7 +3331,7 @@
|
|
| 3331 |
"family":"Indo-European",
|
| 3332 |
"flores_path":"gle_Latn",
|
| 3333 |
"fleurs_tag":"ga_ie",
|
| 3334 |
-
"commonvoice_hours":
|
| 3335 |
"commonvoice_locale":"ga-IE",
|
| 3336 |
"in_benchmark":true
|
| 3337 |
},
|
|
@@ -3559,7 +3559,7 @@
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
-
"commonvoice_hours":
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
@@ -3679,7 +3679,7 @@
|
|
| 3679 |
"family":"Indo-European",
|
| 3680 |
"flores_path":"ydd_Hebr",
|
| 3681 |
"fleurs_tag":null,
|
| 3682 |
-
"commonvoice_hours":
|
| 3683 |
"commonvoice_locale":"yi",
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
|
@@ -5011,7 +5011,7 @@
|
|
| 5011 |
"family":"Nakh-Daghestanian",
|
| 5012 |
"flores_path":"dar_Cyrl",
|
| 5013 |
"fleurs_tag":null,
|
| 5014 |
-
"commonvoice_hours":0.
|
| 5015 |
"commonvoice_locale":"dar",
|
| 5016 |
"in_benchmark":true
|
| 5017 |
},
|
| 7 | "family":"Indo-European",
| 8 | "flores_path":"eng_Latn",
| 9 | "fleurs_tag":"en_us",
| 10 | +"commonvoice_hours":2679.0,
| 11 | "commonvoice_locale":"en",
| 12 | "in_benchmark":true
| 13 | },
| 79 | "family":"Indo-European",
| 80 | "flores_path":"fra_Latn",
| 81 | "fleurs_tag":"fr_fr",
| 82 | +"commonvoice_hours":1068.0,
| 83 | "commonvoice_locale":"fr",
| 84 | "in_benchmark":true
| 85 | },
| 151 | "family":"Austronesian",
| 152 | "flores_path":"ind_Latn",
| 153 | "fleurs_tag":"id_id",
| 154 | +"commonvoice_hours":34.0,
| 155 | "commonvoice_locale":"id",
| 156 | "in_benchmark":true
| 157 | },
| 163 | "family":"Indo-European",
| 164 | "flores_path":"deu_Latn",
| 165 | "fleurs_tag":"de_de",
| 166 | +"commonvoice_hours":1371.0,
| 167 | "commonvoice_locale":"de",
| 168 | "in_benchmark":true
| 169 | },
| 439 | "family":"Indo-European",
| 440 | "flores_path":"pol_Latn",
| 441 | "fleurs_tag":"pl_pl",
| 442 | +"commonvoice_hours":176.0,
| 443 | "commonvoice_locale":"pl",
| 444 | "in_benchmark":true
| 445 | },
| 619 | "family":"Indo-European",
| 620 | "flores_path":"nld_Latn",
| 621 | "fleurs_tag":"nl_nl",
| 622 | +"commonvoice_hours":123.0,
| 623 | "commonvoice_locale":"nl",
| 624 | "in_benchmark":true
| 625 | },
| 1291 | "family":"Indo-European",
| 1292 | "flores_path":"cat_Latn",
| 1293 | "fleurs_tag":"ca_es",
| 1294 | +"commonvoice_hours":2878.0,
| 1295 | "commonvoice_locale":"ca",
| 1296 | "in_benchmark":true
| 1297 | },
| 1303 | "family":"Afro-Asiatic",
| 1304 | "flores_path":"heb_Hebr",
| 1305 | "fleurs_tag":"he_il",
| 1306 | +"commonvoice_hours":1.7,
| 1307 | "commonvoice_locale":"he",
| 1308 | "in_benchmark":true
| 1309 | },
| 1375 | "family":"Turkic",
| 1376 | "flores_path":"uig_Arab",
| 1377 | "fleurs_tag":null,
| 1378 | +"commonvoice_hours":427.0,
| 1379 | "commonvoice_locale":"ug",
| 1380 | "in_benchmark":true
| 1381 | },
| 1675 | "family":"Tupian",
| 1676 | "flores_path":"gug_Latn",
| 1677 | "fleurs_tag":null,
| 1678 | +"commonvoice_hours":4.1,
| 1679 | "commonvoice_locale":"gn",
| 1680 | "in_benchmark":true
| 1681 | },
| 1747 | "family":"Indo-European",
| 1748 | "flores_path":"nob_Latn",
| 1749 | "fleurs_tag":"nb_no",
| 1750 | +"commonvoice_hours":1.5,
| 1751 | "commonvoice_locale":"nb-NO",
| 1752 | "in_benchmark":true
| 1753 | },
| 2155 | "family":"Kartvelian",
| 2156 | "flores_path":"kat_Geor",
| 2157 | "fleurs_tag":"ka_ge",
| 2158 | +"commonvoice_hours":167.0,
| 2159 | "commonvoice_locale":"ka",
| 2160 | "in_benchmark":true
| 2161 | },
| 2167 | "family":"Indo-European",
| 2168 | "flores_path":"glg_Latn",
| 2169 | "fleurs_tag":"gl_es",
| 2170 | +"commonvoice_hours":129.0,
| 2171 | "commonvoice_locale":"gl",
| 2172 | "in_benchmark":true
| 2173 | },
| 3331 | "family":"Indo-European",
| 3332 | "flores_path":"gle_Latn",
| 3333 | "fleurs_tag":"ga_ie",
| 3334 | +"commonvoice_hours":9.1,
| 3335 | "commonvoice_locale":"ga-IE",
| 3336 | "in_benchmark":true
| 3337 | },
| 3559 | "family":"Abkhaz-Adyge",
| 3560 | "flores_path":null,
| 3561 | "fleurs_tag":null,
| 3562 | +"commonvoice_hours":94.0,
| 3563 | "commonvoice_locale":"kbd",
| 3564 | "in_benchmark":false
| 3565 | },
| 3679 | "family":"Indo-European",
| 3680 | "flores_path":"ydd_Hebr",
| 3681 | "fleurs_tag":null,
| 3682 | +"commonvoice_hours":1.4,
| 3683 | "commonvoice_locale":"yi",
| 3684 | "in_benchmark":true
| 3685 | },
| 5011 | "family":"Nakh-Daghestanian",
| 5012 | "flores_path":"dar_Cyrl",
| 5013 | "fleurs_tag":null,
| 5014 | +"commonvoice_hours":0.9,
| 5015 | "commonvoice_locale":"dar",
| 5016 | "in_benchmark":true
| 5017 | },
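The updated `commonvoice_hours` values above are plain floats on each language record. As a minimal sketch of how a consumer might read them — the field names are taken from the diff above, but the snippet itself is illustrative and not code from `evals/`:

```python
import json

# Load the language metadata updated in this commit; each record carries
# "flores_path", "fleurs_tag", "commonvoice_hours", "commonvoice_locale",
# and "in_benchmark", as visible in the diff above.
with open("languages.json") as f:
    languages = json.load(f)

# Rank benchmark languages by validated Common Voice hours (None-safe sort).
benchmark = [lang for lang in languages if lang.get("in_benchmark")]
benchmark.sort(key=lambda lang: lang.get("commonvoice_hours") or 0.0, reverse=True)
for lang in benchmark[:5]:
    print(lang["commonvoice_locale"], lang["commonvoice_hours"])
```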
models.json
CHANGED
@@ -1,4 +1,44 @@
| 1 | [
| 2 | {
| 3 | "id":"amazon\/nova-micro-v1",
| 4 | "name":"Nova Micro 1.0",
@@ -19,6 +59,66 @@
| 19 | "mgsm"
| 20 | ]
| 21 | },
| 22 | {
| 23 | "id":"anthropic\/claude-3.5-sonnet",
| 24 | "name":"Claude 3.5 Sonnet",
@@ -79,6 +179,106 @@
| 79 | "mgsm"
| 80 | ]
| 81 | },
| 82 | {
| 83 | "id":"deepseek\/deepseek-chat",
| 84 | "name":"DeepSeek V3",
@@ -128,7 +328,7 @@
| 128 | "size":684531386000.0,
| 129 | "type":"open-source",
| 130 | "license":"Mit",
| 131 | -"creation_date":1737331200000
| 132 | "tasks":[
| 133 | "translation_from",
| 134 | "translation_to",
@@ -179,6 +379,26 @@
| 179 | "mgsm"
| 180 | ]
| 181 | },
| 182 | {
| 183 | "id":"google\/gemini-2.0-flash-lite-001",
| 184 | "name":"Gemini 2.0 Flash Lite",
@@ -219,6 +439,26 @@
| 219 | "mgsm"
| 220 | ]
| 221 | },
| 222 | {
| 223 | "id":"google\/gemini-2.5-flash-lite-preview-06-17",
| 224 | "name":"Gemini 2.5 Flash Lite Preview 06-17",
@@ -370,15 +610,15 @@
| 370 | ]
| 371 | },
| 372 | {
| 373 | -"id":"google\/
| 374 | -"name":"
| 375 | "provider_name":"Google",
| 376 | -"cost":
| 377 | -"hf_id":
| 378 | -"size":
| 379 | -"type":"
| 380 | -"license":
| 381 | -"creation_date":
| 382 | "tasks":[
| 383 | "translation_from",
| 384 | "translation_to",
@@ -390,30 +630,35 @@
| 390 | ]
| 391 | },
| 392 | {
| 393 | -"id":"google\/
| 394 | -"name":"
| 395 | "provider_name":"Google",
| 396 | -"cost":
| 397 | -"hf_id":
| 398 | -"size":
| 399 | -"type":"
| 400 | -"license":
| 401 | -"creation_date":
| 402 | "tasks":[
| 403 | "translation_from",
| 404 | -"translation_to"
| 405 | ]
| 406 | },
| 407 | {
| 408 | -"id":"
| 409 | -"name":"
| 410 | -"provider_name":"
| 411 | -"cost":0.
| 412 | -"hf_id":"
| 413 | -"size":
| 414 | "type":"open-source",
| 415 | -"license":"
| 416 | -"creation_date":
| 417 | "tasks":[
| 418 | "translation_from",
| 419 | "translation_to",
@@ -425,15 +670,15 @@
| 425 | ]
| 426 | },
| 427 | {
| 428 | -"id":"
| 429 | -"name":"
| 430 | -"provider_name":"
| 431 | -"cost":0.
| 432 | -"hf_id":"
| 433 | -"size":
| 434 | "type":"open-source",
| 435 | -"license":"
| 436 | -"creation_date":
| 437 | "tasks":[
| 438 | "translation_from",
| 439 | "translation_to",
@@ -445,15 +690,15 @@
| 445 | ]
| 446 | },
| 447 | {
| 448 | -"id":"
| 449 | -"name":"
| 450 | -"provider_name":"
| 451 | -"cost":0.
| 452 | -"hf_id":"
| 453 | -"size":
| 454 | "type":"open-source",
| 455 | -"license":"
| 456 | -"creation_date":
| 457 | "tasks":[
| 458 | "translation_from",
| 459 | "translation_to",
@@ -465,9 +710,164 @@
| 465 | ]
| 466 | },
| 467 | {
| 468 | -"id":"
| 469 | -"name":"
| 470 | -"provider_name":"
| 471 | "cost":0.0,
| 472 | "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
| 473 | "size":8030261248.0,
@@ -476,6 +876,26 @@
| 476 | "creation_date":1721260800000.0,
| 477 | "tasks":null
| 478 | },
| 479 | {
| 480 | "id":"meta-llama\/llama-3.2-1b-instruct",
| 481 | "name":"Llama 3.2 1B Instruct",
@@ -488,6 +908,26 @@
| 488 | "creation_date":1726617600000.0,
| 489 | "tasks":null
| 490 | },
| 491 | {
| 492 | "id":"meta-llama\/llama-3.3-70b-instruct",
| 493 | "name":"Llama 3.3 70B Instruct",
@@ -529,15 +969,295 @@
| 529 | ]
| 530 | },
| 531 | {
| 532 | -"id":"
| 533 | -"name":"
| 534 | -"provider_name":"
| 535 | -"cost":0.
| 536 | -"hf_id":"
| 537 | -"size":
| 538 | -"type":"open-source",
| 539 | -"license":"
| 540 | -"creation_date":
| 541 | "tasks":[
| 542 | "translation_from",
| 543 | "translation_to",
@@ -549,15 +1269,15 @@
| 549 | ]
| 550 | },
| 551 | {
| 552 | -"id":"
| 553 | -"name":"
| 554 | -"provider_name":"
| 555 | -"cost":0.
| 556 | -"hf_id":"
| 557 | -"size":
| 558 | "type":"open-source",
| 559 | -"license":"
| 560 | -"creation_date":
| 561 | "tasks":[
| 562 | "translation_from",
| 563 | "translation_to",
@@ -569,15 +1289,15 @@
| 569 | ]
| 570 | },
| 571 | {
| 572 | -"id":"mistralai\/
| 573 | -"name":"
| 574 | "provider_name":"Mistral",
| 575 | -"cost":0.
| 576 | -"hf_id":"mistralai\/
| 577 | -"size":
| 578 | "type":"open-source",
| 579 | "license":"Apache 2.0",
| 580 | -"creation_date":
| 581 | "tasks":[
| 582 | "translation_from",
| 583 | "translation_to",
@@ -589,15 +1309,15 @@
| 589 | ]
| 590 | },
| 591 | {
| 592 | -"id":"
| 593 | -"name":"
| 594 | -"provider_name":"
| 595 | -"cost":0.
| 596 | -"hf_id":
| 597 | "size":null,
| 598 | -"type":"
| 599 | -"license":
| 600 | -"creation_date":
| 601 | "tasks":[
| 602 | "translation_from",
| 603 | "translation_to",
@@ -609,15 +1329,15 @@
| 609 | ]
| 610 | },
| 611 | {
| 612 | -"id":"
| 613 | -"name":"
| 614 | -"provider_name":"
| 615 | -"cost":
| 616 | -"hf_id":
| 617 | -"size":
| 618 | -"type":"
| 619 | -"license":
| 620 | -"creation_date":
| 621 | "tasks":[
| 622 | "translation_from",
| 623 | "translation_to",
@@ -708,6 +1428,26 @@
| 708 | "mgsm"
| 709 | ]
| 710 | },
| 711 | {
| 712 | "id":"openai\/gpt-4o-mini",
| 713 | "name":"GPT-4o-mini",
@@ -728,6 +1468,86 @@
| 728 | "mgsm"
| 729 | ]
| 730 | },
| 731 | {
| 732 | "id":"qwen\/qwen3-235b-a22b",
| 733 | "name":"Qwen3 235B A22B",
@@ -787,5 +1607,185 @@
| 787 | "truthfulqa",
| 788 | "mgsm"
| 789 | ]
| 790 | }
| 791 | ]
| 1 | [
| 2 | +{
| 3 | +"id":"aion-labs\/aion-1.0-mini",
| 4 | +"name":"Aion-1.0-Mini",
| 5 | +"provider_name":"AionLabs",
| 6 | +"cost":1.4,
| 7 | +"hf_id":"FuseAI\/FuseO1-DeepSeekR1-QwQ-SkyT1-32B-Preview",
| 8 | +"size":32763876352.0,
| 9 | +"type":"open-source",
| 10 | +"license":"Apache 2.0",
| 11 | +"creation_date":1737331200000.0,
| 12 | +"tasks":[
| 13 | +"translation_from",
| 14 | +"translation_to",
| 15 | +"classification",
| 16 | +"mmlu",
| 17 | +"arc",
| 18 | +"truthfulqa",
| 19 | +"mgsm"
| 20 | +]
| 21 | +},
| 22 | +{
| 23 | +"id":"aion-labs\/aion-rp-llama-3.1-8b",
| 24 | +"name":"Aion-RP 1.0 (8B)",
| 25 | +"provider_name":"AionLabs",
| 26 | +"cost":0.2,
| 27 | +"hf_id":"aion-labs\/Aion-RP-Llama-3.1-8B",
| 28 | +"size":8030261248.0,
| 29 | +"type":"open-source",
| 30 | +"license":"Apache 2.0",
| 31 | +"creation_date":1731110400000,
| 32 | +"tasks":[
| 33 | +"translation_from",
| 34 | +"translation_to",
| 35 | +"classification",
| 36 | +"mmlu",
| 37 | +"arc",
| 38 | +"truthfulqa",
| 39 | +"mgsm"
| 40 | +]
| 41 | +},
| 42 | {
| 43 | "id":"amazon\/nova-micro-v1",
| 44 | "name":"Nova Micro 1.0",
| 59 | "mgsm"
| 60 | ]
| 61 | },
| 62 | +{
| 63 | +"id":"amazon\/nova-pro-v1",
| 64 | +"name":"Nova Pro 1.0",
| 65 | +"provider_name":"Amazon",
| 66 | +"cost":3.2,
| 67 | +"hf_id":null,
| 68 | +"size":null,
| 69 | +"type":"closed-source",
| 70 | +"license":null,
| 71 | +"creation_date":1733356800000,
| 72 | +"tasks":[
| 73 | +"translation_from",
| 74 | +"translation_to",
| 75 | +"classification",
| 76 | +"mmlu",
| 77 | +"arc",
| 78 | +"truthfulqa",
| 79 | +"mgsm"
| 80 | +]
| 81 | +},
| 82 | +{
| 83 | +"id":"anthracite-org\/magnum-v4-72b",
| 84 | +"name":"Magnum v4 72B",
| 85 | +"provider_name":"Magnum v4 72B",
| 86 | +"cost":3.0,
| 87 | +"hf_id":"anthracite-org\/magnum-v4-72b",
| 88 | +"size":72706203648.0,
| 89 | +"type":"open-source",
| 90 | +"license":"Apache 2.0",
| 91 | +"creation_date":1726790400000.0,
| 92 | +"tasks":[
| 93 | +"translation_from",
| 94 | +"translation_to",
| 95 | +"classification",
| 96 | +"mmlu",
| 97 | +"arc",
| 98 | +"truthfulqa",
| 99 | +"mgsm"
| 100 | +]
| 101 | +},
| 102 | +{
| 103 | +"id":"anthropic\/claude-3-haiku",
| 104 | +"name":"Claude 3 Haiku",
| 105 | +"provider_name":"Anthropic",
| 106 | +"cost":1.25,
| 107 | +"hf_id":null,
| 108 | +"size":null,
| 109 | +"type":"closed-source",
| 110 | +"license":null,
| 111 | +"creation_date":1710288000000,
| 112 | +"tasks":[
| 113 | +"translation_from",
| 114 | +"translation_to",
| 115 | +"classification",
| 116 | +"mmlu",
| 117 | +"arc",
| 118 | +"truthfulqa",
| 119 | +"mgsm"
| 120 | +]
| 121 | +},
| 122 | {
| 123 | "id":"anthropic\/claude-3.5-sonnet",
| 124 | "name":"Claude 3.5 Sonnet",
| 179 | "mgsm"
| 180 | ]
| 181 | },
| 182 | +{
| 183 | +"id":"arcee-ai\/maestro-reasoning",
| 184 | +"name":"Maestro Reasoning",
| 185 | +"provider_name":"Arcee AI",
| 186 | +"cost":3.3,
| 187 | +"hf_id":null,
| 188 | +"size":null,
| 189 | +"type":"closed-source",
| 190 | +"license":null,
| 191 | +"creation_date":1746403200000.0,
| 192 | +"tasks":[
| 193 | +"translation_from",
| 194 | +"translation_to",
| 195 | +"classification",
| 196 | +"mmlu",
| 197 | +"arc",
| 198 | +"truthfulqa",
| 199 | +"mgsm"
| 200 | +]
| 201 | +},
| 202 | +{
| 203 | +"id":"cognitivecomputations\/dolphin3.0-r1-mistral-24b",
| 204 | +"name":"Dolphin3.0 R1 Mistral 24B",
| 205 | +"provider_name":"Dolphin3.0 R1 Mistral 24B (free)",
| 206 | +"cost":0.0,
| 207 | +"hf_id":"dphn\/Dolphin3.0-R1-Mistral-24B",
| 208 | +"size":23572423680.0,
| 209 | +"type":"open-source",
| 210 | +"license":"",
| 211 | +"creation_date":1738800000000.0,
| 212 | +"tasks":[
| 213 | +"translation_from",
| 214 | +"translation_to",
| 215 | +"classification",
| 216 | +"mmlu",
| 217 | +"arc",
| 218 | +"truthfulqa",
| 219 | +"mgsm"
| 220 | +]
| 221 | +},
| 222 | +{
| 223 | +"id":"cohere\/command",
| 224 | +"name":"Command",
| 225 | +"provider_name":"Cohere",
| 226 | +"cost":2.0,
| 227 | +"hf_id":null,
| 228 | +"size":null,
| 229 | +"type":"closed-source",
| 230 | +"license":null,
| 231 | +"creation_date":1710374400000.0,
| 232 | +"tasks":[
| 233 | +"translation_from",
| 234 | +"translation_to",
| 235 | +"classification",
| 236 | +"mmlu",
| 237 | +"arc",
| 238 | +"truthfulqa",
| 239 | +"mgsm"
| 240 | +]
| 241 | +},
| 242 | +{
| 243 | +"id":"cohere\/command-r",
| 244 | +"name":"Command R",
| 245 | +"provider_name":"Cohere",
| 246 | +"cost":1.5,
| 247 | +"hf_id":null,
| 248 | +"size":null,
| 249 | +"type":"closed-source",
| 250 | +"license":null,
| 251 | +"creation_date":1710374400000.0,
| 252 | +"tasks":[
| 253 | +"translation_from",
| 254 | +"translation_to",
| 255 | +"classification",
| 256 | +"mmlu",
| 257 | +"arc",
| 258 | +"truthfulqa",
| 259 | +"mgsm"
| 260 | +]
| 261 | +},
| 262 | +{
| 263 | +"id":"cohere\/command-r7b-12-2024",
| 264 | +"name":"Command R7B (12-2024)",
| 265 | +"provider_name":"Cohere",
| 266 | +"cost":0.15,
| 267 | +"hf_id":null,
| 268 | +"size":null,
| 269 | +"type":"closed-source",
| 270 | +"license":null,
| 271 | +"creation_date":1734134400000.0,
| 272 | +"tasks":[
| 273 | +"translation_from",
| 274 | +"translation_to",
| 275 | +"classification",
| 276 | +"mmlu",
| 277 | +"arc",
| 278 | +"truthfulqa",
| 279 | +"mgsm"
| 280 | +]
| 281 | +},
| 282 | {
| 283 | "id":"deepseek\/deepseek-chat",
| 284 | "name":"DeepSeek V3",
| 328 | "size":684531386000.0,
| 329 | "type":"open-source",
| 330 | "license":"Mit",
| 331 | +"creation_date":1737331200000,
| 332 | "tasks":[
| 333 | "translation_from",
| 334 | "translation_to",
| 379 | "mgsm"
| 380 | ]
| 381 | },
| 382 | +{
| 383 | +"id":"google\/gemini-2.0-flash-exp",
| 384 | +"name":"Gemini 2.0 Flash Experimental",
| 385 | +"provider_name":"Google",
| 386 | +"cost":0.0,
| 387 | +"hf_id":null,
| 388 | +"size":null,
| 389 | +"type":"closed-source",
| 390 | +"license":null,
| 391 | +"creation_date":1733875200000,
| 392 | +"tasks":[
| 393 | +"translation_from",
| 394 | +"translation_to",
| 395 | +"classification",
| 396 | +"mmlu",
| 397 | +"arc",
| 398 | +"truthfulqa",
| 399 | +"mgsm"
| 400 | +]
| 401 | +},
| 402 | {
| 403 | "id":"google\/gemini-2.0-flash-lite-001",
| 404 | "name":"Gemini 2.0 Flash Lite",
| 439 | "mgsm"
| 440 | ]
| 441 | },
| 442 | +{
| 443 | +"id":"google\/gemini-2.5-flash-lite",
| 444 | +"name":"Gemini 2.5 Flash Lite",
| 445 | +"provider_name":"Google",
| 446 | +"cost":0.4,
| 447 | +"hf_id":null,
| 448 | +"size":null,
| 449 | +"type":"closed-source",
| 450 | +"license":null,
| 451 | +"creation_date":1753142400000,
| 452 | +"tasks":[
| 453 | +"translation_from",
| 454 | +"translation_to",
| 455 | +"classification",
| 456 | +"mmlu",
| 457 | +"arc",
| 458 | +"truthfulqa",
| 459 | +"mgsm"
| 460 | +]
| 461 | +},
| 462 | {
| 463 | "id":"google\/gemini-2.5-flash-lite-preview-06-17",
| 464 | "name":"Gemini 2.5 Flash Lite Preview 06-17",
| 610 | ]
| 611 | },
| 612 | {
| 613 | +"id":"google\/gemini-pro-1.5",
| 614 | +"name":"Gemini 1.5 Pro",
| 615 | "provider_name":"Google",
| 616 | +"cost":5.0,
| 617 | +"hf_id":null,
| 618 | +"size":null,
| 619 | +"type":"closed-source",
| 620 | +"license":null,
| 621 | +"creation_date":1712620800000.0,
| 622 | "tasks":[
| 623 | "translation_from",
| 624 | "translation_to",
| 630 | ]
| 631 | },
| 632 | {
| 633 | +"id":"google\/gemma-2-9b-it",
| 634 | +"name":"Gemma 2 9B",
| 635 | "provider_name":"Google",
| 636 | +"cost":0.0,
| 637 | +"hf_id":"google\/gemma-2-9b-it",
| 638 | +"size":9241705984.0,
| 639 | +"type":"open-source",
| 640 | +"license":"Gemma",
| 641 | +"creation_date":1719187200000.0,
| 642 | "tasks":[
| 643 | "translation_from",
| 644 | +"translation_to",
| 645 | +"classification",
| 646 | +"mmlu",
| 647 | +"arc",
| 648 | +"truthfulqa",
| 649 | +"mgsm"
| 650 | ]
| 651 | },
| 652 | {
| 653 | +"id":"google\/gemma-3-27b-it",
| 654 | +"name":"Gemma 3 27B",
| 655 | +"provider_name":"Google",
| 656 | +"cost":0.0,
| 657 | +"hf_id":"google\/gemma-3-27b-it",
| 658 | +"size":27432406640.0,
| 659 | "type":"open-source",
| 660 | +"license":"Gemma",
| 661 | +"creation_date":1740787200000,
| 662 | "tasks":[
| 663 | "translation_from",
| 664 | "translation_to",
| 670 | ]
| 671 | },
| 672 | {
| 673 | +"id":"google\/gemma-3n-e2b-it",
| 674 | +"name":"Gemma 3n 2B",
| 675 | +"provider_name":"Google",
| 676 | +"cost":0.0,
| 677 | +"hf_id":"google\/gemma-3n-E2B-it",
| 678 | +"size":5439438272.0,
| 679 | "type":"open-source",
| 680 | +"license":"Gemma",
| 681 | +"creation_date":1749686400000,
| 682 | "tasks":[
| 683 | "translation_from",
| 684 | "translation_to",
| 690 | ]
| 691 | },
| 692 | {
| 693 | +"id":"google\/gemma-3n-e4b-it",
| 694 | +"name":"Gemma 3n 4B",
| 695 | +"provider_name":"Google",
| 696 | +"cost":0.0,
| 697 | +"hf_id":"google\/gemma-3n-E4B-it",
| 698 | +"size":7849978192.0,
| 699 | "type":"open-source",
| 700 | +"license":"Gemma",
| 701 | +"creation_date":1748908800000.0,
| 702 | "tasks":[
| 703 | "translation_from",
| 704 | "translation_to",
| 710 | ]
| 711 | },
| 712 | {
| 713 | +"id":"google\/translate-v2",
| 714 | +"name":"Google Translate",
| 715 | +"provider_name":"Google",
| 716 | +"cost":20.0,
| 717 | +"hf_id":null,
| 718 | +"size":null,
| 719 | +"type":"closed-source",
| 720 | +"license":null,
| 721 | +"creation_date":null,
| 722 | +"tasks":[
| 723 | +"translation_from",
| 724 | +"translation_to"
| 725 | +]
| 726 | +},
| 727 | +{
| 728 | +"id":"gryphe\/mythomax-l2-13b",
| 729 | +"name":"MythoMax 13B",
| 730 | +"provider_name":"MythoMax 13B",
| 731 | +"cost":0.06,
| 732 | +"hf_id":"Gryphe\/MythoMax-L2-13b",
| 733 | +"size":null,
| 734 | +"type":"open-source",
| 735 | +"license":"Other",
| 736 | +"creation_date":1691625600000,
| 737 | +"tasks":[
| 738 | +"translation_from",
| 739 | +"translation_to",
| 740 | +"classification",
| 741 | +"mmlu",
| 742 | +"arc",
| 743 | +"truthfulqa",
| 744 | +"mgsm"
| 745 | +]
| 746 | +},
| 747 | +{
| 748 | +"id":"inception\/mercury",
| 749 | +"name":"Mercury",
| 750 | +"provider_name":"Inception",
| 751 | +"cost":1.0,
| 752 | +"hf_id":null,
| 753 | +"size":null,
| 754 | +"type":"closed-source",
| 755 | +"license":null,
| 756 | +"creation_date":1750896000000,
| 757 | +"tasks":[
| 758 | +"translation_from",
| 759 | +"translation_to",
| 760 | +"classification",
| 761 | +"mmlu",
| 762 | +"arc",
| 763 | +"truthfulqa",
| 764 | +"mgsm"
| 765 | +]
| 766 | +},
| 767 | +{
| 768 | +"id":"inflection\/inflection-3-productivity",
| 769 | +"name":"Inflection 3 Productivity",
| 770 | +"provider_name":"Inflection",
| 771 | +"cost":10.0,
| 772 | +"hf_id":null,
| 773 | +"size":null,
| 774 | +"type":"closed-source",
| 775 | +"license":null,
| 776 | +"creation_date":1728604800000,
| 777 | +"tasks":[
| 778 | +"translation_from",
| 779 | +"translation_to",
| 780 | +"classification",
| 781 | +"mmlu",
| 782 | +"arc",
| 783 | +"truthfulqa",
| 784 | +"mgsm"
| 785 | +]
| 786 | +},
| 787 | +{
| 788 | +"id":"meta-llama\/llama-3-70b-instruct",
| 789 | +"name":"Llama 3 70B Instruct",
| 790 | +"provider_name":"Meta",
| 791 | +"cost":0.4,
| 792 | +"hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
| 793 | +"size":70553706496.0,
| 794 | +"type":"open-source",
| 795 | +"license":"Llama3",
| 796 | +"creation_date":1713312000000,
| 797 | +"tasks":[
| 798 | +"translation_from",
| 799 | +"translation_to",
| 800 | +"classification",
| 801 | +"mmlu",
| 802 | +"arc",
| 803 | +"truthfulqa",
| 804 | +"mgsm"
| 805 | +]
| 806 | +},
| 807 | +{
| 808 | +"id":"meta-llama\/llama-3-8b-instruct",
| 809 | +"name":"Llama 3 8B Instruct",
| 810 | +"provider_name":"Meta",
| 811 | +"cost":0.06,
| 812 | +"hf_id":"meta-llama\/Meta-Llama-3-8B-Instruct",
| 813 | +"size":8030261248.0,
| 814 | +"type":"open-source",
| 815 | +"license":"Llama3",
| 816 | +"creation_date":1713312000000.0,
| 817 | +"tasks":[
| 818 | +"translation_from",
| 819 | +"translation_to",
| 820 | +"classification",
| 821 | +"mmlu",
| 822 | +"arc",
| 823 | +"truthfulqa",
| 824 | +"mgsm"
| 825 | +]
| 826 | +},
| 827 | +{
| 828 | +"id":"meta-llama\/llama-3.1-405b-instruct",
| 829 | +"name":"Llama 3.1 405B Instruct",
| 830 | +"provider_name":"Meta",
| 831 | +"cost":0.0,
| 832 | +"hf_id":"meta-llama\/Llama-3.1-405B-Instruct",
| 833 | +"size":405853388800.0,
| 834 | +"type":"open-source",
| 835 | +"license":"Llama3.1",
| 836 | +"creation_date":1721088000000,
| 837 | +"tasks":[
| 838 | +"translation_from",
| 839 | +"translation_to",
| 840 | +"classification",
| 841 | +"mmlu",
| 842 | +"arc",
| 843 | +"truthfulqa",
| 844 | +"mgsm"
| 845 | +]
| 846 | +},
| 847 | +{
| 848 | +"id":"meta-llama\/llama-3.1-70b-instruct",
| 849 | +"name":"Llama 3.1 70B Instruct",
| 850 | +"provider_name":"Meta",
| 851 | +"cost":0.28,
| 852 | +"hf_id":"meta-llama\/Llama-3.1-70B-Instruct",
| 853 | +"size":70553706496.0,
| 854 | +"type":"open-source",
| 855 | +"license":"Llama3.1",
| 856 | +"creation_date":1721088000000,
| 857 | +"tasks":[
| 858 | +"translation_from",
| 859 | +"translation_to",
| 860 | +"classification",
| 861 | +"mmlu",
| 862 | +"arc",
| 863 | +"truthfulqa",
| 864 | +"mgsm"
| 865 | +]
| 866 | +},
| 867 | +{
| 868 | +"id":"meta-llama\/llama-3.1-8b-instruct",
| 869 | +"name":"Llama 3.1 8B Instruct",
| 870 | +"provider_name":"Meta",
| 871 | "cost":0.0,
| 872 | "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
| 873 | "size":8030261248.0,
| 876 | "creation_date":1721260800000.0,
| 877 | "tasks":null
| 878 | },
| 879 | +{
| 880 | +"id":"meta-llama\/llama-3.2-11b-vision-instruct",
| 881 | +"name":"Llama 3.2 11B Vision Instruct",
| 882 | +"provider_name":"Meta",
| 883 | +"cost":0.0,
| 884 | +"hf_id":"meta-llama\/Llama-3.2-11B-Vision-Instruct",
| 885 | +"size":10670220835.0,
| 886 | +"type":"open-source",
| 887 | +"license":"Llama3.2",
| 888 | +"creation_date":1726617600000,
| 889 | +"tasks":[
| 890 | +"translation_from",
| 891 | +"translation_to",
| 892 | +"classification",
| 893 | +"mmlu",
| 894 | +"arc",
| 895 | +"truthfulqa",
| 896 | +"mgsm"
| 897 | +]
| 898 | +},
| 899 | {
| 900 | "id":"meta-llama\/llama-3.2-1b-instruct",
| 901 | "name":"Llama 3.2 1B Instruct",
| 908 | "creation_date":1726617600000.0,
| 909 | "tasks":null
| 910 | },
| 911 | +{
| 912 | +"id":"meta-llama\/llama-3.2-3b-instruct",
| 913 | +"name":"Llama 3.2 3B Instruct",
| 914 | +"provider_name":"Meta",
| 915 | +"cost":0.0,
| 916 | +"hf_id":"meta-llama\/Llama-3.2-3B-Instruct",
| 917 | +"size":3212749824.0,
| 918 | +"type":"open-source",
| 919 | +"license":"Llama3.2",
| 920 | +"creation_date":1726617600000.0,
| 921 | +"tasks":[
| 922 | +"translation_from",
| 923 | +"translation_to",
| 924 | +"classification",
| 925 | +"mmlu",
| 926 | +"arc",
| 927 | +"truthfulqa",
| 928 | +"mgsm"
| 929 | +]
| 930 | +},
| 931 | {
| 932 | "id":"meta-llama\/llama-3.3-70b-instruct",
| 933 | "name":"Llama 3.3 70B Instruct",
| 969 | ]
| 970 | },
| 971 | {
| 972 | +"id":"meta-llama\/llama-guard-4-12b",
| 973 | +"name":"Llama Guard 4 12B",
| 974 | +"provider_name":"Meta",
| 975 | +"cost":0.05,
| 976 | +"hf_id":"meta-llama\/Llama-Guard-4-12B",
| 977 | +"size":12001097216.0,
| 978 | +"type":"open-source",
| 979 | +"license":"Other",
| 980 | +"creation_date":1745366400000,
| 981 | +"tasks":[
| 982 | +"translation_from",
| 983 | +"translation_to",
| 984 | +"classification",
| 985 | +"mmlu",
| 986 | +"arc",
| 987 | +"truthfulqa",
| 988 | +"mgsm"
| 989 | +]
| 990 | +},
| 991 | +{
| 992 | +"id":"microsoft\/phi-3.5-mini-128k-instruct",
| 993 | +"name":"Phi-3.5 Mini 128K Instruct",
| 994 | +"provider_name":"Microsoft",
| 995 | +"cost":0.1,
| 996 | +"hf_id":"microsoft\/Phi-3.5-mini-instruct",
| 997 | +"size":3821079552.0,
| 998 | +"type":"open-source",
| 999 | +"license":"Mit",
| 1000 | +"creation_date":1723766400000.0,
| 1001 | +"tasks":[
| 1002 | +"translation_from",
| 1003 | +"translation_to",
| 1004 | +"classification",
| 1005 | +"mmlu",
| 1006 | +"arc",
| 1007 | +"truthfulqa",
| 1008 | +"mgsm"
| 1009 | +]
| 1010 | +},
| 1011 | +{
| 1012 | +"id":"microsoft\/phi-4",
| 1013 | +"name":"Phi 4",
| 1014 | +"provider_name":"Microsoft",
| 1015 | +"cost":0.14,
| 1016 | +"hf_id":"microsoft\/phi-4",
| 1017 | +"size":14659507200.0,
| 1018 | +"type":"open-source",
| 1019 | +"license":"Mit",
| 1020 | +"creation_date":1733875200000,
| 1021 | +"tasks":[
| 1022 | +"translation_from",
| 1023 | +"translation_to",
| 1024 | +"classification",
| 1025 | +"mmlu",
| 1026 | +"arc",
| 1027 | +"truthfulqa",
| 1028 | +"mgsm"
| 1029 | +]
| 1030 | +},
| 1031 | +{
| 1032 | +"id":"microsoft\/phi-4-multimodal-instruct",
| 1033 | +"name":"Phi 4 Multimodal Instruct",
| 1034 | +"provider_name":"Microsoft",
| 1035 | +"cost":0.1,
| 1036 | +"hf_id":"microsoft\/Phi-4-multimodal-instruct",
| 1037 | +"size":5574460384.0,
| 1038 | +"type":"open-source",
| 1039 | +"license":"Mit",
| 1040 | +"creation_date":1740355200000,
| 1041 | +"tasks":[
| 1042 | +"translation_from",
| 1043 | +"translation_to",
| 1044 | +"classification",
| 1045 | +"mmlu",
| 1046 | +"arc",
| 1047 | +"truthfulqa",
| 1048 | +"mgsm"
| 1049 | +]
| 1050 | +},
| 1051 | +{
| 1052 | +"id":"microsoft\/wizardlm-2-8x22b",
| 1053 | +"name":"WizardLM-2 8x22B",
| 1054 | +"provider_name":"WizardLM-2 8x22B",
| 1055 | +"cost":0.48,
| 1056 | +"hf_id":null,
| 1057 | +"size":null,
| 1058 | +"type":"closed-source",
| 1059 | +"license":null,
| 1060 | +"creation_date":1713225600000,
| 1061 | +"tasks":[
| 1062 | +"translation_from",
| 1063 | +"translation_to",
| 1064 | +"classification",
| 1065 | +"mmlu",
| 1066 | +"arc",
| 1067 | +"truthfulqa",
| 1068 | +"mgsm"
| 1069 | +]
| 1070 | +},
| 1071 | +{
| 1072 | +"id":"mistralai\/codestral-2501",
| 1073 | +"name":"Codestral 2501",
| 1074 | +"provider_name":"Mistral",
| 1075 | +"cost":0.9,
| 1076 | +"hf_id":null,
| 1077 | +"size":null,
| 1078 | +"type":"closed-source",
| 1079 | +"license":null,
| 1080 | +"creation_date":1736812800000.0,
| 1081 | +"tasks":[
| 1082 | +"translation_from",
| 1083 | +"translation_to",
| 1084 | +"classification",
| 1085 | +"mmlu",
| 1086 | +"arc",
| 1087 | +"truthfulqa",
| 1088 | +"mgsm"
| 1089 | +]
| 1090 | +},
| 1091 | +{
| 1092 | +"id":"mistralai\/devstral-small-2505",
| 1093 | +"name":"Devstral Small 2505",
| 1094 | +"provider_name":"Mistral",
| 1095 | +"cost":0.0,
| 1096 | +"hf_id":"mistralai\/Devstral-Small-2505",
| 1097 | +"size":23572403200.0,
| 1098 | +"type":"open-source",
| 1099 | +"license":"Apache 2.0",
| 1100 | +"creation_date":1747008000000.0,
| 1101 | +"tasks":[
| 1102 | +"translation_from",
| 1103 | +"translation_to",
| 1104 | +"classification",
| 1105 | +"mmlu",
| 1106 | +"arc",
| 1107 | +"truthfulqa",
| 1108 | +"mgsm"
| 1109 | +]
| 1110 | +},
| 1111 | +{
| 1112 | +"id":"mistralai\/magistral-small-2506",
| 1113 | +"name":"Magistral Small 2506",
| 1114 | +"provider_name":"Mistral",
| 1115 | +"cost":1.5,
| 1116 | +"hf_id":"mistralai\/Magistral-Small-2506",
| 1117 | +"size":23572403200.0,
| 1118 | +"type":"open-source",
| 1119 | +"license":"Apache 2.0",
| 1120 | +"creation_date":1748995200000.0,
| 1121 | +"tasks":[
| 1122 | +"translation_from",
| 1123 | +"translation_to",
| 1124 | +"classification",
| 1125 | +"mmlu",
| 1126 | +"arc",
| 1127 | +"truthfulqa",
| 1128 | +"mgsm"
| 1129 | +]
| 1130 | +},
| 1131 | +{
| 1132 | +"id":"mistralai\/ministral-8b",
| 1133 | +"name":"Ministral 8B",
| 1134 | +"provider_name":"Mistral",
| 1135 | +"cost":0.1,
| 1136 | +"hf_id":null,
| 1137 | +"size":null,
| 1138 | +"type":"closed-source",
| 1139 | +"license":null,
| 1140 | +"creation_date":1729123200000,
| 1141 | +"tasks":[
| 1142 | +"translation_from",
| 1143 | +"translation_to",
| 1144 | +"classification",
| 1145 | +"mmlu",
| 1146 | +"arc",
| 1147 | +"truthfulqa",
| 1148 | +"mgsm"
| 1149 | +]
| 1150 | +},
| 1151 | +{
| 1152 | +"id":"mistralai\/mistral-7b-instruct",
| 1153 | +"name":"Mistral 7B Instruct",
| 1154 | +"provider_name":"Mistral",
| 1155 | +"cost":0.0,
| 1156 | +"hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
| 1157 | +"size":7248023552.0,
| 1158 | +"type":"open-source",
| 1159 | +"license":"Apache 2.0",
| 1160 | +"creation_date":1716336000000.0,
| 1161 | +"tasks":[
| 1162 | +"translation_from",
| 1163 | +"translation_to",
| 1164 | +"classification",
| 1165 | +"mmlu",
| 1166 | +"arc",
| 1167 | +"truthfulqa",
| 1168 | +"mgsm"
| 1169 | +]
| 1170 | +},
| 1171 | +{
| 1172 | +"id":"mistralai\/mistral-medium-3",
| 1173 | +"name":"Mistral Medium 3",
| 1174 | +"provider_name":"Mistral",
| 1175 | +"cost":2.0,
| 1176 | +"hf_id":null,
| 1177 | +"size":null,
| 1178 | +"type":"closed-source",
| 1179 | +"license":null,
| 1180 | +"creation_date":1746576000000.0,
| 1181 | +"tasks":[
| 1182 | +"translation_from",
| 1183 | +"translation_to",
| 1184 | +"classification",
| 1185 | +"mmlu",
| 1186 | +"arc",
| 1187 | +"truthfulqa",
| 1188 | +"mgsm"
| 1189 | +]
| 1190 | +},
| 1191 | +{
| 1192 | +"id":"mistralai\/mistral-nemo",
| 1193 | +"name":"Mistral Nemo",
| 1194 | +"provider_name":"Mistral",
| 1195 | +"cost":0.0,
| 1196 | +"hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
| 1197 | +"size":12247782400.0,
| 1198 | +"type":"open-source",
| 1199 | +"license":"Apache 2.0",
| 1200 | +"creation_date":1721174400000,
| 1201 | +"tasks":[
| 1202 | +"translation_from",
| 1203 | +"translation_to",
| 1204 | +"classification",
| 1205 | +"mmlu",
| 1206 | +"arc",
| 1207 | +"truthfulqa",
| 1208 | +"mgsm"
| 1209 | +]
| 1210 | +},
| 1211 | +{
| 1212 | +"id":"mistralai\/mistral-saba",
| 1213 | +"name":"Saba",
| 1214 | +"provider_name":"Mistral",
| 1215 | +"cost":0.6,
| 1216 | +"hf_id":null,
| 1217 | +"size":null,
| 1218 | +"type":"closed-source",
| 1219 | +"license":null,
| 1220 | +"creation_date":1739750400000,
| 1221 | +"tasks":[
| 1222 | +"translation_from",
| 1223 | +"translation_to",
| 1224 | +"classification",
| 1225 | +"mmlu",
| 1226 | +"arc",
| 1227 | +"truthfulqa",
| 1228 | +"mgsm"
| 1229 | +]
| 1230 | +},
| 1231 | +{
| 1232 | +"id":"mistralai\/mistral-small-3.1-24b-instruct",
| 1233 | +"name":"Mistral Small 3.1 24B",
| 1234 | +"provider_name":"Mistral",
| 1235 | +"cost":0.0,
| 1236 | +"hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
| 1237 | +"size":24011361280.0,
| 1238 | +"type":"open-source",
| 1239 | +"license":"Apache 2.0",
| 1240 | +"creation_date":1741651200000,
| 1241 | +"tasks":[
| 1242 | +"translation_from",
| 1243 | +"translation_to",
| 1244 | +"classification",
| 1245 | +"mmlu",
| 1246 | +"arc",
| 1247 | +"truthfulqa",
| 1248 | +"mgsm"
| 1249 | +]
| 1250 | +},
| 1251 | +{
| 1252 | +"id":"mistralai\/mistral-tiny",
| 1253 | +"name":"Mistral Tiny",
| 1254 | +"provider_name":"Mistral Tiny",
| 1255 | +"cost":0.25,
| 1256 | +"hf_id":null,
| 1257 | +"size":null,
| 1258 | +"type":"closed-source",
| 1259 | +"license":null,
| 1260 | +"creation_date":1704844800000.0,
| 1261 | "tasks":[
| 1262 | "translation_from",
| 1263 | "translation_to",
| 1269 | ]
| 1270 | },
| 1271 | {
| 1272 | +"id":"mistralai\/mixtral-8x22b-instruct",
| 1273 | +"name":"Mixtral 8x22B Instruct",
| 1274 | +"provider_name":"Mistral",
| 1275 | +"cost":0.9,
| 1276 | +"hf_id":"mistralai\/Mixtral-8x22B-Instruct-v0.1",
| 1277 | +"size":140630071296.0,
| 1278 | "type":"open-source",
| 1279 | +"license":"Apache 2.0",
| 1280 | +"creation_date":1713225600000.0,
| 1281 | "tasks":[
| 1282 | "translation_from",
| 1283 | "translation_to",
| 1289 | ]
| 1290 | },
| 1291 | {
| 1292 | +"id":"mistralai\/pixtral-12b",
| 1293 | +"name":"Pixtral 12B",
| 1294 | "provider_name":"Mistral",
| 1295 | +"cost":0.1,
| 1296 | +"hf_id":"mistralai\/Pixtral-12B-2409",
| 1297 | +"size":null,
| 1298 | "type":"open-source",
| 1299 | "license":"Apache 2.0",
| 1300 | +"creation_date":1726012800000.0,
| 1301 | "tasks":[
| 1302 | "translation_from",
| 1303 | "translation_to",
| 1309 | ]
| 1310 | },
| 1311 | {
| 1312 | +"id":"moonshotai\/kimi-k2",
| 1313 | +"name":"Kimi K2",
| 1314 | +"provider_name":"MoonshotAI",
| 1315 | +"cost":0.0,
| 1316 | +"hf_id":"moonshotai\/Kimi-K2-Instruct",
| 1317 | "size":null,
| 1318 | +"type":"open-source",
| 1319 | +"license":"Other",
| 1320 | +"creation_date":1752192000000,
| 1321 | "tasks":[
| 1322 | "translation_from",
| 1323 | "translation_to",
| 1329 | ]
| 1330 | },
| 1331 | {
| 1332 | +"id":"morph\/morph-v3-fast",
| 1333 | +"name":"Morph V3 Fast",
| 1334 | +"provider_name":"Morph",
| 1335 | +"cost":2.7,
| 1336 | +"hf_id":null,
| 1337 | +"size":null,
| 1338 | +"type":"closed-source",
| 1339 | +"license":null,
| 1340 | +"creation_date":1751846400000.0,
| 1341 | "tasks":[
| 1342 | "translation_from",
| 1343 | "translation_to",
| 1428 | "mgsm"
| 1429 | ]
| 1430 | },
| 1431 | +{
| 1432 | +"id":"openai\/gpt-4o-2024-11-20",
| 1433 | +"name":"GPT-4o (2024-11-20)",
| 1434 | +"provider_name":"OpenAI",
| 1435 | +"cost":10.0,
| 1436 | +"hf_id":null,
| 1437 | +"size":null,
| 1438 | +"type":"closed-source",
| 1439 | +"license":null,
| 1440 | +"creation_date":1732060800000,
| 1441 | +"tasks":[
| 1442 | +"translation_from",
| 1443 | +"translation_to",
| 1444 | +"classification",
| 1445 | +"mmlu",
| 1446 | +"arc",
| 1447 | +"truthfulqa",
| 1448 | +"mgsm"
| 1449 | +]
| 1450 | +},
| 1451 | {
| 1452 | "id":"openai\/gpt-4o-mini",
| 1453 | "name":"GPT-4o-mini",
| 1468 | "mgsm"
| 1469 | ]
| 1470 | },
| 1471 | +{
| 1472 | +"id":"perplexity\/r1-1776",
| 1473 | +"name":"R1 1776",
| 1474 | +"provider_name":"Perplexity",
| 1475 | +"cost":8.0,
| 1476 | +"hf_id":"perplexity-ai\/r1-1776",
| 1477 | +"size":671026419200.0,
| 1478 | +"type":"open-source",
| 1479 | +"license":"Mit",
| 1480 | +"creation_date":1739836800000.0,
| 1481 | +"tasks":[
| 1482 | +"translation_from",
| 1483 | +"translation_to",
| 1484 | +"classification",
| 1485 | +"mmlu",
| 1486 | +"arc",
| 1487 | +"truthfulqa",
| 1488 | +"mgsm"
| 1489 | +]
| 1490 | +},
| 1491 | +{
| 1492 | +"id":"qwen\/qwen-2.5-72b-instruct",
| 1493 | +"name":"Qwen2.5 72B Instruct",
| 1494 | +"provider_name":"Qwen2.5 72B Instruct (free)",
| 1495 | +"cost":0.0,
| 1496 | +"hf_id":"Qwen\/Qwen2.5-72B-Instruct",
| 1497 | +"size":72706203648.0,
| 1498 | +"type":"open-source",
| 1499 | +"license":"Other",
| 1500 | +"creation_date":1726444800000.0,
| 1501 | +"tasks":[
| 1502 | +"translation_from",
| 1503 | +"translation_to",
| 1504 | +"classification",
| 1505 | +"mmlu",
| 1506 | +"arc",
| 1507 | +"truthfulqa",
| 1508 | +"mgsm"
| 1509 | +]
| 1510 | +},
| 1511 | +{
| 1512 | +"id":"qwen\/qwen-2.5-7b-instruct",
| 1513 | +"name":"Qwen2.5 7B Instruct",
| 1514 | +"provider_name":"Qwen2.5 7B Instruct",
| 1515 | +"cost":0.1,
| 1516 | +"hf_id":"Qwen\/Qwen2.5-7B-Instruct",
| 1517 | +"size":7615616512.0,
| 1518 | +"type":"open-source",
| 1519 | +"license":"Apache 2.0",
| 1520 | +"creation_date":1726444800000,
| 1521 | +"tasks":[
| 1522 | +"translation_from",
| 1523 | +"translation_to",
| 1524 | +"classification",
| 1525 | +"mmlu",
| 1526 | +"arc",
| 1527 | +"truthfulqa",
| 1528 | +"mgsm"
| 1529 | +]
| 1530 | +},
| 1531 | +{
| 1532 | +"id":"qwen\/qwen-2.5-coder-32b-instruct",
| 1533 | +"name":"Qwen2.5 Coder 32B Instruct",
| 1534 | +"provider_name":"Qwen2.5 Coder 32B Instruct (free)",
| 1535 | +"cost":0.0,
| 1536 | +"hf_id":"Qwen\/Qwen2.5-Coder-32B-Instruct",
| 1537 | +"size":32763876352.0,
| 1538 | +"type":"open-source",
| 1539 | +"license":"Apache 2.0",
| 1540 | +"creation_date":1730851200000.0,
| 1541 | +"tasks":[
| 1542 | +"translation_from",
| 1543 | +"translation_to",
| 1544 | +"classification",
| 1545 | +"mmlu",
| 1546 | +"arc",
| 1547 | +"truthfulqa",
| 1548 | +"mgsm"
| 1549 | +]
| 1550 | +},
| 1551 | {
| 1552 | "id":"qwen\/qwen3-235b-a22b",
| 1553 | "name":"Qwen3 235B A22B",
| 1607 | "truthfulqa",
| 1608 | "mgsm"
| 1609 | ]
| 1610 | +},
| 1611 | +{
| 1612 | +"id":"scb10x\/llama3.1-typhoon2-70b-instruct",
| 1613 | +"name":"Typhoon2 70B Instruct",
| 1614 | +"provider_name":"Typhoon2 70B Instruct",
| 1615 | +"cost":0.88,
| 1616 | +"hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
| 1617 | +"size":70553706496.0,
| 1618 | +"type":"open-source",
| 1619 | +"license":"Llama3.1",
| 1620 | +"creation_date":1734220800000.0,
| 1621 | +"tasks":[
| 1622 | +"translation_from",
| 1623 | +"translation_to",
| 1624 | +"classification",
| 1625 | +"mmlu",
| 1626 | +"arc",
| 1627 | +"truthfulqa",
| 1628 | +"mgsm"
| 1629 | +]
| 1630 | +},
| 1631 | +{
| 1632 | +"id":"sophosympatheia\/midnight-rose-70b",
| 1633 | +"name":"Midnight Rose 70B",
| 1634 | +"provider_name":"Midnight Rose 70B",
| 1635 | +"cost":0.8,
| 1636 | +"hf_id":"sophosympatheia\/Midnight-Rose-70B-v2.0.3",
| 1637 | +"size":68976648192.0,
| 1638 | +"type":"open-source",
| 1639 | +"license":"Llama2",
| 1640 | +"creation_date":1707004800000,
| 1641 | +"tasks":[
| 1642 | +"translation_from",
| 1643 | +"translation_to",
| 1644 | +"classification",
| 1645 | +"mmlu",
| 1646 | +"arc",
| 1647 | +"truthfulqa",
| 1648 | +"mgsm"
| 1649 | +]
| 1650 | +},
| 1651 | +{
| 1652 | +"id":"switchpoint\/router",
| 1653 | +"name":"Switchpoint Router",
| 1654 | +"provider_name":"Switchpoint Router",
| 1655 | +"cost":3.4,
| 1656 | +"hf_id":null,
| 1657 | +"size":null,
| 1658 | +"type":"closed-source",
| 1659 | +"license":null,
| 1660 | +"creation_date":1752192000000.0,
| 1661 | +"tasks":[
| 1662 | +"translation_from",
| 1663 | +"translation_to",
| 1664 | +"classification",
| 1665 | +"mmlu",
| 1666 | +"arc",
| 1667 | +"truthfulqa",
| 1668 | +"mgsm"
| 1669 | +]
| 1670 | +},
| 1671 | +{
| 1672 | +"id":"thedrummer\/anubis-pro-105b-v1",
| 1673 | +"name":"Anubis Pro 105B V1",
| 1674 | +"provider_name":"TheDrummer",
| 1675 | +"cost":1.0,
| 1676 | +"hf_id":"TheDrummer\/Anubis-Pro-105B-v1",
| 1677 | +"size":104779882496.0,
| 1678 | +"type":"open-source",
| 1679 | +"license":"Other",
| 1680 | +"creation_date":1738454400000.0,
| 1681 | +"tasks":[
| 1682 | +"translation_from",
| 1683 | +"translation_to",
| 1684 | +"classification",
| 1685 | +"mmlu",
| 1686 | +"arc",
| 1687 | +"truthfulqa",
| 1688 | +"mgsm"
| 1689 | +]
| 1690 | +},
| 1691 | +{
| 1692 | +"id":"thedrummer\/skyfall-36b-v2",
| 1693 | +"name":"Skyfall 36B V2",
| 1694 | +"provider_name":"TheDrummer",
| 1695 | +"cost":0.07,
| 1696 | +"hf_id":"TheDrummer\/Skyfall-36B-v2",
| 1697 | +"size":36910535680.0,
| 1698 | +"type":"open-source",
| 1699 | +"license":"Other",
| 1700 | +"creation_date":1738540800000.0,
| 1701 | +"tasks":[
| 1702 | +"translation_from",
| 1703 | +"translation_to",
| 1704 | +"classification",
| 1705 | +"mmlu",
| 1706 | +"arc",
| 1707 | +"truthfulqa",
| 1708 | +"mgsm"
| 1709 | +]
| 1710 | +},
| 1711 | +{
| 1712 | +"id":"thedrummer\/unslopnemo-12b",
| 1713 | +"name":"UnslopNemo 12B",
| 1714 | +"provider_name":"TheDrummer",
| 1715 | +"cost":0.4,
| 1716 | +"hf_id":"TheDrummer\/UnslopNemo-12B-v4.1",
| 1717 | +"size":12247782400.0,
| 1718 | +"type":"open-source",
| 1719 | +"license":"",
| 1720 | +"creation_date":1729641600000.0,
| 1721 | +"tasks":[
| 1722 | +"translation_from",
| 1723 | +"translation_to",
| 1724 | +"classification",
| 1725 | +"mmlu",
| 1726 | +"arc",
| 1727 | +"truthfulqa",
| 1728 | +"mgsm"
| 1729 | +]
| 1730 | +},
| 1731 | +{
| 1732 | +"id":"thedrummer\/valkyrie-49b-v1",
| 1733 | +"name":"Valkyrie 49B V1",
| 1734 | +"provider_name":"TheDrummer",
| 1735 | +"cost":1.0,
| 1736 | +"hf_id":"TheDrummer\/Valkyrie-49B-v1",
| 1737 | +"size":49867145216.0,
| 1738 | +"type":"open-source",
| 1739 | +"license":"",
| 1740 | +"creation_date":1747440000000,
| 1741 | +"tasks":[
| 1742 | +"translation_from",
| 1743 | +"translation_to",
| 1744 | +"classification",
| 1745 | +"mmlu",
| 1746 | +"arc",
| 1747 | +"truthfulqa",
| 1748 | +"mgsm"
| 1749 | +]
| 1750 | +},
| 1751 | +{
| 1752 | +"id":"x-ai\/grok-3-beta",
| 1753 | +"name":"Grok 3 Beta",
| 1754 | +"provider_name":"xAI",
| 1755 | +"cost":15.0,
| 1756 | +"hf_id":null,
| 1757 | +"size":null,
| 1758 | +"type":"closed-source",
| 1759 | +"license":null,
| 1760 | +"creation_date":1744156800000.0,
| 1761 | +"tasks":[
| 1762 | +"translation_from",
| 1763 | +"translation_to",
| 1764 | +"classification",
| 1765 | +"mmlu",
| 1766 | +"arc",
| 1767 | +"truthfulqa",
| 1768 | +"mgsm"
| 1769 | +]
| 1770 | +},
| 1771 | +{
| 1772 | +"id":"z-ai\/glm-4.5-air",
| 1773 | +"name":"GLM 4.5 Air",
| 1774 | +"provider_name":"Z.AI",
| 1775 | +"cost":0.0,
| 1776 | +"hf_id":"zai-org\/GLM-4.5-Air",
| 1777 | +"size":110468824832.0,
| 1778 | +"type":"open-source",
| 1779 | +"license":"Mit",
| 1780 | +"creation_date":1752969600000.0,
| 1781 | +"tasks":[
| 1782 | +"translation_from",
| 1783 | +"translation_to",
| 1784 | +"classification",
| 1785 | +"mmlu",
| 1786 | +"arc",
| 1787 | +"truthfulqa",
| 1788 | +"mgsm"
| 1789 | +]
| 1790 | }
| 1791 | ]
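All entries above share one schema. As a hedged sketch of downstream use — field names come from the diff, `cost` appears to be USD per million tokens (matching the "$20/1M tokens" filter in the architecture diagram below), and the snippet is illustrative rather than code from the repo:

```python
import json

with open("models.json") as f:
    models = json.load(f)

# Select free open-source models that are wired up for the MGSM task.
# Some entries (e.g. meta-llama/llama-3.1-8b-instruct) have "tasks":null,
# so guard against that before membership-testing.
free_mgsm = [
    m["id"]
    for m in models
    if m["type"] == "open-source" and m["cost"] == 0.0
    and m["tasks"] and "mgsm" in m["tasks"]
]
print(len(free_mgsm), "free MGSM-capable models")
```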
pyproject.toml
CHANGED
@@ -36,6 +36,9 @@ dev = [
| 36 | "tqdm>=4.67.1",
| 37 | "transformers>=4.51.3",
| 38 | ]
| 39 |
| 40 | [dependency-groups]
| 41 | dev = [
@@ -44,3 +47,10 @@ dev = [
| 44 | "scipy>=1.16.0",
| 45 | "seaborn>=0.13.2",
| 46 | ]

| 36 | "tqdm>=4.67.1",
| 37 | "transformers>=4.51.3",
| 38 | ]
| 39 | +cloud = [
| 40 | +"google-cloud-storage>=3.2.0",
| 41 | +]
| 42 |
| 43 | [dependency-groups]
| 44 | dev = [
| 47 | "scipy>=1.16.0",
| 48 | "seaborn>=0.13.2",
| 49 | ]
| 50 | +
| 51 | +[build-system]
| 52 | +requires = ["hatchling"]
| 53 | +build-backend = "hatchling.build"
| 54 | +
| 55 | +[tool.hatch.build.targets.wheel]
| 56 | +packages = ["evals"]
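The new `cloud` extra isolates the GCS dependency used by the cloud evaluation path, and the hatchling stanza makes `evals` an installable package. A hypothetical usage sketch — it assumes `cloud` sits under `[project.optional-dependencies]` (so it installs via `uv sync --extra cloud`); the function below is illustrative, not taken from the repo:

```python
# Guarded import: google-cloud-storage is only present when the optional
# "cloud" extra is installed, so local runs degrade gracefully without it.
try:
    from google.cloud import storage
except ImportError:
    storage = None

def upload_results(bucket_name: str, path: str = "results.json") -> None:
    """Upload the aggregated results file to GCS when the extra is available."""
    if storage is None:
        print("google-cloud-storage not installed; skipping upload")
        return
    client = storage.Client()
    client.bucket(bucket_name).blob(path).upload_from_filename(path)
```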
results.json
CHANGED
The diff for this file is too large to render. See raw diff.
system_architecture_diagram.md
CHANGED
@@ -17,11 +17,15 @@ flowchart TD
| 17 | G --> H["Enriched Model DataFrame"]
| 18 | H --> |Save| I[models.json]
| 19 |
| 20 | %% Language Data
| 21 | J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]
| 22 |
| 23 | -%% Task Registry
| 24 | -L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions"]
| 25 | M --> M1["translation_from/to<br/>BLEU + ChrF"]
| 26 | M --> M2["classification<br/>Accuracy"]
| 27 | M --> M3["mmlu<br/>Accuracy"]
@@ -29,39 +33,47 @@
| 29 | M --> M5["truthfulqa<br/>Accuracy"]
| 30 | M --> M6["mgsm<br/>Accuracy"]
| 31 |
| 32 | %% Evaluation Pipeline
| 33 | -
| 34 | K --> |"languages bcp_47"| N
| 35 | L --> |"tasks.items"| N
| 36 | N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model × Language × Task"]
| 37 | -O --> |"10 samples each"| P["Evaluation Execution"]
| 38 | -
| 39 | -%% Task Execution
| 40 | -P --> Q1[translate_and_evaluate]
| 41 | -P --> Q2[classify_and_evaluate]
| 42 | -P --> Q3[mmlu_and_evaluate]
| 43 | -P --> Q4[arc_and_evaluate]
| 44 | -P --> Q5[truthfulqa_and_evaluate]
| 45 | -P --> Q6[mgsm_and_evaluate]
| 46 | -
| 47 | -%% API Calls
| 48 | -Q1 --> |"complete() API"| R["OpenRouter<br/>Model Inference"]
| 49 | -Q2 --> |"complete() API"| R
| 50 | -Q3 --> |"complete() API"| R
| 51 | -Q4 --> |"complete() API"| R
| 52 | -Q5 --> |"complete() API"| R
| 53 | -Q6 --> |"complete() API"| R
| 54 | -
| 55 | -%% Results Processing
| 56 | -R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task"]
| 57 | S --> |Save| T[results.json]
| 58 |
| 59 | -%% Backend & Frontend
| 60 | T --> |Read| U[backend.py]
| 61 | I --> |Read| U
| 62 | -U --> |make_model_table| V["Model Rankings"]
| 63 | U --> |make_country_table| W["Country Aggregation"]
| 64 | -U --> |"API Endpoint"| X["FastAPI /api/data"]
| 65 | X --> |"JSON Response"| Y["Frontend React App"]
| 66 |
| 67 | %% UI Components
@@ -70,13 +82,13 @@
| 70 | Y --> Z3["LanguageTable.js<br/>Language Coverage"]
| 71 | Y --> Z4["DatasetTable.js<br/>Task Performance"]
| 72 |
| 73 | -%% Data Sources
| 74 | subgraph DS ["Data Sources"]
| 75 | -DS1["Flores-200<br/>Translation Sentences"]
| 76 | -DS2["MMLU/AfriMMLU<br/>Knowledge QA"]
| 77 | -DS3["ARC<br/>Science Reasoning"]
| 78 | -DS4["TruthfulQA<br/>Truthfulness"]
| 79 | -DS5["MGSM<br/>Math Problems"]
| 80 | end
| 81 |
| 82 | DS1 --> Q1
@@ -85,57 +97,79 @@
| 85 | DS4 --> Q5
| 86 | DS5 --> Q6
| 87 |
| 88 | -
| 89 | -
| 90 | -
| 91 | -
| 92 | -
| 93 | -
| 94 |
| 95 | class A1,A2,A3,A4 modelSource
| 96 | class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
| 97 | class R,F,G,X api
| 98 | class T,I storage
| 99 | class Y,Z1,Z2,Z3,Z4 frontend
| 100 | ```
| 101 |
| 102 | ## Architecture Components
| 103 |
| 104 | -### 🔵 Model Discovery (
| 105 | - **Static Curated Models**: Handpicked important models for comprehensive evaluation
| 106 | - **Dynamic Popular Models**: Real-time discovery of trending models via web scraping
| 107 | - **Quality Control**: Blocklist for problematic or incompatible models
| 108 | - **Metadata Enrichment**: Rich model information from OpenRouter and HuggingFace APIs
| 109 |
| 110 | -### 🟣 Evaluation Pipeline (
| 111 | - **7 Active Tasks**: Translation (bidirectional), Classification, MMLU, ARC, TruthfulQA, MGSM
| 112 | - **Combinatorial Approach**: Systematic evaluation across Model × Language × Task combinations
| 113 | - **Sample-based**: 10 evaluations per combination for statistical reliability
| 114 | -- **
| 115 |
| 116 | -### 🟠 API Integration (
| 117 | - **OpenRouter**: Primary model inference API for all language model tasks
| 118 | - **HuggingFace**: Model metadata and open-source model information
| 119 | -- **Google Translate**: Specialized translation API for
| 120 |
| 121 | -### 🟢 Data Storage (
| 122 | -- **results.json**: Aggregated evaluation scores
| 123 | -- **models.json**: Dynamic model list with metadata
| 124 | - **languages.json**: Language information with population data
| 125 |
| 126 | -### 🟡 Frontend Visualization (
| 127 | - **WorldMap**: Interactive country-level language proficiency visualization
| 128 | -- **ModelTable**: Ranked model performance leaderboard
| 129 | - **LanguageTable**: Language coverage and speaker statistics
| 130 | -- **DatasetTable**: Task-specific performance breakdowns
| 131 |
| 132 | ## Data Flow Summary
| 133 |
| 134 | -1. **Model Discovery**: Combine curated + trending models → enrich with metadata
| 135 | -2. **Evaluation Setup**: Generate all valid Model × Language × Task combinations
| 136 | -3. **Task Execution**: Run evaluations using
| 137 | -4. **Result Processing**: Aggregate scores and save to JSON files
| 138 | -5. **Backend Serving**: FastAPI serves processed data via REST API
| 139 | -6. **Frontend Display**: React app visualizes data through interactive components
| 140 |
| 141 | -This architecture enables scalable, automated evaluation of AI language models across diverse languages and tasks while providing real-time insights through an intuitive web interface.
| 17 |
G --> H["Enriched Model DataFrame"]
|
| 18 |
H --> |Save| I[models.json]
|
| 19 |
|
| 20 |
+
%% Model Validation & Cost Filtering
|
| 21 |
+
H --> |"Validate Models<br/>Check API Availability"| H1["Valid Models Only<br/>Cost β€ $20/1M tokens"]
|
| 22 |
+
H1 --> |"Timeout Protection<br/>120s for Large Models"| H2["Robust Model List"]
|
| 23 |
+
|
| 24 |
%% Language Data
|
| 25 |
J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]
|
| 26 |
|
| 27 |
+
%% Task Registry with Unified Prompting
|
| 28 |
+
L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions<br/>Unified English Zero-Shot"]
|
| 29 |
M --> M1["translation_from/to<br/>BLEU + ChrF"]
|
| 30 |
M --> M2["classification<br/>Accuracy"]
|
| 31 |
M --> M3["mmlu<br/>Accuracy"]
|
|
|
|
| 33 |
M --> M5["truthfulqa<br/>Accuracy"]
|
| 34 |
M --> M6["mgsm<br/>Accuracy"]
|
| 35 |
|
| 36 |
+
%% On-the-fly Translation with Origin Tagging
|
| 37 |
+
subgraph OTF [On-the-fly Dataset Translation]
|
| 38 |
+
direction LR
|
| 39 |
+
DS_raw["Raw English Dataset<br/>(e.g., MMLU)"] --> Google_Translate["Google Translate API"]
|
| 40 |
+
Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., German MMLU)<br/>Origin: 'machine'"]
|
| 41 |
+
DS_native["Native Dataset<br/>(e.g., German MMLU)<br/>Origin: 'human'"]
|
| 42 |
+
end
|
| 43 |
+
|
| 44 |
%% Evaluation Pipeline
|
| 45 |
+
H2 --> |"models ID"| N["main.py / main_gcs.py<br/>evaluate"]
|
| 46 |
K --> |"languages bcp_47"| N
|
| 47 |
L --> |"tasks.items"| N
|
| 48 |
N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model Γ Language Γ Task"]
|
| 49 |
+
O --> |"10 samples each"| P["Evaluation Execution<br/>Batch Processing"]
|
| 50 |
+
|
| 51 |
+
%% Task Execution with Origin Tracking
|
| 52 |
+
P --> Q1[translate_and_evaluate<br/>Origin: 'human']
|
| 53 |
+
P --> Q2[classify_and_evaluate<br/>Origin: 'human']
|
| 54 |
+
P --> Q3[mmlu_and_evaluate<br/>Origin: 'human'/'machine']
|
| 55 |
+
P --> Q4[arc_and_evaluate<br/>Origin: 'human'/'machine']
|
| 56 |
+
P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human'/'machine']
|
| 57 |
+
P --> Q6[mgsm_and_evaluate<br/>Origin: 'human'/'machine']
|
| 58 |
+
|
| 59 |
+
%% API Calls with Error Handling
|
| 60 |
+
Q1 --> |"complete() API<br/>Rate Limiting"| R["OpenRouter<br/>Model Inference"]
|
| 61 |
+
Q2 --> |"complete() API<br/>Rate Limiting"| R
|
| 62 |
+
Q3 --> |"complete() API<br/>Rate Limiting"| R
|
| 63 |
+
Q4 --> |"complete() API<br/>Rate Limiting"| R
|
| 64 |
+
Q5 --> |"complete() API<br/>Rate Limiting"| R
|
| 65 |
+
Q6 --> |"complete() API<br/>Rate Limiting"| R
|
| 66 |
+
|
| 67 |
+
%% Results Processing with Origin Aggregation
|
| 68 |
+
R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task+origin"]
|
| 69 |
S --> |Save| T[results.json]
|
| 70 |
|
| 71 |
+
%% Backend & Frontend with Origin-Specific Metrics
|
| 72 |
T --> |Read| U[backend.py]
|
| 73 |
I --> |Read| U
|
| 74 |
+
U --> |make_model_table| V["Model Rankings<br/>Origin-Specific Metrics"]
|
| 75 |
U --> |make_country_table| W["Country Aggregation"]
|
| 76 |
+
U --> |"API Endpoint"| X["FastAPI /api/data<br/>arc_accuracy_human<br/>arc_accuracy_machine"]
|
| 77 |
X --> |"JSON Response"| Y["Frontend React App"]
|
| 78 |
|
    %% UI Components
    Y --> Z1["WorldMap.js<br/>Language Proficiency Map"]
    Y --> Z2["ModelTable.js<br/>Model Leaderboard"]
    Y --> Z3["LanguageTable.js<br/>Language Coverage"]
    Y --> Z4["DatasetTable.js<br/>Task Performance"]

+    %% Data Sources with Origin Information
    subgraph DS ["Data Sources"]
+        DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
+        DS2["MMLU/AfriMMLU<br/>Knowledge QA<br/>Origin: 'human'"]
+        DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
+        DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
+        DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
    end

    DS1 --> Q1
    DS2 --> Q3
    DS3 --> Q4
    DS4 --> Q5
    DS5 --> Q6

+    DS_translated --> Q3
+    DS_translated --> Q4
+    DS_translated --> Q5
+
+    DS_native --> Q3
+    DS_native --> Q4
+    DS_native --> Q5
+
+    %% Styling - Neutral colors that work in both dark and light modes
+    classDef modelSource fill:#f8f9fa,stroke:#6c757d,color:#212529
+    classDef evaluation fill:#e9ecef,stroke:#495057,color:#212529
+    classDef api fill:#dee2e6,stroke:#6c757d,color:#212529
+    classDef storage fill:#d1ecf1,stroke:#0c5460,color:#0c5460
+    classDef frontend fill:#f8d7da,stroke:#721c24,color:#721c24
+    classDef translation fill:#d4edda,stroke:#155724,color:#155724

    class A1,A2,A3,A4 modelSource
    class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
    class R,F,G,X api
    class T,I storage
    class Y,Z1,Z2,Z3,Z4 frontend
+    class Google_Translate,DS_translated,DS_native translation
```
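
To make the backend step in the diagram concrete, here is a minimal sketch of how `backend.py` could expose origin-specific metric columns such as `arc_accuracy_human` and `arc_accuracy_machine` through the `/api/data` endpoint. Only the route and column names come from the diagram; the file layout, HTTP method, and pivoting logic are assumptions for illustration:

```python
# Sketch only: /api/data and the *_human/*_machine column names are from the
# diagram above; the record layout and pivoting logic are assumptions.
import json

import pandas as pd
from fastapi import FastAPI

app = FastAPI()

@app.get("/api/data")
async def get_data():
    scores = pd.DataFrame(json.load(open("results.json")))
    # One column per task/metric/origin, e.g. "arc_accuracy_human" next to
    # "arc_accuracy_machine", so human- and machine-translated benchmarks
    # stay visually separate in the frontend.
    scores["column"] = scores.task + "_" + scores.metric + "_" + scores.origin
    table = scores.pivot_table(index="model", columns="column", values="score")
    return table.reset_index().to_dict(orient="records")
```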

## Architecture Components

+### 🔵 Model Discovery (Light Gray)
- **Static Curated Models**: Handpicked important models for comprehensive evaluation
- **Dynamic Popular Models**: Real-time discovery of trending models via web scraping
- **Quality Control**: Blocklist for problematic or incompatible models
+- **Model Validation**: API availability checks and cost filtering (≤ $20/1M tokens; see the sketch after this list)
+- **Timeout Protection**: 120s timeout for large/reasoning models, 60s for others
- **Metadata Enrichment**: Rich model information from OpenRouter and HuggingFace APIs

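A minimal sketch of the validation and cost filter described above. The $20-per-million-token ceiling is from this document; the pricing fields are assumptions modeled loosely on OpenRouter's per-token price strings:

```python
# Hypothetical cost filter; the threshold is from the document, the
# pricing fields and record shape are assumed.
COST_LIMIT_USD_PER_M_TOKENS = 20.0

def is_affordable(model: dict) -> bool:
    # OpenRouter-style pricing is a USD-per-token string; scale to 1M tokens.
    pricing = model.get("pricing") or {}
    try:
        per_m = max(float(pricing.get("prompt", 0)),
                    float(pricing.get("completion", 0))) * 1_000_000
    except ValueError:
        return False
    return per_m <= COST_LIMIT_USD_PER_M_TOKENS

def validate_models(models: list[dict]) -> list[dict]:
    # Keep only models that report pricing and fit the budget.
    return [m for m in models if is_affordable(m)]
```
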
+### 🟣 Evaluation Pipeline (Medium Gray)
- **7 Active Tasks**: Translation (bidirectional), Classification, MMLU, ARC, TruthfulQA, MGSM
+- **Unified English Zero-Shot Prompting**: All tasks use English instructions with target-language content
+- **Origin Tagging**: Distinguishes human-translated ('human') from machine-translated ('machine') data
- **Combinatorial Approach**: Systematic evaluation across Model × Language × Task combinations
- **Sample-based**: 10 evaluations per combination for statistical reliability
+- **Batch Processing**: 50 tasks per batch with rate limiting and error resilience (see the sketch after this list)
+- **Dual Deployment**: `main.py` for local/GitHub, `main_gcs.py` for Google Cloud with GCS storage

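The combinatorial setup and batching could look roughly like this; the 10-sample and 50-per-batch figures are from the bullets above, while the function names and the `model.tasks` attribute are illustrative assumptions:

```python
# Illustrative sketch of Model × Language × Task expansion and batching;
# only the 10-sample and 50-per-batch figures come from the document.
from itertools import product

N_SAMPLES = 10
BATCH_SIZE = 50

def build_combinations(models, languages, tasks):
    # Expand the full grid, keeping only tasks a given model supports.
    return [
        (model, lang, task, sample)
        for model, lang, task in product(models, languages, tasks)
        if task in model.tasks  # assumed attribute
        for sample in range(N_SAMPLES)
    ]

def batched(items, size=BATCH_SIZE):
    # Fixed-size batches keep rate limits and failures contained.
    for i in range(0, len(items), size):
        yield items[i : i + size]
```
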
+### 🟠 API Integration (Light Gray)
- **OpenRouter**: Primary model inference API for all language model tasks
+- **Rate Limiting**: Intelligent batching and delays to prevent API overload (see the sketch after this list)
+- **Error Handling**: Graceful handling of timeouts, rate limits, and model unavailability
- **HuggingFace**: Model metadata and open-source model information
+- **Google Translate**: Specialized translation API for on-the-fly dataset translation

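A hedged sketch of a rate-limited, timeout-protected `complete()` call, using `aiolimiter` (which is in the project's dependency list) and an OpenAI-style client pointed at OpenRouter. The `complete()` name and the 120s/60s timeouts come from this document; the limiter settings and client wiring are assumptions:

```python
# Sketch only: limiter rate and client wiring are assumptions; the
# complete() name, OpenRouter target, and 120s/60s timeouts are from the doc.
import asyncio

from aiolimiter import AsyncLimiter
from openai import AsyncOpenAI

client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1")  # key via env
limiter = AsyncLimiter(max_rate=10, time_period=1)  # assumed: 10 requests/s

async def complete(model: str, prompt: str, is_large_model: bool = False):
    timeout = 120 if is_large_model else 60
    async with limiter:  # smooths bursts so the API is not overloaded
        try:
            response = await asyncio.wait_for(
                client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                ),
                timeout=timeout,
            )
            return response.choices[0].message.content
        except Exception:
            # Timeouts, rate-limit errors, and unavailable models degrade
            # gracefully: the sample is skipped instead of failing the batch.
            return None
```
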
+### 🟢 Data Storage (Cyan)
+- **results.json**: Aggregated evaluation scores with origin-specific metrics (example record below)
+- **models.json**: Dynamic model list with metadata and validation status
- **languages.json**: Language information with population data

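For orientation, one plausible aggregated record in `results.json`; the grouping keys mirror "Mean by model+lang+task+origin" in the diagram, and the concrete values are invented:

```python
# Assumed record shape; keys follow the aggregation named in the diagram,
# values are invented for illustration.
example_record = {
    "model": "meta-llama/llama-3.3-70b-instruct",  # hypothetical entry
    "bcp_47": "de",
    "task": "arc",
    "metric": "accuracy",
    "origin": "machine",  # 'human' = native benchmark, 'machine' = translated
    "score": 0.8,         # mean over the 10 samples for this combination
}
```
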
+### 🟡 Frontend Visualization (Light Red)
- **WorldMap**: Interactive country-level language proficiency visualization
+- **ModelTable**: Ranked model performance leaderboard with origin-specific columns
- **LanguageTable**: Language coverage and speaker statistics
+- **DatasetTable**: Task-specific performance breakdowns with human/machine distinction
+
+### 🔵 Translation & Origin Tracking (Light Green)
+- **On-the-fly Translation**: Google Translate API for languages without native benchmarks (see the sketch after this list)
+- **Origin Tagging**: Automatic classification of data sources (human- vs. machine-translated)
+- **Separate Metrics**: Frontend displays distinct scores for human- and machine-translated data

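A sketch of how on-the-fly translation with origin tagging could work. The 'human'/'machine' labels come from this document; the `google.cloud.translate_v2` usage and the item shape are assumptions:

```python
# Sketch: translate an English benchmark on the fly and tag its origin.
from google.cloud import translate_v2 as translate

def load_items(language: str, native_datasets: dict[str, list[dict]]):
    if language in native_datasets:
        # A human-authored or human-translated benchmark exists.
        return [{**item, "origin": "human"} for item in native_datasets[language]]
    # Fall back to machine translation of the English source.
    client = translate.Client()
    return [
        {
            **item,
            "question": client.translate(
                item["question"], target_language=language
            )["translatedText"],
            "origin": "machine",
        }
        for item in native_datasets["en"]
    ]
```
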

## Data Flow Summary

+1. **Model Discovery**: Combine curated + trending models → validate API availability → enrich with metadata
+2. **Evaluation Setup**: Generate all valid Model × Language × Task combinations with origin tracking
+3. **Task Execution**: Run evaluations using unified English prompting and appropriate datasets
+4. **Result Processing**: Aggregate scores by model+language+task+origin and save to JSON files (see the sketch after this list)
+5. **Backend Serving**: FastAPI serves processed data with origin-specific metrics via REST API
+6. **Frontend Display**: React app visualizes data through interactive components with transparency indicators

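Step 4 in pandas terms, as a rough sketch; the grouping keys are named in the list above, while the input file name and flat-record layout are assumptions:

```python
# Sketch of the aggregation in step 4; raw_scores.json is a hypothetical
# flat list of per-sample results with the keys used below.
import json

import pandas as pd

raw = pd.DataFrame(json.load(open("raw_scores.json")))
results = (
    raw.groupby(["model", "bcp_47", "task", "metric", "origin"])["score"]
    .mean()
    .reset_index()
)
results.to_json("results.json", orient="records", indent=2)
```
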
+This architecture enables scalable, automated evaluation of AI language models across diverse languages and tasks while providing real-time insights through an intuitive web interface with methodological transparency.
uv.lock
CHANGED
The diff for this file is too large to render. See raw diff