Upload from GitHub Actions: Merge pull request #10 from datenlabor-bmz/jn-dev
Files changed:
- .github/workflows/nightly-evals.yml  +24 -2
- evals/main.py  +119 -12
.github/workflows/nightly-evals.yml  (CHANGED)

@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    timeout-minutes: 1440  # 24 hours timeout
     steps:
       - uses: actions/checkout@v3

@@ -21,7 +22,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev

-      - name: Run evaluations
+      - name: Run evaluations with checkpointing
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}

@@ -30,7 +31,28 @@
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py
-
+
+          # Run evaluations with periodic checkpointing
+          uv run python -c "
+          import time
+          import subprocess
+          import json
+          import os
+
+          # Check if we have existing results to resume from
+          if os.path.exists('results.json'):
+              print('Found existing results.json, will resume from checkpoint')
+
+          # Run the main evaluation
+          try:
+              subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
+          except subprocess.CalledProcessError as e:
+              print(f'Evaluation failed: {e}')
+              # Save current state even if failed
+              if os.path.exists('results.json'):
+                  print('Saving checkpoint before exit...')
+              exit(1)
+          "

       - name: Commit changes
         env:
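The run step above drives the evaluation through an inline `python -c` wrapper. As a rough sketch only (the file name `evals/run_with_checkpoint.py` is hypothetical and not part of this PR), the same logic could live in a standalone script; note that the wrapper itself writes no checkpoint, it only reports whether a `results.json` from an earlier run is present and relies on `evals/main.py` to resume from it:

# run_with_checkpoint.py (hypothetical name, not part of the PR):
# standalone equivalent of the inline wrapper embedded in the workflow above.
import os
import subprocess
import sys


def main() -> int:
    # If a previous (possibly interrupted) run left results.json behind,
    # evals/main.py will treat it as a checkpoint and resume from it.
    if os.path.exists("results.json"):
        print("Found existing results.json, will resume from checkpoint")

    try:
        # Run the main evaluation as a subprocess, mirroring the workflow step.
        subprocess.run(["uv", "run", "evals/main.py"], check=True)
    except subprocess.CalledProcessError as exc:
        print(f"Evaluation failed: {exc}")
        # Whatever results.json already holds on disk is the latest checkpoint.
        if os.path.exists("results.json"):
            print("Saving checkpoint before exit...")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())

The workflow would then call this script instead of the embedded snippet; functionally the two are equivalent.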
evals/main.py  (CHANGED)

@@ -11,6 +11,45 @@ import json
 
 results = pd.DataFrame()
 
+def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
+    """Save current progress as checkpoint"""
+    try:
+        args = dict(orient="records", indent=2, force_ascii=False)
+
+        # Save current results
+        if len(results_df) > 0:
+            results_df.to_json("results.json", **args)
+            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
+
+        # Save model and language info
+        models_df.to_json("models.json", **args)
+        languages_df.to_json("languages.json", **args)
+
+        # Save checkpoint metadata
+        checkpoint_info = {
+            "last_batch": batch_num,
+            "total_batches": total_batches,
+            "timestamp": datetime.now().isoformat(),
+            "results_count": len(results_df)
+        }
+        with open("checkpoint.json", "w") as f:
+            json.dump(checkpoint_info, f, indent=2)
+
+    except Exception as e:
+        print(f"⚠️ Failed to save checkpoint: {e}")
+
+def load_checkpoint():
+    """Load previous checkpoint if available"""
+    try:
+        if os.path.exists("checkpoint.json"):
+            with open("checkpoint.json", "r") as f:
+                checkpoint = json.load(f)
+            print(f"📋 Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
+            return checkpoint
+    except Exception as e:
+        print(f"⚠️ Failed to load checkpoint: {e}")
+    return None
+
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
     n_sentences = int(os.environ.get("N_SENTENCES", 15))  # Default 1 for quick testing
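Based on `save_checkpoint` above, the checkpoint metadata is a small JSON file written next to `results.json`, `models.json` and `languages.json`. A minimal sketch of that round trip with illustrative values (only the `json` standard library is needed):

import json
from datetime import datetime

# Shape of checkpoint.json as written by save_checkpoint (values are made up).
checkpoint_info = {
    "last_batch": 3,
    "total_batches": 12,
    "timestamp": datetime.now().isoformat(),
    "results_count": 450,
}
with open("checkpoint.json", "w") as f:
    json.dump(checkpoint_info, f, indent=2)

# load_checkpoint reads the same file back and returns the dict (or None if missing).
with open("checkpoint.json") as f:
    checkpoint = json.load(f)
print(f"Resume after batch {checkpoint['last_batch']} of {checkpoint['total_batches']}")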
@@ -29,13 +68,30 @@
     top_languages = languages.head(max_languages)  # Top N by population
     print(f"🌍 Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
 
+    # Load checkpoint if available
+    checkpoint = load_checkpoint()
+    start_batch = 0
+    if checkpoint:
+        start_batch = checkpoint['last_batch']
+        print(f"🔄 Resuming from batch {start_batch}")
+
     # For testing, just use all available languages up to max_languages
     for n_languages in [min(max_languages, len(top_languages))]:
         print(f"running evaluations for {n_languages} languages")
-
-
+
+        # Load existing results
+        try:
+            old_results = pd.read_json("results.json")
+            if old_results.empty:
+                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+        except FileNotFoundError:
             old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
+
+        try:
+            old_models = pd.read_json("models.json")
+        except FileNotFoundError:
+            old_models = pd.DataFrame()
+
         # get all combinations of model, language and task
         combis = [
             (model, lang.bcp_47, task_name)
@@ -60,9 +116,14 @@
         batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
         all_results = []
 
-
+        # Calculate total batches for progress tracking
+        total_batches = (len(all_tasks) + batch_size - 1) // batch_size
+
+        for i in range(start_batch * batch_size, len(all_tasks), batch_size):
             batch = all_tasks[i:i+batch_size]
-
+            current_batch = i // batch_size + 1
+
+            print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
 
             # Show what's being evaluated in this batch
             batch_summary = {}
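The resume logic added here is plain index arithmetic: `total_batches` is a ceiling division, the loop starts at `start_batch * batch_size` so already-checkpointed batches are skipped, and `current_batch` is the 1-based batch number. A small sketch with made-up numbers:

# Illustration of the batch arithmetic above, with made-up numbers.
batch_size = 200
n_tasks = 1050          # pretend all_tasks has 1050 entries
start_batch = 3         # e.g. checkpoint.json reported last_batch == 3

# Ceiling division: 1050 tasks at 200 per batch -> 6 batches.
total_batches = (n_tasks + batch_size - 1) // batch_size
assert total_batches == 6

# Resuming skips the tasks covered by the first `start_batch` batches.
for i in range(start_batch * batch_size, n_tasks, batch_size):
    current_batch = i // batch_size + 1   # 1-based batch number
    print(f"batch {current_batch}/{total_batches}: tasks {i}..{min(i + batch_size, n_tasks) - 1}")
# -> batches 4, 5 and 6 (tasks 600..1049)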
@@ -86,12 +147,57 @@
             for task_data in batch:
                 task_func, model, bcp_47, sentence_nr = task_data
                 batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-
-
+
+            try:
+                batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
+                all_results.extend(batch_results)
+
+                # Save checkpoint after each batch
+                valid_results = []
+                exception_count = 0
+                for r in batch_results:
+                    if isinstance(r, Exception):
+                        exception_count += 1
+                        continue
+                    if isinstance(r, list):
+                        valid_results.extend(r)
+                    else:
+                        valid_results.append(r)
+
+                if valid_results:
+                    # Aggregate results
+                    batch_df = pd.DataFrame(valid_results)
+                    if len(batch_df) > 0:
+                        batch_df = (
+                            batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+                            .agg({"score": "mean"})
+                            .reset_index()
+                        )
+                        # Merge with existing results
+                        all_results_df = pd.concat([old_results, batch_df])
+                        all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+                        all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+
+                        # Save checkpoint
+                        save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
+
+                        # Update old_results for next batch
+                        old_results = all_results_df
+
+                print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
+
+            except Exception as e:
+                print(f"❌ Batch {current_batch} failed: {e}")
+                # Save checkpoint even on failure
+                if len(all_results) > 0:
+                    results_df = pd.DataFrame(all_results)
+                    save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
+                continue
 
             # Reduced delay between batches (optimized for GitHub Actions)
             await asyncio.sleep(0.5)
 
+        # Final aggregation and save
         results = all_results
         # Filter out exceptions and flatten results
         valid_results = []
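The new `try` block gathers each batch with `return_exceptions=True`, so failed coroutines come back as exception objects inside the result list instead of aborting the whole batch; that is why the results are then filtered with `isinstance` checks. A minimal, self-contained illustration (the coroutine names are made up):

# Why the isinstance() filtering above is needed: with return_exceptions=True,
# failures are returned in-place rather than raised.
import asyncio


async def ok(n: int) -> list[int]:
    return [n, n + 1]


async def boom() -> list[int]:
    raise RuntimeError("rate limited")


async def main() -> None:
    results = await asyncio.gather(ok(1), boom(), ok(10), return_exceptions=True)
    valid, errors = [], 0
    for r in results:
        if isinstance(r, Exception):
            errors += 1
            continue
        if isinstance(r, list):
            valid.extend(r)
        else:
            valid.append(r)
    print(valid, errors)   # [1, 2, 10, 11] 1


asyncio.run(main())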
@@ -108,7 +214,7 @@
         print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
         print(f"✅ Successfully processed {len(valid_results)} evaluations")
 
-        # Save
+        # Save final results
         if valid_results:
             results = valid_results
             args = dict(orient="records", indent=2, force_ascii=False)

@@ -124,6 +230,7 @@
             # Merge with old results
             old_results = pd.read_json("results.json")
             results_df = pd.concat([old_results, results_df])
+            results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
             results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
             results_df.to_json("results.json", **args)
             print(f"💾 Saved {len(results_df)} aggregated results to results.json")

@@ -153,10 +260,10 @@
     print(f"✅ Full evaluation completed in {elapsed_str}")
     print(f"🕐 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
-    #
-
-
-
+    # Clean up checkpoint file on successful completion
+    if os.path.exists("checkpoint.json"):
+        os.remove("checkpoint.json")
+        print("🧹 Cleaned up checkpoint file")
 
     return results
 
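Both `drop_duplicates(subset=[...])` calls added in this PR rely on pandas' default `keep="first"`. Because `old_results` is concatenated ahead of the new rows, a score that was already checkpointed for a given (model, bcp_47, task, metric, origin) key is kept and the freshly recomputed one is discarded, which prevents duplicate rows when a run is resumed. A small sketch with made-up rows:

# Deduplication behaviour used above: drop_duplicates keeps the first occurrence,
# so rows already present in old_results win over recomputed rows for the same key.
import pandas as pd

key = ["model", "bcp_47", "task", "metric", "origin"]

old_results = pd.DataFrame([
    {"model": "m1", "bcp_47": "sw", "task": "mt", "metric": "bleu", "origin": "x", "score": 0.41},
])
batch_df = pd.DataFrame([
    {"model": "m1", "bcp_47": "sw", "task": "mt", "metric": "bleu", "origin": "x", "score": 0.43},  # duplicate key
    {"model": "m1", "bcp_47": "yo", "task": "mt", "metric": "bleu", "origin": "x", "score": 0.37},  # new key
])

merged = (
    pd.concat([old_results, batch_df])
    .drop_duplicates(subset=key)                       # keep="first": the old 0.41 survives
    .sort_values(by=["model", "bcp_47", "task", "metric"])
)
print(merged)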