Upload app.py
app.py
CHANGED
@@ -15,11 +15,11 @@ import glob
 # Free models from OpenRouter - Curated selection (verified as of 2025)
 # IMPORTANT: The :free suffix is REQUIRED to use the free tier. Without it, requests are charged!
 FREE_MODELS = [
-    "
-    "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual
+    "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual (default - better rate limits)
     "meta-llama/llama-3.3-70b-instruct:free",  # 70B - Advanced reasoning
     "google/gemma-3-27b-it:free",  # 27B - Strong instruction-tuned
     "mistralai/mistral-small-3.1-24b-instruct:free",  # 24B - Efficient and capable
+    "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier but heavily rate-limited
 ]

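Note: these model IDs target OpenRouter's OpenAI-compatible endpoint (the next hunk imports OpenAI for exactly this). A minimal sketch of querying one of the free models; the OPENROUTER_API_KEY variable name is an assumption for illustration, not taken from app.py:

import os
from openai import OpenAI

# OpenRouter exposes an OpenAI-compatible API; the :free suffix on the
# model ID selects the free tier, as the comment above warns.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],  # assumed env var name
)
response = client.chat.completions.create(
    model="qwen/qwen-2.5-72b-instruct:free",
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(response.choices[0].message.content)
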
@@ -345,7 +345,7 @@ import random
 from datasets import load_dataset
 from openai import OpenAI
 
-def evaluate(prompt: str) -> float:
+def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 2-stage cascading evaluation to save API calls.

@@ -356,7 +356,7 @@ def evaluate(prompt: str) -> float:
     Stage 2: Evaluate with 80 more samples (total 100)
     - Combine results for final score
 
-    Returns
+    Returns dict with combined_score (0-1), accuracy, correct, and total.
     """
     try:
         # Load dataset

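The cascade the docstring describes is: score a cheap first batch, and spend the remaining API budget only if the prompt clears the threshold. A minimal sketch of that control flow, assuming a hypothetical score_sample helper (the 20/80 split and 0.5 threshold come from the docstring and the prints in the hunks below):

def cascade_evaluate(prompt, samples, score_sample, threshold=0.5):
    # Stage 1: 20 samples; exit early if the prompt scores below threshold.
    stage1 = samples[:20]
    correct = sum(score_sample(prompt, s) for s in stage1)
    total = len(stage1)
    if total == 0 or correct / total < threshold:
        return {"combined_score": correct / total if total else 0.0,
                "stage": "stage1_early_exit"}
    # Stage 2: 80 more samples, combined for the final score over 100.
    stage2 = samples[20:100]
    correct += sum(score_sample(prompt, s) for s in stage2)
    total += len(stage2)
    return {"combined_score": correct / total, "stage": "stage2_complete"}
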
@@ -445,7 +445,13 @@ def evaluate(prompt: str) -> float:
         # Early exit if Stage 1 score is below threshold
         if stage1_score < 0.5:
             print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 80 API calls)")
-            return
+            return {{
+                "combined_score": stage1_score,
+                "accuracy": stage1_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage1_early_exit"
+            }}
 
         # STAGE 2: Continue with 80 more samples
         print(f"[Stage 2/2] Score >= 0.5 - proceeding with 80 more samples...")

@@ -466,14 +472,32 @@ def evaluate(prompt: str) -> float:
             final_score = (correct / total) if total > 0 else stage1_score
 
             print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})")
-            return
+            return {{
+                "combined_score": final_score,
+                "accuracy": final_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage2_complete"
+            }}
         else:
             print(f"[Stage 2/2] Not enough samples in dataset for Stage 2")
-            return
+            return {{
+                "combined_score": stage1_score,
+                "accuracy": stage1_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage1_complete"
+            }}
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
-        return
+        return {{
+            "combined_score": 0.0,
+            "accuracy": 0.0,
+            "correct": 0,
+            "total": 0,
+            "error": str(e)
+        }}
 '''
 
     evaluator_path = os.path.join(work_dir, "evaluator.py")

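The doubled braces ({{ and }}) in the hunks above are intentional: the whole evaluate() body sits inside a triple-quoted template string (closed by the ''' above) that is rendered and written out to evaluator.py, so {{ collapses to a literal {. A tiny illustration of that escaping, assuming str.format()-style rendering:

# Doubled braces survive rendering as single literal braces.
template = 'print(f"score: {{score:.3f}}")'
print(template.format())  # -> print(f"score: {score:.3f}")
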
@@ -501,6 +525,7 @@ def create_config_file(model: str, work_dir: str):
         },
         "evaluation": {
             "timeout": None,  # Disable timeout to avoid signal handling issues
+            "cascade_evaluation": False,  # Disable cascade to prevent signal errors
         }
     }

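With the framework-level cascade disabled (per the comment, to avoid signal errors), the two-stage cascade lives entirely inside evaluate(), which returns the result dict on every path. A hypothetical consumer, using only the keys every return path in the diff provides:

result = evaluate("Solve: 2 + 2 = ?")  # hypothetical call site
print(f"score={result['combined_score']:.3f} "
      f"({result['correct']}/{result['total']})")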