codelion committed
Commit 8b4f062 · verified · 1 Parent(s): 173eddc

Upload app.py

Files changed (1): app.py (+33 -8)
app.py CHANGED
@@ -15,11 +15,11 @@ import glob
 # Free models from OpenRouter - Curated selection (verified as of 2025)
 # IMPORTANT: The :free suffix is REQUIRED to use the free tier. Without it, requests are charged!
 FREE_MODELS = [
-    "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier reasoning
-    "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual
+    "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual (default - better rate limits)
     "meta-llama/llama-3.3-70b-instruct:free",  # 70B - Advanced reasoning
     "google/gemma-3-27b-it:free",  # 27B - Strong instruction-tuned
     "mistralai/mistral-small-3.1-24b-instruct:free",  # 24B - Efficient and capable
+    "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier but heavily rate-limited
 ]
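The reordering makes qwen-2.5-72b the default (first) pick because deepseek-r1's free tier is heavily rate-limited, per the new comments. For context, a minimal sketch of how one of these IDs reaches the free tier - the base URL is OpenRouter's public OpenAI-compatible endpoint, and OPENROUTER_API_KEY is an assumed environment variable, not something this diff defines:

    import os
    from openai import OpenAI

    # OpenRouter speaks the OpenAI wire protocol, so the stock client works.
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
    )

    # The ":free" suffix selects the free tier; dropping it routes the same
    # request to the paid pool (the warning in the comment above).
    response = client.chat.completions.create(
        model="qwen/qwen-2.5-72b-instruct:free",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(response.choices[0].message.content)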
 
@@ -345,7 +345,7 @@ import random
 from datasets import load_dataset
 from openai import OpenAI
 
-def evaluate(prompt: str) -> float:
+def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 2-stage cascading evaluation to save API calls.
 
@@ -356,7 +356,7 @@ def evaluate(prompt: str) -> float:
     Stage 2: Evaluate with 80 more samples (total 100)
     - Combine results for final score
 
-    Returns score between 0 and 1.
+    Returns dict with combined_score (0-1), accuracy, correct, and total.
     """
     try:
         # Load dataset
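The two hunks above change only the signature and the documented contract; the docstring already outlines the control flow the later hunks implement. As a compressed sketch of the whole cascade - the 20/80 split and the 0.5 threshold come from this file, while score_sample is a hypothetical per-sample scorer returning 0 or 1:

    def cascade(samples, score_sample, threshold=0.5):
        # Stage 1: score the first 20 samples.
        correct = sum(score_sample(s) for s in samples[:20])
        if correct / 20 < threshold:
            return correct / 20  # early exit; Stage 2's 80 calls are saved
        # Stage 2: 80 more samples, combined into one score over all 100.
        correct += sum(score_sample(s) for s in samples[20:100])
        return correct / 100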
@@ -445,7 +445,13 @@ def evaluate(prompt: str) -> float:
         # Early exit if Stage 1 score is below threshold
         if stage1_score < 0.5:
             print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 80 API calls)")
-            return stage1_score
+            return {{
+                "combined_score": stage1_score,
+                "accuracy": stage1_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage1_early_exit"
+            }}
 
         # STAGE 2: Continue with 80 more samples
         print(f"[Stage 2/2] Score >= 0.5 - proceeding with 80 more samples...")
@@ -466,14 +472,32 @@ def evaluate(prompt: str) -> float:
             final_score = (correct / total) if total > 0 else stage1_score
 
             print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})")
-            return final_score
+            return {{
+                "combined_score": final_score,
+                "accuracy": final_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage2_complete"
+            }}
         else:
             print(f"[Stage 2/2] Not enough samples in dataset for Stage 2")
-            return stage1_score
+            return {{
+                "combined_score": stage1_score,
+                "accuracy": stage1_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage1_complete"
+            }}
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
-        return 0.0
+        return {{
+            "combined_score": 0.0,
+            "accuracy": 0.0,
+            "correct": 0,
+            "total": 0,
+            "error": str(e)
+        }}
 '''
 
     evaluator_path = os.path.join(work_dir, "evaluator.py")
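With this hunk, every exit path (early exit, full run, short dataset, exception) returns the same dict shape, so callers no longer need to branch on float-versus-dict. A sketch of the consuming side - evaluate and its key names come from the diff, while run_evaluation is a hypothetical wrapper:

    def run_evaluation(prompt: str) -> float:
        result = evaluate(prompt)
        if "error" in result:
            print(f"Evaluation failed: {result['error']}")
        # "combined_score" is present on every path, including the error path,
        # so a failed evaluation simply ranks last instead of crashing here.
        return result["combined_score"]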
@@ -501,6 +525,7 @@ def create_config_file(model: str, work_dir: str):
         },
         "evaluation": {
             "timeout": None,  # Disable timeout to avoid signal handling issues
+            "cascade_evaluation": False,  # Disable cascade to prevent signal errors
         }
     }
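The new cascade_evaluation: False key switches off the framework-level cascade, which is consistent with evaluate() now cascading internally; running both would stack two staging mechanisms. For reference, a sketch of the surrounding function - only the "evaluation" block is visible in the diff, so the "llm" section, the JSON serialization, and the config.json filename are all assumptions:

    import json
    import os

    def create_config_file(model: str, work_dir: str) -> str:
        config = {
            "llm": {"model": model},  # assumed shape; not shown in the diff
            "evaluation": {
                "timeout": None,              # avoid signal-based timeouts
                "cascade_evaluation": False,  # evaluate() cascades on its own
            },
        }
        path = os.path.join(work_dir, "config.json")  # assumed filename
        with open(path, "w") as f:
            json.dump(config, f, indent=2)
        return path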
 
 