Upload app.py
app.py
CHANGED
@@ -15,11 +15,11 @@ import glob
 # Free models from OpenRouter - Curated selection (verified as of 2025)
 # IMPORTANT: The :free suffix is REQUIRED to use the free tier. Without it, requests are charged!
 FREE_MODELS = [
-    "
-    "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual
+    "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual (default - better rate limits)
     "meta-llama/llama-3.3-70b-instruct:free",  # 70B - Advanced reasoning
     "google/gemma-3-27b-it:free",  # 27B - Strong instruction-tuned
     "mistralai/mistral-small-3.1-24b-instruct:free",  # 24B - Efficient and capable
+    "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier but heavily rate-limited
 ]

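Note: these model IDs target OpenRouter's OpenAI-compatible endpoint (the next hunk imports OpenAI for exactly this). A minimal sketch of querying one of the free models; the OPENROUTER_API_KEY variable name is an assumption for illustration, not taken from app.py:

import os
from openai import OpenAI

# OpenRouter exposes an OpenAI-compatible API; the :free suffix on the
# model ID selects the free tier, as the comment above warns.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],  # assumed env var name
)
response = client.chat.completions.create(
    model="qwen/qwen-2.5-72b-instruct:free",
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(response.choices[0].message.content)
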
@@ -345,7 +345,7 @@ import random
 from datasets import load_dataset
 from openai import OpenAI
 
-def evaluate(prompt: str) -> float:
+def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 2-stage cascading evaluation to save API calls.

@@ -356,7 +356,7 @@ def evaluate(prompt: str) -> float:
     Stage 2: Evaluate with 80 more samples (total 100)
     - Combine results for final score
 
-    Returns
+    Returns dict with combined_score (0-1), accuracy, correct, and total.
     """
     try:
         # Load dataset

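The cascade the docstring describes is: score a cheap first batch, and spend the remaining API budget only if the prompt clears the threshold. A minimal sketch of that control flow, assuming a hypothetical score_sample helper (the 20/80 split and 0.5 threshold come from the docstring and the prints in the hunks below):

def cascade_evaluate(prompt, samples, score_sample, threshold=0.5):
    # Stage 1: 20 samples; exit early if the prompt scores below threshold.
    stage1 = samples[:20]
    correct = sum(score_sample(prompt, s) for s in stage1)
    total = len(stage1)
    if total == 0 or correct / total < threshold:
        return {"combined_score": correct / total if total else 0.0,
                "stage": "stage1_early_exit"}
    # Stage 2: 80 more samples, combined for the final score over 100.
    stage2 = samples[20:100]
    correct += sum(score_sample(prompt, s) for s in stage2)
    total += len(stage2)
    return {"combined_score": correct / total, "stage": "stage2_complete"}
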
@@ -445,7 +445,13 @@ def evaluate(prompt: str) -> float:
         # Early exit if Stage 1 score is below threshold
         if stage1_score < 0.5:
             print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 80 API calls)")
-            return
+            return {{
+                "combined_score": stage1_score,
+                "accuracy": stage1_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage1_early_exit"
+            }}
 
         # STAGE 2: Continue with 80 more samples
         print(f"[Stage 2/2] Score >= 0.5 - proceeding with 80 more samples...")

@@ -466,14 +472,32 @@ def evaluate(prompt: str) -> float:
             final_score = (correct / total) if total > 0 else stage1_score
 
             print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})")
-            return
+            return {{
+                "combined_score": final_score,
+                "accuracy": final_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage2_complete"
+            }}
         else:
             print(f"[Stage 2/2] Not enough samples in dataset for Stage 2")
-            return
+            return {{
+                "combined_score": stage1_score,
+                "accuracy": stage1_score,
+                "correct": correct,
+                "total": total,
+                "stage": "stage1_complete"
+            }}
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
-        return
+        return {{
+            "combined_score": 0.0,
+            "accuracy": 0.0,
+            "correct": 0,
+            "total": 0,
+            "error": str(e)
+        }}
 '''
 
     evaluator_path = os.path.join(work_dir, "evaluator.py")

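The doubled braces ({{ and }}) in the hunks above are intentional: the whole evaluate() body sits inside a triple-quoted template string (closed by the ''' above) that is rendered and written out to evaluator.py, so {{ collapses to a literal {. A tiny illustration of that escaping, assuming str.format()-style rendering:

# Doubled braces survive rendering as single literal braces.
template = 'print(f"score: {{score:.3f}}")'
print(template.format())  # -> print(f"score: {score:.3f}")
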
@@ -501,6 +525,7 @@ def create_config_file(model: str, work_dir: str):
         },
         "evaluation": {
             "timeout": None,  # Disable timeout to avoid signal handling issues
+            "cascade_evaluation": False,  # Disable cascade to prevent signal errors
         }
     }

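With the framework-level cascade disabled (per the comment, to avoid signal errors), the two-stage cascade lives entirely inside evaluate(), which returns the result dict on every path. A hypothetical consumer, using only the keys every return path in the diff provides:

result = evaluate("Solve: 2 + 2 = ?")  # hypothetical call site
print(f"score={result['combined_score']:.3f} "
      f"({result['correct']}/{result['total']})")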