Upload app.py
app.py
CHANGED
@@ -135,8 +135,15 @@ def validate_inputs(dataset_name: str, split: str, input_field: str, target_field
 
 
 def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
-                    model: str, input_field: str, target_field: str) -> Dict:
-
+                    model: str, input_field: str, target_field: str,
+                    fixed_indices: List[int] = None) -> Dict:
+    """
+    Evaluate a prompt on a dataset using the selected model.
+
+    Args:
+        fixed_indices: Optional list of dataset indices to use. If provided,
+                       ensures we evaluate on the SAME samples every time.
+    """
     try:
         # Get API key from environment
         api_key = os.environ.get("OPENAI_API_KEY")
@@ -160,11 +167,18 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             else:
                 raise
 
-        # Sample
-        if len(dataset) > num_samples:
+        # Sample examples - use fixed indices if provided to ensure consistency
+        if fixed_indices is not None:
+            # Use the provided indices (ensures same samples for initial/final eval)
+            indices = fixed_indices
+            samples = [dataset[i] for i in indices]
+        elif len(dataset) > num_samples:
+            # First time: use fixed seed for reproducible sampling
+            random.seed(42)  # Fixed seed ensures same samples across runs
             indices = random.sample(range(len(dataset)), num_samples)
             samples = [dataset[i] for i in indices]
         else:
+            indices = list(range(min(num_samples, len(dataset))))
             samples = list(dataset)[:num_samples]
 
         # Initialize OpenAI client with OpenRouter
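The new sampling branch above is the heart of the change: an evaluation either replays caller-supplied indices or draws a seeded random subset, so repeated runs score the same examples. A minimal standalone sketch of that logic, assuming a plain list-like dataset (the helper name `pick_samples` is illustrative and not part of app.py):

```python
import random
from typing import Dict, List, Optional, Tuple

def pick_samples(dataset: List[Dict], num_samples: int,
                 fixed_indices: Optional[List[int]] = None) -> Tuple[List[int], List[Dict]]:
    """Return (indices, samples); reusing the returned indices replays the exact subset."""
    if fixed_indices is not None:
        # Caller supplied indices (e.g. from a previous evaluation) - reuse them verbatim
        indices = list(fixed_indices)
    elif len(dataset) > num_samples:
        random.seed(42)  # fixed seed: the first draw is reproducible across runs
        indices = random.sample(range(len(dataset)), num_samples)
    else:
        # Dataset smaller than the request: take everything in order
        indices = list(range(min(num_samples, len(dataset))))
    return indices, [dataset[i] for i in indices]

data = [{"question": str(i)} for i in range(100)]
idx, initial_batch = pick_samples(data, 20)                 # first evaluation draws the subset
_, final_batch = pick_samples(data, 20, fixed_indices=idx)  # second evaluation replays it
assert final_batch == initial_batch
```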
@@ -285,7 +299,8 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             "accuracy": accuracy,
             "correct": correct,
             "total": total,
-            "results": results
+            "results": results,
+            "indices": indices  # Return indices so we can reuse them for final eval
         }
 
         # Add errors if any occurred
@@ -519,6 +534,9 @@ def evaluate(prompt: str) -> dict:
     Returns dict with combined_score (0-1), accuracy, correct, and total.
     """
     try:
+        # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
+        random.seed(42)
+
         # Load dataset
         # Try loading with just dataset name first
         try:
@@ -780,18 +798,20 @@ Your improved prompt here
         "diff_based_evolution": False,  # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text",  # CRITICAL: Optimize text/prompts, not Python code!
         "max_code_length": 40000,  # Allow long prompts (default 10000 is too short)
+        "num_islands": 1,  # IMPORTANT: Use only 1 island (not 5) for simpler evolution
         "prompt": {
             "template_dir": templates_dir,  # Use our custom prompt engineering templates
         },
         "evolution": {
             "population_size": 10,
-            "num_islands": 1,
+            "num_islands": 1,  # Single island for simpler evolution
             "elite_ratio": 0.1,
             "explore_ratio": 0.3,
             "exploit_ratio": 0.6,
         },
         "database": {
             "log_prompts": True,  # Save prompts used to generate each program
+            "num_islands": 1,  # CRITICAL: This is where island count is actually read from!
         },
         "evaluator": {
             "timeout": 3600,  # 1 hour timeout (effectively disabled, but prevents NoneType arithmetic errors)
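Because the island count now appears in three sections of this config (top level, `evolution`, and `database`), the values can silently drift apart in a later edit. A hedged sketch of one way to keep them in sync from a single constant; `build_config` below is illustrative and not a function in app.py:

```python
# Illustrative sketch: define the island count once and inject it everywhere the
# config expects it, so the three settings cannot disagree.
NUM_ISLANDS = 1  # single island for simpler evolution

def build_config(templates_dir: str) -> dict:
    return {
        "diff_based_evolution": False,
        "language": "text",
        "max_code_length": 40000,
        "num_islands": NUM_ISLANDS,
        "prompt": {"template_dir": templates_dir},
        "evolution": {
            "population_size": 10,
            "num_islands": NUM_ISLANDS,
            "elite_ratio": 0.1,
            "explore_ratio": 0.3,
            "exploit_ratio": 0.6,
        },
        "database": {
            "log_prompts": True,
            "num_islands": NUM_ISLANDS,  # per the diff comment, the value actually read
        },
    }

config = build_config("./templates")
assert {config["num_islands"], config["evolution"]["num_islands"],
        config["database"]["num_islands"]} == {NUM_ISLANDS}
```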
@@ -844,6 +864,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     config_path = create_config_file(model, work_dir)
 
     # Run initial evaluation (using 20 samples to save API calls)
+    # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
     progress(0.2, desc="Running initial evaluation on 20 samples...")
     initial_eval = evaluate_prompt(
         initial_prompt, dataset_name, dataset_split, 20,
@@ -856,6 +877,9 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     if initial_eval["total"] == 0:
         return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", "", "", [initial_prompt], 0, 1
 
+    # Save the indices for final evaluation (ensures fair comparison)
+    eval_indices = initial_eval.get("indices", [])
+
     initial_results = f"""
 ### Initial Prompt Evaluation
 
@@ -938,10 +962,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    # Evaluate best prompt
+    # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
     final_eval = evaluate_prompt(
         best_prompt, dataset_name, dataset_split, 20,
-        model, input_field, target_field
+        model, input_field, target_field,
+        fixed_indices=eval_indices  # Use same samples as initial eval!
    )
 
    final_results = f"""
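The initial and final evaluations above follow a simple contract: the first call reports the indices it sampled, and the second call replays them via `fixed_indices`, so the before/after accuracies are measured on identical examples. A self-contained sketch of that round trip using a stand-in scorer (`fake_evaluate` is a stub for illustration; the real `evaluate_prompt` queries the model through OpenRouter):

```python
import random
from typing import Dict, List, Optional

def fake_evaluate(prompt: str, dataset: List[int], num_samples: int,
                  fixed_indices: Optional[List[int]] = None) -> Dict:
    """Stand-in for evaluate_prompt: scores a subset and reports which indices it used."""
    if fixed_indices is not None:
        indices = list(fixed_indices)   # replay the earlier subset
    else:
        random.seed(42)                 # seeded first draw
        indices = random.sample(range(len(dataset)), num_samples)
    correct = sum(1 for i in indices if (dataset[i] + len(prompt)) % 2 == 0)  # dummy scoring
    return {"accuracy": correct / len(indices), "correct": correct,
            "total": len(indices), "indices": indices}

dataset = list(range(200))
initial_eval = fake_evaluate("initial prompt", dataset, 20)
eval_indices = initial_eval.get("indices", [])
final_eval = fake_evaluate("evolved prompt, hopefully better", dataset, 20,
                           fixed_indices=eval_indices)
assert final_eval["indices"] == initial_eval["indices"]  # same samples -> fair comparison
```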