codelion committed on
Commit 7308434 · verified · 1 Parent(s): f5714df

Upload app.py

Files changed (1):
  1. app.py +24 -20
app.py CHANGED
@@ -511,11 +511,11 @@ def parse_evolution_history(output_dir: str) -> str:
     # If no specific files found, show directory contents
     if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
         evolution_viz += "### Evolution Complete\n\n"
-        evolution_viz += "OpenEvolve ran 10 iterations of evolutionary optimization using:\n"
+        evolution_viz += "OpenEvolve ran 5 iterations of evolutionary optimization using:\n"
         evolution_viz += "- **Population Size**: 10 prompts per generation\n"
         evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n"
         evolution_viz += "- **Islands**: 1 population with mutation and crossover\n"
-        evolution_viz += "- **Evaluation**: 100 samples per prompt variant\n\n"
+        evolution_viz += "- **Evaluation**: 50 samples per prompt variant\n\n"
 
     # Count files in output directory
     all_files = os.listdir(output_dir)
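As a side note, with the population size of 10 reported above, the 10%/30%/60% selection strategy works out to 1 elite, 3 explore, and 6 exploit slots per generation. A minimal sketch of that arithmetic (the helper name is ours, not the app's):

```python
def selection_slots(population_size=10):
    """Sketch: translate the 10/30/60 selection strategy into whole slots."""
    elite = round(population_size * 0.10)        # 1 prompt carried over unchanged
    explore = round(population_size * 0.30)      # 3 prompts for random exploration
    exploit = population_size - elite - explore  # remaining 6 refine the current best
    return {"elite": elite, "explore": explore, "exploit": exploit}

print(selection_slots())  # {'elite': 1, 'explore': 3, 'exploit': 6}
```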
@@ -538,10 +538,10 @@ from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using 200 fixed samples (same as initial/final eval).
+    Evaluate a prompt using 50 fixed samples (same as initial/final eval).
 
     This ensures evolution optimizes for the SAME test set we measure on.
-    No staging - always evaluates all 200 samples for consistency.
+    No staging - always evaluates all 50 samples for consistency.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
@@ -559,8 +559,8 @@ def evaluate(prompt: str) -> dict:
     else:
         raise
 
-    # Sample 200 samples with seed 42 (SAME as initial/final eval)
-    num_samples = 200
+    # Sample 50 samples with seed 42 (SAME as initial/final eval)
+    num_samples = 50
     if len(dataset) > num_samples:
         # Use SAME sampling logic as initial/final eval
         indices = random.sample(range(len(dataset)), num_samples)
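To make the guarantee in the "fixed seed" comment concrete, here is a minimal standalone sketch (the helper name is ours): seeding the RNG before `random.sample` makes the drawn indices deterministic, so every prompt variant is scored on the identical 50 examples. The app appears to seed the global `random` module; this sketch uses a local `random.Random` for self-containment.

```python
import random

def fixed_sample_indices(dataset_len, num_samples, seed=42):
    """Same seed -> same subset, so every evaluation sees identical samples."""
    rng = random.Random(seed)  # local RNG; the app seeds the global module instead
    if dataset_len <= num_samples:
        return list(range(dataset_len))
    return rng.sample(range(dataset_len), num_samples)

# Two independent calls agree, which is the property the diff's comment relies on.
assert fixed_sample_indices(1000, 50) == fixed_sample_indices(1000, 50)
```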
@@ -765,8 +765,8 @@ Your improved prompt here
         "api_base": "https://openrouter.ai/api/v1",  # Use OpenRouter endpoint
         "temperature": 0.7,
     },
-    "max_iterations": 10,
-    "checkpoint_interval": 2,  # Save checkpoints every 2 iterations to preserve prompt history
+    "max_iterations": 5,
+    "checkpoint_interval": 1,  # Save checkpoints every iteration to preserve prompt history
     "diff_based_evolution": False,  # Use full rewrite mode for prompts (not diff/patch mode)
     "language": "text",  # CRITICAL: Optimize text/prompts, not Python code!
     "max_code_length": 40000,  # Allow long prompts (default 10000 is too short)
@@ -835,11 +835,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     progress(0.15, desc="Creating configuration...")
     config_path = create_config_file(model, work_dir)
 
-    # Run initial evaluation with 200 samples
+    # Run initial evaluation with 50 samples
     # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-    progress(0.2, desc="Running initial evaluation on 200 samples...")
+    progress(0.2, desc="Running initial evaluation on 50 samples...")
     initial_eval = evaluate_prompt(
-        initial_prompt, dataset_name, dataset_split, 200,
+        initial_prompt, dataset_name, dataset_split, 50,
         model, input_field, target_field
     )
@@ -873,7 +873,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         initial_results += f"  ✓ Correct\n" if result['correct'] else f"  ✗ Incorrect\n"
 
     # Run OpenEvolve
-    progress(0.3, desc="Starting OpenEvolve optimization (10 iterations with staged evaluation)...")
+    progress(0.3, desc="Starting OpenEvolve optimization (5 iterations)...")
 
     output_dir = os.path.join(work_dir, "output")
     os.makedirs(output_dir, exist_ok=True)
@@ -934,12 +934,12 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    # Final evaluation: Use same 200 samples as initial eval for fair comparison
-    progress(0.85, desc="Evaluating best prompt on 200 samples (same as initial)...")
+    # Final evaluation: Use same 50 samples as initial eval for fair comparison
+    progress(0.85, desc="Evaluating best prompt on 50 samples (same as initial)...")
     final_eval = evaluate_prompt(
-        best_prompt, dataset_name, dataset_split, 200,
+        best_prompt, dataset_name, dataset_split, 50,
         model, input_field, target_field,
-        fixed_indices=eval_indices  # Use same 200 samples as initial eval!
+        fixed_indices=eval_indices  # Use same 50 samples as initial eval!
     )
 
     progress(0.95, desc=f"Evaluation complete: {final_eval['correct']}/{final_eval['total']} = {final_eval['accuracy']:.1f}%")
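The `fixed_indices` keyword is what makes the before/after numbers comparable. A minimal sketch of the sample-reuse contract implied by the two call sites (the function name here is illustrative, not the app's full `evaluate_prompt`):

```python
import random

def select_eval_indices(dataset_len, num_samples, fixed_indices=None):
    """Sketch: reuse caller-supplied indices when given, else draw a seed-42 sample."""
    if fixed_indices is not None:
        return fixed_indices                 # final eval: reuse the saved samples
    random.seed(42)                          # initial eval: reproducible draw
    if dataset_len <= num_samples:
        return list(range(dataset_len))
    return random.sample(range(dataset_len), num_samples)

initial = select_eval_indices(10_000, 50)    # saved as eval_indices after initial eval
final = select_eval_indices(10_000, 50, fixed_indices=initial)
assert initial == final                      # guarantees a fair before/after comparison
```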
@@ -952,6 +952,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 {best_prompt}
 ```
 
+**Validation:**
+- Contains {{input}} placeholder: {'✓ Yes' if '{input}' in best_prompt else '❌ NO - This will break evaluation!'}
+- Prompt length: {len(best_prompt)} characters
+
 **Results:**
 - Accuracy: {final_eval['accuracy']:.2f}%
 - Correct: {final_eval['correct']}/{final_eval['total']}
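The new Validation block checks that the evolved prompt still contains the literal `{input}` placeholder, since a prompt that drops it cannot be formatted with test inputs. A standalone sketch of that check (the function name is invented for illustration):

```python
def validate_prompt(best_prompt):
    """Sketch of the check added in this commit: the evolved prompt must keep
    the literal '{input}' placeholder or downstream formatting will fail."""
    has_placeholder = "{input}" in best_prompt
    status = "✓ Yes" if has_placeholder else "❌ NO - This will break evaluation!"
    return (
        "**Validation:**\n"
        f"- Contains {{input}} placeholder: {status}\n"
        f"- Prompt length: {len(best_prompt)} characters\n"
    )

print(validate_prompt("Answer the question: {input}"))
```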
@@ -971,10 +975,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Model**: {model}
-- **Initial Eval**: 200 samples
-- **Final Eval**: 200 samples (same samples for fair comparison)
-- **Evolution**: 200 samples per variant (same samples as initial/final)
-- **Iterations**: 10
+- **Initial Eval**: 50 samples
+- **Final Eval**: 50 samples (same samples for fair comparison)
+- **Evolution**: 50 samples per variant (same samples as initial/final)
+- **Iterations**: 5
 
 ### Results
 - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
 