Upload app.py
app.py
CHANGED
@@ -511,11 +511,11 @@ def parse_evolution_history(output_dir: str) -> str:
 # If no specific files found, show directory contents
 if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
 evolution_viz += "### Evolution Complete\n\n"
-evolution_viz += "OpenEvolve ran
+evolution_viz += "OpenEvolve ran 5 iterations of evolutionary optimization using:\n"
 evolution_viz += "- **Population Size**: 10 prompts per generation\n"
 evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n"
 evolution_viz += "- **Islands**: 1 population with mutation and crossover\n"
-evolution_viz += "- **Evaluation**:
+evolution_viz += "- **Evaluation**: 50 samples per prompt variant\n\n"

 # Count files in output directory
 all_files = os.listdir(output_dir)
@@ -538,10 +538,10 @@ from openai import OpenAI

 def evaluate(prompt: str) -> dict:
 """
-Evaluate a prompt using
+Evaluate a prompt using 50 fixed samples (same as initial/final eval).

 This ensures evolution optimizes for the SAME test set we measure on.
-No staging - always evaluates all
+No staging - always evaluates all 50 samples for consistency.
 """
 try:
 # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
@@ -559,8 +559,8 @@ def evaluate(prompt: str) -> dict:
 else:
 raise

-# Sample
-num_samples =
+# Sample 50 samples with seed 42 (SAME as initial/final eval)
+num_samples = 50
 if len(dataset) > num_samples:
 # Use SAME sampling logic as initial/final eval
 indices = random.sample(range(len(dataset)), num_samples)
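The hunks above depend on every evaluation drawing the same 50 examples. A minimal sketch of that sampling pattern, assuming the fixed seed of 42 mentioned in the comments; the helper name sample_eval_indices and the use of a local random.Random are illustrative, not taken from app.py:

```python
import random

def sample_eval_indices(dataset_size: int, num_samples: int = 50, seed: int = 42) -> list:
    """Return a reproducible list of evaluation indices.

    Seeding the RNG before sampling means the initial evaluation, every
    evolution-time evaluation, and the final evaluation all score the same
    subset, so their accuracies are directly comparable.
    """
    rng = random.Random(seed)             # local RNG; avoids touching global state
    if dataset_size <= num_samples:
        return list(range(dataset_size))  # small split: just use everything
    return rng.sample(range(dataset_size), num_samples)

# Two independent calls yield the same 50 indices for a 1,000-example split.
assert sample_eval_indices(1000) == sample_eval_indices(1000)
```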
@@ -765,8 +765,8 @@ Your improved prompt here
 "api_base": "https://openrouter.ai/api/v1", # Use OpenRouter endpoint
 "temperature": 0.7,
 },
-"max_iterations":
-"checkpoint_interval":
+"max_iterations": 5,
+"checkpoint_interval": 1, # Save checkpoints every iteration to preserve prompt history
 "diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
 "language": "text", # CRITICAL: Optimize text/prompts, not Python code!
 "max_code_length": 40000, # Allow long prompts (default 10000 is too short)
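create_config_file itself is not part of this diff; a rough sketch of how the settings above could be written out for OpenEvolve follows, assuming a YAML config file and PyYAML. The key names inside llm and the config.yaml filename are assumptions, not confirmed by app.py:

```python
import os
import yaml  # PyYAML

def create_config_file_sketch(model: str, work_dir: str) -> str:
    """Hypothetical stand-in for create_config_file(): dump the settings
    shown in the diff to work_dir/config.yaml and return the path."""
    config = {
        "llm": {
            "primary_model": model,                      # assumed key name
            "api_base": "https://openrouter.ai/api/v1",  # OpenRouter endpoint
            "temperature": 0.7,
        },
        "max_iterations": 5,
        "checkpoint_interval": 1,       # checkpoint after every iteration
        "diff_based_evolution": False,  # full-rewrite mode for prompt text
        "language": "text",             # optimize prompts, not Python code
        "max_code_length": 40000,       # allow long prompts
    }
    config_path = os.path.join(work_dir, "config.yaml")
    with open(config_path, "w") as f:
        yaml.safe_dump(config, f, sort_keys=False)
    return config_path
```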
@@ -835,11 +835,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 progress(0.15, desc="Creating configuration...")
 config_path = create_config_file(model, work_dir)

-# Run initial evaluation with
+# Run initial evaluation with 50 samples
 # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-progress(0.2, desc="Running initial evaluation on
+progress(0.2, desc="Running initial evaluation on 50 samples...")
 initial_eval = evaluate_prompt(
-initial_prompt, dataset_name, dataset_split,
+initial_prompt, dataset_name, dataset_split, 50,
 model, input_field, target_field
 )

@@ -873,7 +873,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

 # Run OpenEvolve
-progress(0.3, desc="Starting OpenEvolve optimization (
+progress(0.3, desc="Starting OpenEvolve optimization (5 iterations)...")

 output_dir = os.path.join(work_dir, "output")
 os.makedirs(output_dir, exist_ok=True)
@@ -934,12 +934,12 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 else:
 best_prompt = initial_prompt

-# Final evaluation: Use same
-progress(0.85, desc="Evaluating best prompt on
+# Final evaluation: Use same 50 samples as initial eval for fair comparison
+progress(0.85, desc="Evaluating best prompt on 50 samples (same as initial)...")
 final_eval = evaluate_prompt(
-best_prompt, dataset_name, dataset_split,
+best_prompt, dataset_name, dataset_split, 50,
 model, input_field, target_field,
-fixed_indices=eval_indices # Use same
+fixed_indices=eval_indices # Use same 50 samples as initial eval!
 )

 progress(0.95, desc=f"Evaluation complete: {final_eval['correct']}/{final_eval['total']} = {final_eval['accuracy']:.1f}%")
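The before/after comparison relies on evaluate_prompt accepting an optional fixed_indices argument so the final run scores exactly the samples chosen for the initial run. A sketch of that contract under assumed names (score_one is a stub standing in for the real model call and answer check):

```python
import random
from typing import Optional, Sequence

def score_one(prompt: str, example: dict) -> bool:
    """Stub: the real app queries the model and compares against the target field."""
    return bool(example.get("is_correct"))

def evaluate_prompt_sketch(prompt: str, dataset: Sequence[dict], num_samples: int = 50,
                           seed: int = 42, fixed_indices: Optional[list] = None) -> dict:
    """Sample once with a fixed seed, or reuse indices handed in by the caller."""
    if fixed_indices is None:
        rng = random.Random(seed)
        fixed_indices = (rng.sample(range(len(dataset)), num_samples)
                         if len(dataset) > num_samples else list(range(len(dataset))))
    correct = sum(score_one(prompt, dataset[i]) for i in fixed_indices)
    return {"correct": correct, "total": len(fixed_indices),
            "accuracy": 100.0 * correct / len(fixed_indices), "indices": fixed_indices}

# The initial run picks the indices; the final run reuses them for a fair comparison:
#   initial = evaluate_prompt_sketch(initial_prompt, data)
#   final   = evaluate_prompt_sketch(best_prompt, data, fixed_indices=initial["indices"])
```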
@@ -952,6 +952,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 {best_prompt}
 ```

+**Validation:**
+- Contains {{input}} placeholder: {'✓ Yes' if '{input}' in best_prompt else '✗ NO - This will break evaluation!'}
+- Prompt length: {len(best_prompt)} characters
+
 **Results:**
 - Accuracy: {final_eval['accuracy']:.2f}%
 - Correct: {final_eval['correct']}/{final_eval['total']}
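The new validation lines flag prompts that lost their {input} placeholder, since evaluation substitutes each dataset question into that slot. A small sketch of the check and of the substitution it protects, assuming str.replace-based templating (str.format would choke on other literal braces in an evolved prompt):

```python
def validate_prompt(prompt: str) -> list:
    """Return human-readable warnings; an empty list means the prompt is usable."""
    warnings = []
    if "{input}" not in prompt:
        warnings.append("Missing {input} placeholder - evaluation cannot insert the question")
    return warnings

def render_prompt(prompt: str, question: str) -> str:
    """Substitute the dataset input into the prompt template via str.replace."""
    return prompt.replace("{input}", question)

evolved = "Solve the problem step by step.\n\nProblem: {input}\nAnswer:"
assert validate_prompt(evolved) == []
print(render_prompt(evolved, "What is 17 * 24?"))
```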
@@ -971,10 +975,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Model**: {model}
-- **Initial Eval**:
-- **Final Eval**:
-- **Evolution**:
-- **Iterations**:
+- **Initial Eval**: 50 samples
+- **Final Eval**: 50 samples (same samples for fair comparison)
+- **Evolution**: 50 samples per variant (same samples as initial/final)
+- **Iterations**: 5

 ### Results
 - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})