Upload app.py
app.py
CHANGED
@@ -135,8 +135,15 @@ def validate_inputs(dataset_name: str, split: str, input_field: str, target_field
 
 
 def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
-                    model: str, input_field: str, target_field: str) -> Dict:
-
+                    model: str, input_field: str, target_field: str,
+                    fixed_indices: List[int] = None) -> Dict:
+    """
+    Evaluate a prompt on a dataset using the selected model.
+
+    Args:
+        fixed_indices: Optional list of dataset indices to use. If provided,
+                       ensures we evaluate on the SAME samples every time.
+    """
     try:
         # Get API key from environment
         api_key = os.environ.get("OPENAI_API_KEY")
@@ -160,11 +167,18 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             else:
                 raise
 
-        # Sample
-        if len(dataset) > num_samples:
+        # Sample examples - use fixed indices if provided to ensure consistency
+        if fixed_indices is not None:
+            # Use the provided indices (ensures same samples for initial/final eval)
+            indices = fixed_indices
+            samples = [dataset[i] for i in indices]
+        elif len(dataset) > num_samples:
+            # First time: use fixed seed for reproducible sampling
+            random.seed(42)  # Fixed seed ensures same samples across runs
             indices = random.sample(range(len(dataset)), num_samples)
             samples = [dataset[i] for i in indices]
         else:
+            indices = list(range(min(num_samples, len(dataset))))
             samples = list(dataset)[:num_samples]
 
         # Initialize OpenAI client with OpenRouter
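The new sampling branch above is the heart of the change: an evaluation either replays caller-supplied indices or draws a seeded random subset, so repeated runs score the same examples. A minimal standalone sketch of that logic, assuming a plain list-like dataset (the helper name `pick_samples` is illustrative and not part of app.py):

```python
import random
from typing import Dict, List, Optional, Tuple

def pick_samples(dataset: List[Dict], num_samples: int,
                 fixed_indices: Optional[List[int]] = None) -> Tuple[List[int], List[Dict]]:
    """Return (indices, samples); reusing the returned indices replays the exact subset."""
    if fixed_indices is not None:
        # Caller supplied indices (e.g. from a previous evaluation) - reuse them verbatim
        indices = list(fixed_indices)
    elif len(dataset) > num_samples:
        random.seed(42)  # fixed seed: the first draw is reproducible across runs
        indices = random.sample(range(len(dataset)), num_samples)
    else:
        # Dataset smaller than the request: take everything in order
        indices = list(range(min(num_samples, len(dataset))))
    return indices, [dataset[i] for i in indices]

data = [{"question": str(i)} for i in range(100)]
idx, initial_batch = pick_samples(data, 20)                 # first evaluation draws the subset
_, final_batch = pick_samples(data, 20, fixed_indices=idx)  # second evaluation replays it
assert final_batch == initial_batch
```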
@@ -285,7 +299,8 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             "accuracy": accuracy,
             "correct": correct,
             "total": total,
-            "results": results
+            "results": results,
+            "indices": indices  # Return indices so we can reuse them for final eval
         }
 
         # Add errors if any occurred
@@ -519,6 +534,9 @@ def evaluate(prompt: str) -> dict:
     Returns dict with combined_score (0-1), accuracy, correct, and total.
     """
     try:
+        # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
+        random.seed(42)
+
         # Load dataset
         # Try loading with just dataset name first
         try:
@@ -780,18 +798,20 @@ Your improved prompt here
         "diff_based_evolution": False,  # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text",  # CRITICAL: Optimize text/prompts, not Python code!
         "max_code_length": 40000,  # Allow long prompts (default 10000 is too short)
+        "num_islands": 1,  # IMPORTANT: Use only 1 island (not 5) for simpler evolution
         "prompt": {
             "template_dir": templates_dir,  # Use our custom prompt engineering templates
         },
         "evolution": {
             "population_size": 10,
-            "num_islands": 1,
+            "num_islands": 1,  # Single island for simpler evolution
             "elite_ratio": 0.1,
             "explore_ratio": 0.3,
             "exploit_ratio": 0.6,
         },
         "database": {
             "log_prompts": True,  # Save prompts used to generate each program
+            "num_islands": 1,  # CRITICAL: This is where island count is actually read from!
         },
         "evaluator": {
             "timeout": 3600,  # 1 hour timeout (effectively disabled, but prevents NoneType arithmetic errors)
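Because the island count now appears in three sections of this config (top level, `evolution`, and `database`), the values can silently drift apart in a later edit. A hedged sketch of one way to keep them in sync from a single constant; `build_config` below is illustrative and not a function in app.py:

```python
# Illustrative sketch: define the island count once and inject it everywhere the
# config expects it, so the three settings cannot disagree.
NUM_ISLANDS = 1  # single island for simpler evolution

def build_config(templates_dir: str) -> dict:
    return {
        "diff_based_evolution": False,
        "language": "text",
        "max_code_length": 40000,
        "num_islands": NUM_ISLANDS,
        "prompt": {"template_dir": templates_dir},
        "evolution": {
            "population_size": 10,
            "num_islands": NUM_ISLANDS,
            "elite_ratio": 0.1,
            "explore_ratio": 0.3,
            "exploit_ratio": 0.6,
        },
        "database": {
            "log_prompts": True,
            "num_islands": NUM_ISLANDS,  # per the diff comment, the value actually read
        },
    }

config = build_config("./templates")
assert {config["num_islands"], config["evolution"]["num_islands"],
        config["database"]["num_islands"]} == {NUM_ISLANDS}
```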
@@ -844,6 +864,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     config_path = create_config_file(model, work_dir)
 
     # Run initial evaluation (using 20 samples to save API calls)
+    # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
     progress(0.2, desc="Running initial evaluation on 20 samples...")
     initial_eval = evaluate_prompt(
         initial_prompt, dataset_name, dataset_split, 20,
@@ -856,6 +877,9 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     if initial_eval["total"] == 0:
         return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", "", "", [initial_prompt], 0, 1
 
+    # Save the indices for final evaluation (ensures fair comparison)
+    eval_indices = initial_eval.get("indices", [])
+
     initial_results = f"""
 ### Initial Prompt Evaluation
 
@@ -938,10 +962,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    # Evaluate best prompt
+    # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
     final_eval = evaluate_prompt(
         best_prompt, dataset_name, dataset_split, 20,
-        model, input_field, target_field
+        model, input_field, target_field,
+        fixed_indices=eval_indices  # Use same samples as initial eval!
    )
 
    final_results = f"""
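The initial and final evaluations above follow a simple contract: the first call reports the indices it sampled, and the second call replays them via `fixed_indices`, so the before/after accuracies are measured on identical examples. A self-contained sketch of that round trip using a stand-in scorer (`fake_evaluate` is a stub for illustration; the real `evaluate_prompt` queries the model through OpenRouter):

```python
import random
from typing import Dict, List, Optional

def fake_evaluate(prompt: str, dataset: List[int], num_samples: int,
                  fixed_indices: Optional[List[int]] = None) -> Dict:
    """Stand-in for evaluate_prompt: scores a subset and reports which indices it used."""
    if fixed_indices is not None:
        indices = list(fixed_indices)   # replay the earlier subset
    else:
        random.seed(42)                 # seeded first draw
        indices = random.sample(range(len(dataset)), num_samples)
    correct = sum(1 for i in indices if (dataset[i] + len(prompt)) % 2 == 0)  # dummy scoring
    return {"accuracy": correct / len(indices), "correct": correct,
            "total": len(indices), "indices": indices}

dataset = list(range(200))
initial_eval = fake_evaluate("initial prompt", dataset, 20)
eval_indices = initial_eval.get("indices", [])
final_eval = fake_evaluate("evolved prompt, hopefully better", dataset, 20,
                           fixed_indices=eval_indices)
assert final_eval["indices"] == initial_eval["indices"]  # same samples -> fair comparison
```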