codelion committed
Commit 19d1d68 · verified · 1 Parent(s): ea59941

Upload app.py

Files changed (1)
  1. app.py +33 -8
app.py CHANGED
@@ -135,8 +135,15 @@ def validate_inputs(dataset_name: str, split: str, input_field: str, target_fiel
 
 
 def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
-                    model: str, input_field: str, target_field: str) -> Dict:
-    """Evaluate a prompt on a dataset using the selected model."""
+                    model: str, input_field: str, target_field: str,
+                    fixed_indices: List[int] = None) -> Dict:
+    """
+    Evaluate a prompt on a dataset using the selected model.
+
+    Args:
+        fixed_indices: Optional list of dataset indices to use. If provided,
+                       ensures we evaluate on the SAME samples every time.
+    """
     try:
         # Get API key from environment
         api_key = os.environ.get("OPENAI_API_KEY")
@@ -160,11 +167,18 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
         else:
             raise
 
-        # Sample random examples
-        if len(dataset) > num_samples:
+        # Sample examples - use fixed indices if provided to ensure consistency
+        if fixed_indices is not None:
+            # Use the provided indices (ensures same samples for initial/final eval)
+            indices = fixed_indices
+            samples = [dataset[i] for i in indices]
+        elif len(dataset) > num_samples:
+            # First time: use fixed seed for reproducible sampling
+            random.seed(42)  # Fixed seed ensures same samples across runs
             indices = random.sample(range(len(dataset)), num_samples)
             samples = [dataset[i] for i in indices]
         else:
+            indices = list(range(min(num_samples, len(dataset))))
             samples = list(dataset)[:num_samples]
 
         # Initialize OpenAI client with OpenRouter
@@ -285,7 +299,8 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             "accuracy": accuracy,
             "correct": correct,
             "total": total,
-            "results": results
+            "results": results,
+            "indices": indices  # Return indices so we can reuse them for final eval
         }
 
         # Add errors if any occurred
@@ -519,6 +534,9 @@ def evaluate(prompt: str) -> dict:
     Returns dict with combined_score (0-1), accuracy, correct, and total.
     """
     try:
+        # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
+        random.seed(42)
+
         # Load dataset
         # Try loading with just dataset name first
         try:
@@ -780,18 +798,20 @@ Your improved prompt here
         "diff_based_evolution": False,  # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text",  # CRITICAL: Optimize text/prompts, not Python code!
         "max_code_length": 40000,  # Allow long prompts (default 10000 is too short)
+        "num_islands": 1,  # IMPORTANT: Use only 1 island (not 5) for simpler evolution
         "prompt": {
             "template_dir": templates_dir,  # Use our custom prompt engineering templates
         },
         "evolution": {
             "population_size": 10,
-            "num_islands": 1,
+            "num_islands": 1,  # Single island for simpler evolution
             "elite_ratio": 0.1,
             "explore_ratio": 0.3,
             "exploit_ratio": 0.6,
         },
         "database": {
             "log_prompts": True,  # Save prompts used to generate each program
+            "num_islands": 1,  # CRITICAL: This is where island count is actually read from!
         },
         "evaluator": {
             "timeout": 3600,  # 1 hour timeout (effectively disabled, but prevents NoneType arithmetic errors)
@@ -844,6 +864,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     config_path = create_config_file(model, work_dir)
 
     # Run initial evaluation (using 20 samples to save API calls)
+    # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
     progress(0.2, desc="Running initial evaluation on 20 samples...")
     initial_eval = evaluate_prompt(
         initial_prompt, dataset_name, dataset_split, 20,
@@ -856,6 +877,9 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     if initial_eval["total"] == 0:
         return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", "", "", [initial_prompt], 0, 1
 
+    # Save the indices for final evaluation (ensures fair comparison)
+    eval_indices = initial_eval.get("indices", [])
+
     initial_results = f"""
     ### Initial Prompt Evaluation
 
@@ -938,10 +962,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    # Evaluate best prompt (using 20 samples like initial eval for consistency)
+    # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
     final_eval = evaluate_prompt(
         best_prompt, dataset_name, dataset_split, 20,
-        model, input_field, target_field
+        model, input_field, target_field,
+        fixed_indices=eval_indices  # Use same samples as initial eval!
     )
 
     final_results = f"""
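
Taken together, the change makes the before/after comparison deterministic: the initial evaluation seeds the RNG, records which dataset indices it scored, and the final evaluation replays exactly those indices. Below is a minimal, self-contained sketch of that pattern, not code from app.py; toy_dataset, score_prompt, and evaluate are hypothetical stand-ins for the real dataset and model call.

# Sketch of the "seed once, reuse indices" evaluation pattern (hypothetical names).
import random
from typing import Dict, List, Optional

# Stand-in for a loaded dataset of question/answer records.
toy_dataset = [{"question": f"2 + {i} = ?", "answer": str(2 + i)} for i in range(100)]

def score_prompt(prompt: str, sample: Dict) -> bool:
    # Placeholder for the real model call: just checks the answer appears in the text.
    return sample["answer"] in prompt + sample["question"]

def evaluate(prompt: str, num_samples: int,
             fixed_indices: Optional[List[int]] = None) -> Dict:
    if fixed_indices is not None:
        indices = fixed_indices                      # reuse the exact same examples
    else:
        random.seed(42)                              # reproducible first draw
        indices = random.sample(range(len(toy_dataset)), num_samples)
    correct = sum(score_prompt(prompt, toy_dataset[i]) for i in indices)
    return {"accuracy": correct / len(indices), "indices": indices}

initial = evaluate("Answer the question.", 20)
final = evaluate("Answer with just the number.", 20,
                 fixed_indices=initial["indices"])   # like-for-like comparison
print(initial["accuracy"], final["accuracy"])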