codelion committed on
Commit
bd94785
·
verified ·
1 Parent(s): 741a123

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -13
app.py CHANGED
@@ -187,8 +187,25 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
187
  target_str = str(target).lower().strip()
188
  pred_lower = prediction.lower()
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # Check exact match first
191
- is_correct = target_str in pred_lower
192
 
193
  # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
194
  if not is_correct:
@@ -441,6 +458,7 @@ def create_evaluator_file(dataset_name: str, split: str, model: str,
441
  """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
442
  evaluator_code = f'''
443
  import os
 
444
  import random
445
  from datasets import load_dataset
446
  from openai import OpenAI
@@ -505,8 +523,25 @@ def evaluate(prompt: str) -> dict:
505
  target_str = str(target).lower().strip()
506
  pred_lower = prediction.lower()
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  # Check exact match first
509
- is_correct = target_str in pred_lower
510
 
511
  # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
512
  if not is_correct:
@@ -929,12 +964,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
929
 
930
  ## How it works:
931
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
932
- 2. Enter the full HuggingFace dataset name (e.g., `stanfordnlp/imdb`, `gsm8k`)
933
- 3. Specify the dataset split and field names
934
  4. Choose a free model from OpenRouter
935
  5. Click "Optimize Prompt" - the system will validate everything first!
936
  6. Watch the evolution progress in real-time
937
- 7. Compare initial vs. evolved performance!
938
 
939
  **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
940
  """)
@@ -952,34 +987,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
952
 
953
  dataset_name = gr.Textbox(
954
  label="HuggingFace Dataset (Full Name)",
955
- value="stanfordnlp/imdb",
956
- placeholder="e.g., stanfordnlp/imdb, openai/gsm8k, SetFit/sst5",
957
  info="Full dataset name from HuggingFace Hub (org/dataset-name or dataset-name)"
958
  )
959
 
960
  dataset_split = gr.Textbox(
961
  label="Dataset Split",
962
- value="test",
963
  placeholder="e.g., train, test, validation"
964
  )
965
 
966
  input_field = gr.Textbox(
967
  label="Input Field Name",
968
- value="text",
969
- placeholder="e.g., text, question, context",
970
  info="The field containing inputs to process"
971
  )
972
 
973
  target_field = gr.Textbox(
974
  label="Target Field Name",
975
- value="label",
976
- placeholder="e.g., label, answer, target",
977
  info="The field containing expected outputs"
978
  )
979
 
980
  initial_prompt = gr.TextArea(
981
  label="Initial Prompt",
982
- value="{input}\n\nSentiment:",
983
  lines=6,
984
  info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
985
  )
 
187
  target_str = str(target).lower().strip()
188
  pred_lower = prediction.lower()
189
 
190
+ # Extract numeric answer from GSM8K format (e.g., "#### 42")
191
+ def extract_numeric_answer(text):
192
+ """Extract final numeric answer from GSM8K format or general text"""
193
+ # Try GSM8K format first: #### NUMBER
194
+ match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
195
+ if match:
196
+ return match.group(1).replace(',', '').strip()
197
+ # Otherwise try to find any number in the text
198
+ match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
199
+ if match:
200
+ return match.group(1).replace(',', '').strip()
201
+ return text.strip()
202
+
203
+ # Extract numeric answers for comparison
204
+ target_numeric = extract_numeric_answer(str(target))
205
+ pred_numeric = extract_numeric_answer(prediction)
206
+
207
  # Check exact match first
208
+ is_correct = target_str in pred_lower or pred_numeric == target_numeric
209
 
210
  # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
211
  if not is_correct:
 
458
  """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
459
  evaluator_code = f'''
460
  import os
461
+ import re
462
  import random
463
  from datasets import load_dataset
464
  from openai import OpenAI
 
523
  target_str = str(target).lower().strip()
524
  pred_lower = prediction.lower()
525
 
526
+ # Extract numeric answer from GSM8K format (e.g., "#### 42")
527
+ def extract_numeric_answer(text):
528
+ """Extract final numeric answer from GSM8K format or general text"""
529
+ # Try GSM8K format first: #### NUMBER
530
+ match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
531
+ if match:
532
+ return match.group(1).replace(',', '').strip()
533
+ # Otherwise try to find any number in the text
534
+ match = re.search(r'(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
535
+ if match:
536
+ return match.group(1).replace(',', '').strip()
537
+ return text.strip()
538
+
539
+ # Extract numeric answers for comparison
540
+ target_numeric = extract_numeric_answer(str(target))
541
+ pred_numeric = extract_numeric_answer(prediction)
542
+
543
  # Check exact match first
544
+ is_correct = target_str in pred_lower or pred_numeric == target_numeric
545
 
546
  # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
547
  if not is_correct:
 
964
 
965
  ## How it works:
966
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
967
+ 2. Default dataset is **GSM8K** (grade school math problems) - challenging for showing improvement!
968
+ 3. Specify the dataset split and field names (or use other datasets like `stanfordnlp/imdb`)
969
  4. Choose a free model from OpenRouter
970
  5. Click "Optimize Prompt" - the system will validate everything first!
971
  6. Watch the evolution progress in real-time
972
+ 7. Compare initial vs. evolved performance - expect 20-40% improvement on GSM8K!
973
 
974
  **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
975
  """)
 
987
 
988
  dataset_name = gr.Textbox(
989
  label="HuggingFace Dataset (Full Name)",
990
+ value="gsm8k",
991
+ placeholder="e.g., gsm8k, stanfordnlp/imdb, openai/gsm8k",
992
  info="Full dataset name from HuggingFace Hub (org/dataset-name or dataset-name)"
993
  )
994
 
995
  dataset_split = gr.Textbox(
996
  label="Dataset Split",
997
+ value="train",
998
  placeholder="e.g., train, test, validation"
999
  )
1000
 
1001
  input_field = gr.Textbox(
1002
  label="Input Field Name",
1003
+ value="question",
1004
+ placeholder="e.g., question, text, context",
1005
  info="The field containing inputs to process"
1006
  )
1007
 
1008
  target_field = gr.Textbox(
1009
  label="Target Field Name",
1010
+ value="answer",
1011
+ placeholder="e.g., answer, label, target",
1012
  info="The field containing expected outputs"
1013
  )
1014
 
1015
  initial_prompt = gr.TextArea(
1016
  label="Initial Prompt",
1017
+ value="{input}\n\nAnswer:",
1018
  lines=6,
1019
  info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
1020
  )