Upload app.py
app.py CHANGED
@@ -187,8 +187,25 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
         target_str = str(target).lower().strip()
         pred_lower = prediction.lower()
 
+        # Extract numeric answer from GSM8K format (e.g., "#### 42")
+        def extract_numeric_answer(text):
+            """Extract final numeric answer from GSM8K format or general text"""
+            # Try GSM8K format first: #### NUMBER
+            match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            # Otherwise try to find any number in the text
+            match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            return text.strip()
+
+        # Extract numeric answers for comparison
+        target_numeric = extract_numeric_answer(str(target))
+        pred_numeric = extract_numeric_answer(prediction)
+
         # Check exact match first
-        is_correct = target_str in pred_lower
+        is_correct = target_str in pred_lower or pred_numeric == target_numeric
 
         # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
         if not is_correct:
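For readers skimming the diff, here is the new matching logic pulled out into a standalone sketch. extract_numeric_answer and the is_correct expression are copied from the hunk above; the sample target and prediction strings are made up purely for illustration:

import re

def extract_numeric_answer(text):
    """Extract final numeric answer from GSM8K format or general text"""
    # Try GSM8K format first: #### NUMBER
    match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
    if match:
        return match.group(1).replace(',', '').strip()
    # Otherwise try to find any number in the text
    match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
    if match:
        return match.group(1).replace(',', '').strip()
    return text.strip()

# Made-up sample values, not taken from the dataset
target = "6 + 7 = 13 apples in total.\n#### 13"
prediction = "Counting both baskets gives 13 apples."

target_str = str(target).lower().strip()
pred_lower = prediction.lower()
target_numeric = extract_numeric_answer(str(target))   # "13", from the #### marker
pred_numeric = extract_numeric_answer(prediction)      # "13", first number in the text
is_correct = target_str in pred_lower or pred_numeric == target_numeric
print(is_correct)  # True: the numeric comparison succeeds even though the texts differ

One caveat worth noting: the fallback branch returns the first number re.search finds, so a prediction that walks through intermediate calculations before stating its final answer can extract the wrong value.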
@@ -441,6 +458,7 @@ def create_evaluator_file(dataset_name: str, split: str, model: str,
     """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
     evaluator_code = f'''
 import os
+import re
 import random
 from datasets import load_dataset
 from openai import OpenAI
@@ -505,8 +523,25 @@ def evaluate(prompt: str) -> dict:
         target_str = str(target).lower().strip()
         pred_lower = prediction.lower()
 
+        # Extract numeric answer from GSM8K format (e.g., "#### 42")
+        def extract_numeric_answer(text):
+            """Extract final numeric answer from GSM8K format or general text"""
+            # Try GSM8K format first: #### NUMBER
+            match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            # Otherwise try to find any number in the text
+            match = re.search(r'(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            return text.strip()
+
+        # Extract numeric answers for comparison
+        target_numeric = extract_numeric_answer(str(target))
+        pred_numeric = extract_numeric_answer(prediction)
+
         # Check exact match first
-        is_correct = target_str in pred_lower
+        is_correct = target_str in pred_lower or pred_numeric == target_numeric
 
         # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
         if not is_correct:
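The doubled backslashes and braces in this hunk are not a typo: the pattern sits inside the evaluator_code f-string, so Python collapses \\ to \ and {{ }} to { } when the template is rendered, and the generated evaluator.py ends up with exactly the same regex as the in-app version in the first hunk. A standalone sketch (not from app.py) of how that rendering works:

# How the escaped pattern inside an f-string template renders (illustrative only)
template_line = f'''match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)'''
print(template_line)
# Output: match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)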
@@ -929,12 +964,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2.
-    3. Specify the dataset split and field names
+    2. Default dataset is **GSM8K** (grade school math problems) - challenging for showing improvement!
+    3. Specify the dataset split and field names (or use other datasets like `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance!
+    7. Compare initial vs. evolved performance - expect 20-40% improvement on GSM8K!
 
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
@@ -952,34 +987,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
-                value="
-                placeholder="e.g., stanfordnlp/imdb, openai/gsm8k
+                value="gsm8k",
+                placeholder="e.g., gsm8k, stanfordnlp/imdb, openai/gsm8k",
                 info="Full dataset name from HuggingFace Hub (org/dataset-name or dataset-name)"
             )
 
             dataset_split = gr.Textbox(
                 label="Dataset Split",
-                value="
+                value="train",
                 placeholder="e.g., train, test, validation"
             )
 
             input_field = gr.Textbox(
                 label="Input Field Name",
-                value="
-                placeholder="e.g.,
+                value="question",
+                placeholder="e.g., question, text, context",
                 info="The field containing inputs to process"
             )
 
             target_field = gr.Textbox(
                 label="Target Field Name",
-                value="
-                placeholder="e.g.,
+                value="answer",
+                placeholder="e.g., answer, label, target",
                 info="The field containing expected outputs"
             )
 
             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
-                value="{input}\n\
+                value="{input}\n\nAnswer:",
                 lines=6,
                 info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )
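The new defaults line up with the GSM8K schema: each record has a question field and an answer field whose last line carries the "#### <number>" marker that extract_numeric_answer looks for. A quick manual check with the datasets library (the "main" config name here is an assumption about loading GSM8K by hand; the Space's own loader may pass different arguments):

from datasets import load_dataset

# Load GSM8K training examples to confirm the field names used as UI defaults
ds = load_dataset("openai/gsm8k", "main", split="train")
sample = ds[0]

print(sample["question"][:80])            # input_field = "question"
print(sample["answer"].splitlines()[-1])  # target_field = "answer"; last line looks like "#### <number>"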