Upload app.py
app.py CHANGED
@@ -187,8 +187,25 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
         target_str = str(target).lower().strip()
         pred_lower = prediction.lower()
 
+        # Extract numeric answer from GSM8K format (e.g., "#### 42")
+        def extract_numeric_answer(text):
+            """Extract final numeric answer from GSM8K format or general text"""
+            # Try GSM8K format first: #### NUMBER
+            match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            # Otherwise try to find any number in the text
+            match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            return text.strip()
+
+        # Extract numeric answers for comparison
+        target_numeric = extract_numeric_answer(str(target))
+        pred_numeric = extract_numeric_answer(prediction)
+
         # Check exact match first
-        is_correct = target_str in pred_lower
+        is_correct = target_str in pred_lower or pred_numeric == target_numeric
 
         # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
         if not is_correct:
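For readers skimming the diff, here is the new matching logic pulled out into a standalone sketch. extract_numeric_answer and the is_correct expression are copied from the hunk above; the sample target and prediction strings are made up purely for illustration:

import re

def extract_numeric_answer(text):
    """Extract final numeric answer from GSM8K format or general text"""
    # Try GSM8K format first: #### NUMBER
    match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
    if match:
        return match.group(1).replace(',', '').strip()
    # Otherwise try to find any number in the text
    match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
    if match:
        return match.group(1).replace(',', '').strip()
    return text.strip()

# Made-up sample values, not taken from the dataset
target = "6 + 7 = 13 apples in total.\n#### 13"
prediction = "Counting both baskets gives 13 apples."

target_str = str(target).lower().strip()
pred_lower = prediction.lower()
target_numeric = extract_numeric_answer(str(target))   # "13", from the #### marker
pred_numeric = extract_numeric_answer(prediction)      # "13", first number in the text
is_correct = target_str in pred_lower or pred_numeric == target_numeric
print(is_correct)  # True: the numeric comparison succeeds even though the texts differ

One caveat worth noting: the fallback branch returns the first number re.search finds, so a prediction that walks through intermediate calculations before stating its final answer can extract the wrong value.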
@@ -441,6 +458,7 @@ def create_evaluator_file(dataset_name: str, split: str, model: str,
     """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
     evaluator_code = f'''
 import os
+import re
 import random
 from datasets import load_dataset
 from openai import OpenAI
@@ -505,8 +523,25 @@ def evaluate(prompt: str) -> dict:
         target_str = str(target).lower().strip()
         pred_lower = prediction.lower()
 
+        # Extract numeric answer from GSM8K format (e.g., "#### 42")
+        def extract_numeric_answer(text):
+            """Extract final numeric answer from GSM8K format or general text"""
+            # Try GSM8K format first: #### NUMBER
+            match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            # Otherwise try to find any number in the text
+            match = re.search(r'(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
+            if match:
+                return match.group(1).replace(',', '').strip()
+            return text.strip()
+
+        # Extract numeric answers for comparison
+        target_numeric = extract_numeric_answer(str(target))
+        pred_numeric = extract_numeric_answer(prediction)
+
         # Check exact match first
-        is_correct = target_str in pred_lower
+        is_correct = target_str in pred_lower or pred_numeric == target_numeric
 
         # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
         if not is_correct:
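The doubled backslashes and braces in this hunk are not a typo: the pattern sits inside the evaluator_code f-string, so Python collapses \\ to \ and {{ }} to { } when the template is rendered, and the generated evaluator.py ends up with exactly the same regex as the in-app version in the first hunk. A standalone sketch (not from app.py) of how that rendering works:

# How the escaped pattern inside an f-string template renders (illustrative only)
template_line = f'''match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)'''
print(template_line)
# Output: match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)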
@@ -929,12 +964,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2.
-    3. Specify the dataset split and field names
+    2. Default dataset is **GSM8K** (grade school math problems) - challenging for showing improvement!
+    3. Specify the dataset split and field names (or use other datasets like `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance!
+    7. Compare initial vs. evolved performance - expect 20-40% improvement on GSM8K!
 
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
@@ -952,34 +987,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
-                value="
-                placeholder="e.g., stanfordnlp/imdb, openai/gsm8k
+                value="gsm8k",
+                placeholder="e.g., gsm8k, stanfordnlp/imdb, openai/gsm8k",
                 info="Full dataset name from HuggingFace Hub (org/dataset-name or dataset-name)"
             )
 
             dataset_split = gr.Textbox(
                 label="Dataset Split",
-                value="
+                value="train",
                 placeholder="e.g., train, test, validation"
             )
 
             input_field = gr.Textbox(
                 label="Input Field Name",
-                value="
-                placeholder="e.g.,
+                value="question",
+                placeholder="e.g., question, text, context",
                 info="The field containing inputs to process"
             )
 
             target_field = gr.Textbox(
                 label="Target Field Name",
-                value="
-                placeholder="e.g.,
+                value="answer",
+                placeholder="e.g., answer, label, target",
                 info="The field containing expected outputs"
             )
 
             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
-                value="{input}\n\
+                value="{input}\n\nAnswer:",
                 lines=6,
                 info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )
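The new defaults line up with the GSM8K schema: each record has a question field and an answer field whose last line carries the "#### <number>" marker that extract_numeric_answer looks for. A quick manual check with the datasets library (the "main" config name here is an assumption about loading GSM8K by hand; the Space's own loader may pass different arguments):

from datasets import load_dataset

# Load GSM8K training examples to confirm the field names used as UI defaults
ds = load_dataset("openai/gsm8k", "main", split="train")
sample = ds[0]

print(sample["question"][:80])            # input_field = "question"
print(sample["answer"].splitlines()[-1])  # target_field = "answer"; last line looks like "#### <number>"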