Spaces:

algorithmicsuperintelligence
/

prompt-optimizer

Running

App Files Files Community

codelion committed about 1 month ago

Commit

e99c626

verified ·

1 Parent(s): 359701c

Upload app.py

Browse files

Files changed (1) hide show

app.py +26 -12

app.py CHANGED Viewed

@@ -231,17 +231,24 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
                 # IMDB labels: 0 = negative, 1 = positive
                 true_label = int(target)  # 0 or 1
-                # STRICT FORMAT REQUIREMENT: Must start with exactly "Sentiment: positive" or "Sentiment: negative"
-                # This teaches evolution to add proper format instructions
                 pred_lower = prediction.lower()
-                # Check if response starts with the exact format (allow some whitespace)
-                if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
                     predicted_label = 1
-                elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
                     predicted_label = 0
                 else:
-                    # Wrong format = incorrect (even if sentiment is mentioned elsewhere)
                     predicted_label = -1
                 is_correct = (predicted_label == true_label)
@@ -569,17 +576,24 @@ def evaluate(prompt: str) -> dict:
                 # IMDB labels: 0 = negative, 1 = positive
                 true_label = int(target)  # 0 or 1
-                # STRICT FORMAT REQUIREMENT: Must start with exactly "Sentiment: positive" or "Sentiment: negative"
-                # This teaches evolution to add proper format instructions
                 pred_lower = prediction.lower()
-                # Check if response starts with the exact format (allow some whitespace)
-                if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
                     predicted_label = 1
-                elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
                     predicted_label = 0
                 else:
-                    # Wrong format = incorrect (even if sentiment is mentioned elsewhere)
                     predicted_label = -1
                 is_correct = (predicted_label == true_label)

                 # IMDB labels: 0 = negative, 1 = positive
                 true_label = int(target)  # 0 or 1
+                # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
+                # This is strict enough to fail conversational responses, but learnable through evolution
                 pred_lower = prediction.lower()
+                pred_start = pred_lower[:150]  # First 150 chars
+                # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
+                has_sentiment_keyword = "sentiment" in pred_start
+                # Check for positive/negative indicators
+                has_positive = "positive" in pred_start
+                has_negative = "negative" in pred_start
+                # Only count as correct if sentiment keyword present AND unambiguous positive/negative
+                if has_sentiment_keyword and has_positive and not has_negative:
                     predicted_label = 1
+                elif has_sentiment_keyword and has_negative and not has_positive:
                     predicted_label = 0
                 else:
                     predicted_label = -1
                 is_correct = (predicted_label == true_label)
                 # IMDB labels: 0 = negative, 1 = positive
                 true_label = int(target)  # 0 or 1
+                # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
+                # This is strict enough to fail conversational responses, but learnable through evolution
                 pred_lower = prediction.lower()
+                pred_start = pred_lower[:150]  # First 150 chars
+                # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
+                has_sentiment_keyword = "sentiment" in pred_start
+                # Check for positive/negative indicators
+                has_positive = "positive" in pred_start
+                has_negative = "negative" in pred_start
+                # Only count as correct if sentiment keyword present AND unambiguous positive/negative
+                if has_sentiment_keyword and has_positive and not has_negative:
                     predicted_label = 1
+                elif has_sentiment_keyword and has_negative and not has_positive:
                     predicted_label = 0
                 else:
                     predicted_label = -1
                 is_correct = (predicted_label == true_label)