Upload app.py
app.py CHANGED
@@ -226,67 +226,33 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             max_tokens=500,
         )
 
-        prediction = response.choices[0].message.content.strip()
-
-        #
-            try:
-                # Try to convert both to floats and compare
-                num1 = float(str1.replace(',', ''))
-                num2 = float(str2.replace(',', ''))
-                # Use small epsilon for float comparison
-                return abs(num1 - num2) < 1e-6
-            except (ValueError, AttributeError):
-                # If conversion fails, do string comparison
-                return str1.lower().strip() == str2.lower().strip()
-
-        # Extract answers
-        target_answer = extract_answer(target_str)
-        pred_answer = extract_answer(pred_str)
-
-        # Check if answers match mathematically or textually
-        is_correct = is_mathematically_equal(target_answer, pred_answer)
-
-        # Fallback: check for semantic equivalents for sentiment analysis
-        if not is_correct:
-            target_lower = target_answer.lower()
-            pred_lower = pred_answer.lower()
-
-            # Sentiment mappings with expanded synonyms
-            positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
-                              "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
-                              "praise", "favorable", "approve"]
-            negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
-                              "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
-                              "critique", "condemn", "sarcasm"]
-
-            if target_lower in ["1", "positive", "pos"]:
-                is_correct = any(word in pred_lower for word in positive_words)
-            elif target_lower in ["0", "negative", "neg"]:
-                is_correct = any(word in pred_lower for word in negative_words)
+        prediction = response.choices[0].message.content.strip().lower()
+
+        # IMDB labels: 0 = negative, 1 = positive
+        true_label = int(target)  # 0 or 1
+
+        # Check for sentiment classification in first 100 chars (to avoid long explanations)
+        pred_start = prediction[:100]
+
+        # Look for clear positive/negative indicators
+        has_positive = ("positive" in pred_start and "sentiment" in pred_start) or \
+                       ("this is positive" in pred_start) or \
+                       ("sentiment: positive" in pred_start)
+
+        has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
+                       ("this is negative" in pred_start) or \
+                       ("sentiment: negative" in pred_start)
+
+        # Prediction must be unambiguous
+        if has_positive and not has_negative:
+            predicted_label = 1
+        elif has_negative and not has_positive:
+            predicted_label = 0
+        else:
+            # Ambiguous or no clear signal = wrong
+            predicted_label = -1
+
+        is_correct = (predicted_label == true_label)
 
         if is_correct:
             correct += 1

@@ -606,64 +572,33 @@ def evaluate(prompt: str) -> dict:
             max_tokens=500,
         )
 
-        prediction = response.choices[0].message.content.strip()
-
-        #
-        pred_str = prediction.strip()
-
-        import re
-
-            answer_part = parts[-1].strip()
-            answer_part = answer_part.replace(',', '')
-            return answer_part
-
-            return numbers[-1].replace(',', '')
-
-            """Check if two strings represent the same mathematical value"""
-            try:
-                num1 = float(str1.replace(',', ''))
-                num2 = float(str2.replace(',', ''))
-                return abs(num1 - num2) < 1e-6
-            except (ValueError, AttributeError):
-                return str1.lower().strip() == str2.lower().strip()
-
-        # Extract answers
-        target_answer = extract_answer(target_str)
-        pred_answer = extract_answer(pred_str)
-
-        # Check if answers match mathematically or textually
-        is_correct = is_mathematically_equal(target_answer, pred_answer)
-
-        # Fallback: check for semantic equivalents for sentiment analysis
-        if not is_correct:
-            target_lower = target_answer.lower()
-            pred_lower = pred_answer.lower()
-
-            # Sentiment mappings with expanded synonyms
-            positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
-                              "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
-                              "praise", "favorable", "approve"]
-            negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
-                              "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
-                              "critique", "condemn", "sarcasm"]
-
-            if target_lower in ["1", "positive", "pos"]:
-                is_correct = any(word in pred_lower for word in positive_words)
-            elif target_lower in ["0", "negative", "neg"]:
-                is_correct = any(word in pred_lower for word in negative_words)
+        prediction = response.choices[0].message.content.strip().lower()
+
+        # IMDB labels: 0 = negative, 1 = positive
+        true_label = int(target)  # 0 or 1
+
+        # Check for sentiment classification in first 100 chars (to avoid long explanations)
+        pred_start = prediction[:100]
+
+        # Look for clear positive/negative indicators
+        has_positive = ("positive" in pred_start and "sentiment" in pred_start) or \
+                       ("this is positive" in pred_start) or \
+                       ("sentiment: positive" in pred_start)
+
+        has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
+                       ("this is negative" in pred_start) or \
+                       ("sentiment: negative" in pred_start)
+
+        # Prediction must be unambiguous
+        if has_positive and not has_negative:
+            predicted_label = 1
+        elif has_negative and not has_positive:
+            predicted_label = 0
+        else:
+            # Ambiguous or no clear signal = wrong
+            predicted_label = -1
+
+        is_correct = (predicted_label == true_label)
 
         if is_correct:
             correct += 1
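For reference, the added scoring rule can be exercised on its own. The sketch below is not part of the commit; the helper name classify_prediction and the sample strings are made up for illustration. It copies the new keyword checks into a standalone function so the behavior is easy to verify: only replies that explicitly name the sentiment in their first 100 characters are accepted, and anything ambiguous maps to -1, which can never equal the 0/1 gold label.

# Illustrative sketch of the new matching rule (the app keeps this logic inline
# inside evaluate_prompt/evaluate rather than in a separate helper).
def classify_prediction(prediction: str) -> int:
    """Return 1 (positive), 0 (negative), or -1 when the reply is ambiguous."""
    # Mirror the diff: lowercase and only inspect the first 100 characters
    pred_start = prediction.strip().lower()[:100]

    has_positive = ("positive" in pred_start and "sentiment" in pred_start) or \
                   ("this is positive" in pred_start) or \
                   ("sentiment: positive" in pred_start)
    has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
                   ("this is negative" in pred_start) or \
                   ("sentiment: negative" in pred_start)

    if has_positive and not has_negative:
        return 1
    if has_negative and not has_positive:
        return 0
    return -1  # ambiguous or missing signal is always scored as wrong


print(classify_prediction("Sentiment: positive. The reviewer loved the pacing."))  # 1
print(classify_prediction("The sentiment of this review is negative."))            # 0
print(classify_prediction("Great movie, would watch again!"))                      # -1 (scored as wrong)

Compared with the removed logic, which fell back to matching any synonym from the positive_words/negative_words lists anywhere in the answer, this stricter check demands an explicit sentiment statement near the start of the reply.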