codelion committed on
Commit
e99c626
·
verified ·
1 Parent(s): 359701c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -12
app.py CHANGED
@@ -231,17 +231,24 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
231
  # IMDB labels: 0 = negative, 1 = positive
232
  true_label = int(target) # 0 or 1
233
 
234
- # STRICT FORMAT REQUIREMENT: Must start with exactly "Sentiment: positive" or "Sentiment: negative"
235
- # This teaches evolution to add proper format instructions
236
  pred_lower = prediction.lower()
 
237
 
238
- # Check if response starts with the exact format (allow some whitespace)
239
- if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
 
 
 
 
 
 
 
240
  predicted_label = 1
241
- elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
242
  predicted_label = 0
243
  else:
244
- # Wrong format = incorrect (even if sentiment is mentioned elsewhere)
245
  predicted_label = -1
246
 
247
  is_correct = (predicted_label == true_label)
@@ -569,17 +576,24 @@ def evaluate(prompt: str) -> dict:
569
  # IMDB labels: 0 = negative, 1 = positive
570
  true_label = int(target) # 0 or 1
571
 
572
- # STRICT FORMAT REQUIREMENT: Must start with exactly "Sentiment: positive" or "Sentiment: negative"
573
- # This teaches evolution to add proper format instructions
574
  pred_lower = prediction.lower()
 
 
 
 
 
 
 
 
575
 
576
- # Check if response starts with the exact format (allow some whitespace)
577
- if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
578
  predicted_label = 1
579
- elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
580
  predicted_label = 0
581
  else:
582
- # Wrong format = incorrect (even if sentiment is mentioned elsewhere)
583
  predicted_label = -1
584
 
585
  is_correct = (predicted_label == true_label)
 
231
  # IMDB labels: 0 = negative, 1 = positive
232
  true_label = int(target) # 0 or 1
233
 
234
+ # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
235
+ # This is strict enough to fail conversational responses, but learnable through evolution
236
  pred_lower = prediction.lower()
237
+ pred_start = pred_lower[:150] # First 150 chars
238
 
239
+ # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
240
+ has_sentiment_keyword = "sentiment" in pred_start
241
+
242
+ # Check for positive/negative indicators
243
+ has_positive = "positive" in pred_start
244
+ has_negative = "negative" in pred_start
245
+
246
+ # Only count as correct if sentiment keyword present AND unambiguous positive/negative
247
+ if has_sentiment_keyword and has_positive and not has_negative:
248
  predicted_label = 1
249
+ elif has_sentiment_keyword and has_negative and not has_positive:
250
  predicted_label = 0
251
  else:
 
252
  predicted_label = -1
253
 
254
  is_correct = (predicted_label == true_label)
 
576
  # IMDB labels: 0 = negative, 1 = positive
577
  true_label = int(target) # 0 or 1
578
 
579
+ # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
580
+ # This is strict enough to fail conversational responses, but learnable through evolution
581
  pred_lower = prediction.lower()
582
+ pred_start = pred_lower[:150] # First 150 chars
583
+
584
+ # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
585
+ has_sentiment_keyword = "sentiment" in pred_start
586
+
587
+ # Check for positive/negative indicators
588
+ has_positive = "positive" in pred_start
589
+ has_negative = "negative" in pred_start
590
 
591
+ # Only count as correct if sentiment keyword present AND unambiguous positive/negative
592
+ if has_sentiment_keyword and has_positive and not has_negative:
593
  predicted_label = 1
594
+ elif has_sentiment_keyword and has_negative and not has_positive:
595
  predicted_label = 0
596
  else:
 
597
  predicted_label = -1
598
 
599
  is_correct = (predicted_label == true_label)