codelion committed on
Commit
359701c
·
verified ·
1 Parent(s): 9e6170e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -32
app.py CHANGED
@@ -226,30 +226,22 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
226
  max_tokens=500,
227
  )
228
 
229
- prediction = response.choices[0].message.content.strip().lower()
230
 
231
  # IMDB labels: 0 = negative, 1 = positive
232
  true_label = int(target) # 0 or 1
233
 
234
- # Check for sentiment classification in first 100 chars (to avoid long explanations)
235
- pred_start = prediction[:100]
 
236
 
237
- # Look for clear positive/negative indicators
238
- has_positive = ("positive" in pred_start and "sentiment" in pred_start) or \
239
- ("this is positive" in pred_start) or \
240
- ("sentiment: positive" in pred_start)
241
-
242
- has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
243
- ("this is negative" in pred_start) or \
244
- ("sentiment: negative" in pred_start)
245
-
246
- # Prediction must be unambiguous
247
- if has_positive and not has_negative:
248
  predicted_label = 1
249
- elif has_negative and not has_positive:
250
  predicted_label = 0
251
  else:
252
- # Ambiguous or no clear signal = wrong
253
  predicted_label = -1
254
 
255
  is_correct = (predicted_label == true_label)
@@ -572,30 +564,22 @@ def evaluate(prompt: str) -> dict:
572
  max_tokens=500,
573
  )
574
 
575
- prediction = response.choices[0].message.content.strip().lower()
576
 
577
  # IMDB labels: 0 = negative, 1 = positive
578
  true_label = int(target) # 0 or 1
579
 
580
- # Check for sentiment classification in first 100 chars (to avoid long explanations)
581
- pred_start = prediction[:100]
582
-
583
- # Look for clear positive/negative indicators
584
- has_positive = ("positive" in pred_start and "sentiment" in pred_start) or \
585
- ("this is positive" in pred_start) or \
586
- ("sentiment: positive" in pred_start)
587
-
588
- has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
589
- ("this is negative" in pred_start) or \
590
- ("sentiment: negative" in pred_start)
591
 
592
- # Prediction must be unambiguous
593
- if has_positive and not has_negative:
594
  predicted_label = 1
595
- elif has_negative and not has_positive:
596
  predicted_label = 0
597
  else:
598
- # Ambiguous or no clear signal = wrong
599
  predicted_label = -1
600
 
601
  is_correct = (predicted_label == true_label)
 
226
  max_tokens=500,
227
  )
228
 
229
+ prediction = response.choices[0].message.content.strip()
230
 
231
  # IMDB labels: 0 = negative, 1 = positive
232
  true_label = int(target) # 0 or 1
233
 
234
+ # STRICT FORMAT REQUIREMENT: Must start with exactly "Sentiment: positive" or "Sentiment: negative"
235
+ # This teaches evolution to add proper format instructions
236
+ pred_lower = prediction.lower()
237
 
238
+ # Check if response starts with the exact format (allow some whitespace)
239
+ if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
 
 
 
 
 
 
 
 
 
240
  predicted_label = 1
241
+ elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
242
  predicted_label = 0
243
  else:
244
+ # Wrong format = incorrect (even if sentiment is mentioned elsewhere)
245
  predicted_label = -1
246
 
247
  is_correct = (predicted_label == true_label)
 
564
  max_tokens=500,
565
  )
566
 
567
+ prediction = response.choices[0].message.content.strip()
568
 
569
  # IMDB labels: 0 = negative, 1 = positive
570
  true_label = int(target) # 0 or 1
571
 
572
+ # STRICT FORMAT REQUIREMENT: Must start with exactly "Sentiment: positive" or "Sentiment: negative"
573
+ # This teaches evolution to add proper format instructions
574
+ pred_lower = prediction.lower()
 
 
 
 
 
 
 
 
575
 
576
+ # Check if response starts with the exact format (allow some whitespace)
577
+ if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
578
  predicted_label = 1
579
+ elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
580
  predicted_label = 0
581
  else:
582
+ # Wrong format = incorrect (even if sentiment is mentioned elsewhere)
583
  predicted_label = -1
584
 
585
  is_correct = (predicted_label == true_label)