# DataAnalysis_Env/tasks/task_hard.py
# Last commit 19b4563 (Mohammed-Altaf): changes upper and lower bounds for inference grading
import re
import pandas as pd
from tasks.base_task import BaseTask
class RepeatCustomerCohortTask(BaseTask):
    """Hard task: find customers who ordered in both January and December.

    The agent must identify customers present in both months, count them,
    and compare their average order value to all other customers.

    Months are matched on the calendar month of ``order_date`` only; the
    year is not taken into account.
    """

    # Partial-credit weights for the three graded fields (sum to 1.0
    # before the final clamp).
    _COUNT_WEIGHT = 0.33
    _COHORT_AOV_WEIGHT = 0.33
    _OTHER_AOV_WEIGHT = 0.34

    # Relative tolerance (+/-0.5%) applied to both AOV fields.
    _AOV_TOLERANCE = 0.005

    # Every returned score is clamped into [_MIN_SCORE, _MAX_SCORE].
    _MIN_SCORE = 0.05
    _MAX_SCORE = 0.95

    # A money amount: either comma-grouped thousands ("1,234.56") or a plain
    # decimal ("122.94", "122.", ".5"). The pattern stops before a trailing
    # sentence period or a field-separating comma, so "122.94." and "126, "
    # parse as 122.94 and 126 instead of failing float().
    _AMOUNT = r"(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d*)?|\.\d+)"

    _COUNT_RE = re.compile(r"Cohort:\s*(\d+)\s*customers?", re.IGNORECASE)
    _COHORT_AOV_RE = re.compile(
        r"Cohort\s+AOV:\s*\$?\s*" + _AMOUNT, re.IGNORECASE
    )
    _OTHER_AOV_RE = re.compile(
        r"Other\s+AOV:\s*\$?\s*" + _AMOUNT, re.IGNORECASE
    )

    @property
    def task_id(self) -> int:
        """Numeric identifier of this task."""
        return 3

    @property
    def difficulty(self) -> str:
        """Difficulty label of this task."""
        return "hard"

    @property
    def description(self) -> str:
        """Prompt shown to the agent, including the required answer format."""
        return (
            "How many unique customers placed orders in BOTH January and December? "
            "What is their average order value compared to all other customers? "
            "Submit your answer in the format: "
            "'Cohort: N customers, Cohort AOV: $X.XX, Other AOV: $X.XX'"
        )

    def _compute_cohort(self) -> tuple[set, float, float]:
        """Compute the cohort of customers ordering in both January and December.

        The average order value (AOV) of a group is the mean of
        ``total_price`` over all rows belonging to that group's customers.

        Returns:
            A tuple of (cohort_customer_ids, cohort_aov, other_aov), with both
            averages rounded to 2 decimals. An average is NaN when its group
            has no rows.
        """
        df = self.df
        # Parse the dates into a standalone Series so the task's DataFrame is
        # neither mutated nor copied in full.
        months = pd.to_datetime(df["order_date"]).dt.month

        jan_customers = set(df.loc[months == 1, "customer_id"])
        dec_customers = set(df.loc[months == 12, "customer_id"])
        cohort = jan_customers & dec_customers

        in_cohort = df["customer_id"].isin(cohort)
        cohort_aov = df.loc[in_cohort, "total_price"].mean()
        other_aov = df.loc[~in_cohort, "total_price"].mean()
        return cohort, round(float(cohort_aov), 2), round(float(other_aov), 2)

    def expected_answer(self) -> str:
        """Compute the expected cohort analysis answer.

        Returns:
            Formatted string like
            'Cohort: 57 customers, Cohort AOV: $126.57, Other AOV: $122.94'.
            Amounts always carry exactly two decimals, matching the
            '$X.XX' format requested in the task description.
        """
        cohort, cohort_aov, other_aov = self._compute_cohort()
        return (
            f"Cohort: {len(cohort)} customers, "
            f"Cohort AOV: ${cohort_aov:.2f}, Other AOV: ${other_aov:.2f}"
        )

    def _aov_within_tolerance(self, pattern, answer: str, expected: float) -> bool:
        """Return True if the amount found by ``pattern`` is close to ``expected``.

        Args:
            pattern: Compiled regex whose group 1 captures the amount.
            answer: The agent's submitted answer string.
            expected: The reference value for this field.

        Returns:
            True when an amount is found and lies within the relative
            tolerance of ``expected``; False otherwise (including when
            ``expected`` is NaN, since every comparison with NaN is False).
        """
        match = pattern.search(answer)
        if match is None:
            return False
        # Drop thousands separators; the pattern guarantees the remainder
        # is a valid float literal.
        submitted = float(match.group(1).replace(",", ""))
        # abs() keeps the tolerance non-negative even for a negative expected.
        tolerance = abs(expected) * self._AOV_TOLERANCE
        return abs(submitted - expected) <= tolerance

    def grade(self, answer: str) -> float:
        """Grade the answer with partial credit for each of the three fields.

        Scoring (before clamping):
            - 0.33 for correct customer count (exact match)
            - 0.33 for cohort AOV within +/-0.5% of expected
            - 0.34 for other AOV within +/-0.5% of expected

        Field labels are matched case-insensitively, the '$' sign is
        optional, and amounts may use comma thousands separators.

        Args:
            answer: The agent's submitted answer string. ``None`` is treated
                as an empty answer.

        Returns:
            A score clamped to the range 0.05 to 0.95: an answer with no
            correct field scores 0.05 and a fully correct answer scores 0.95.
        """
        cohort, expected_cohort_aov, expected_other_aov = self._compute_cohort()
        if answer is None:
            # A missing submission earns the minimum score instead of
            # crashing the regex search.
            answer = ""

        score = 0.0

        # Check customer count
        count_match = self._COUNT_RE.search(answer)
        if count_match and int(count_match.group(1)) == len(cohort):
            score += self._COUNT_WEIGHT

        # Check cohort AOV
        if self._aov_within_tolerance(
            self._COHORT_AOV_RE, answer, expected_cohort_aov
        ):
            score += self._COHORT_AOV_WEIGHT

        # Check other AOV
        if self._aov_within_tolerance(
            self._OTHER_AOV_RE, answer, expected_other_aov
        ):
            score += self._OTHER_AOV_WEIGHT

        return max(self._MIN_SCORE, min(self._MAX_SCORE, score))