Spaces:

Mohammed-Altaf
/

DataAnalysis_Env

Sleeping

App Files Files Community

DataAnalysis_Env / tasks /task_hard.py

Mohammed-Altaf

changes upper and lower bounds for inference grading

19b4563 about 1 month ago

raw

history blame contribute delete

3.73 kB

	import re

	import pandas as pd

	from tasks.base_task import BaseTask


	class RepeatCustomerCohortTask(BaseTask):
	    """Hard task: find customers who ordered in both January and December.

	    The agent must identify customers present in both months, count them,
	    and compare their average order value to all other customers.

	    Orders are matched on calendar month only (the year is not considered),
	    and "average order value" is the mean of the ``total_price`` column over
	    the matching rows of ``self.df``.
	    """

	    # Credit awarded per correctly answered field (sums to 1.0 before clamping).
	    _COUNT_WEIGHT = 0.33
	    _COHORT_AOV_WEIGHT = 0.33
	    _OTHER_AOV_WEIGHT = 0.34

	    # Relative tolerance applied to both AOV fields (0.005 == ±0.5%).
	    _AOV_TOLERANCE = 0.005

	    # The final score is clamped into this range.
	    _MIN_SCORE = 0.05
	    _MAX_SCORE = 0.95

	    # Matches "126.57", "126", ".5" or "1,234.56". The match always ends on a
	    # digit, so sentence punctuation after the number ("$122.94.") is not
	    # captured and cannot break float parsing.
	    _NUMBER = r"([\d,]*\.?\d+)"
	    _COUNT_PATTERN = re.compile(r"Cohort:\s*(\d+)\s+customers?", re.IGNORECASE)
	    _COHORT_AOV_PATTERN = re.compile(
	        r"Cohort\s+AOV:\s*\$?\s*" + _NUMBER, re.IGNORECASE
	    )
	    _OTHER_AOV_PATTERN = re.compile(
	        r"Other\s+AOV:\s*\$?\s*" + _NUMBER, re.IGNORECASE
	    )

	    @property
	    def task_id(self) -> int:
	        """Numeric identifier of this task."""
	        return 3

	    @property
	    def difficulty(self) -> str:
	        """Difficulty label of this task."""
	        return "hard"

	    @property
	    def description(self) -> str:
	        """Prompt shown to the agent, including the required answer format."""
	        return (
	            "How many unique customers placed orders in BOTH January and December? "
	            "What is their average order value compared to all other customers? "
	            "Submit your answer in the format: "
	            "'Cohort: N customers, Cohort AOV: $X.XX, Other AOV: $X.XX'"
	        )

	    def _compute_cohort(self) -> tuple[set, float, float]:
	        """Compute the cohort of customers ordering in both January and December.

	        ``self.df`` is read but never modified.

	        Returns:
	            A tuple of (cohort_customer_ids, cohort_aov, other_aov). Both AOVs
	            are rounded to 2 decimals. An AOV is NaN when its group has no
	            rows (e.g. an empty cohort).
	        """
	        df = self.df
	        # Parse the dates into a standalone Series so the frame itself does not
	        # have to be copied or mutated.
	        months = pd.to_datetime(df["order_date"]).dt.month
	        jan_customers = set(df.loc[months == 1, "customer_id"])
	        dec_customers = set(df.loc[months == 12, "customer_id"])
	        cohort = jan_customers & dec_customers

	        in_cohort = df["customer_id"].isin(cohort)
	        cohort_aov = df.loc[in_cohort, "total_price"].mean()
	        other_aov = df.loc[~in_cohort, "total_price"].mean()
	        return cohort, round(cohort_aov, 2), round(other_aov, 2)

	    def expected_answer(self) -> str:
	        """Compute the expected cohort analysis answer.

	        Returns:
	            Formatted string like
	            'Cohort: 57 customers, Cohort AOV: $126.57, Other AOV: $122.94'.
	            AOVs are always printed with exactly two decimals.
	        """
	        cohort, cohort_aov, other_aov = self._compute_cohort()
	        # ``:.2f`` keeps trailing zeros ("$100.50"), matching the "$X.XX" format
	        # requested in the task description.
	        return (
	            f"Cohort: {len(cohort)} customers, "
	            f"Cohort AOV: ${cohort_aov:.2f}, Other AOV: ${other_aov:.2f}"
	        )

	    @staticmethod
	    def _aov_matches(pattern, answer: str, expected: float, tolerance: float) -> bool:
	        """Return True when the AOV found by *pattern* is within tolerance.

	        Args:
	            pattern: Compiled regex whose group 1 captures the submitted number.
	            answer: The agent's submitted answer string.
	            expected: The reference AOV.
	            tolerance: Allowed relative deviation (0.005 means ±0.5%).
	        """
	        found = pattern.search(answer)
	        # A NaN reference (empty group) can never be matched by a number.
	        if found is None or pd.isna(expected):
	            return False
	        # The capture is digits/commas with at most one dot and ends on a digit,
	        # so after dropping thousands separators float() cannot fail.
	        submitted = float(found.group(1).replace(",", ""))
	        # abs() keeps the tolerance band non-negative for negative references.
	        return abs(submitted - expected) <= abs(expected) * tolerance

	    def grade(self, answer: str) -> float:
	        """Grade the answer with partial credit for each of the three fields.

	        Scoring:
	            - 0.33 for correct customer count (exact match)
	            - 0.33 for cohort AOV within ±0.5% of expected
	            - 0.34 for other AOV within ±0.5% of expected

	        Field labels are matched case-insensitively; the ``$`` sign is
	        optional, and thousands separators or trailing punctuation after a
	        number are tolerated.

	        Args:
	            answer: The agent's submitted answer string. A non-string value
	                is graded like an empty answer.

	        Returns:
	            The summed credit clamped to the range [0.05, 0.95], so a fully
	            wrong answer scores 0.05 and a fully correct one scores 0.95.
	        """
	        text = answer if isinstance(answer, str) else ""
	        cohort, expected_cohort_aov, expected_other_aov = self._compute_cohort()
	        score = 0.0

	        # Check customer count
	        count_match = self._COUNT_PATTERN.search(text)
	        if count_match and int(count_match.group(1)) == len(cohort):
	            score += self._COUNT_WEIGHT

	        # Check cohort AOV
	        if self._aov_matches(
	            self._COHORT_AOV_PATTERN, text, expected_cohort_aov, self._AOV_TOLERANCE
	        ):
	            score += self._COHORT_AOV_WEIGHT

	        # Check other AOV
	        if self._aov_matches(
	            self._OTHER_AOV_PATTERN, text, expected_other_aov, self._AOV_TOLERANCE
	        ):
	            score += self._OTHER_AOV_WEIGHT

	        return max(self._MIN_SCORE, min(self._MAX_SCORE, score))