Spaces:
Sleeping
Sleeping
| """ | |
| Task definitions and automated graders for the DataDetective environment. | |
| Each task has: | |
| - id, title, difficulty, description | |
| - A grader function that scores the agent's final answer (0.0 - 1.0) | |
| based on whether key findings are mentioned. | |
| """ | |
| import re | |
| from typing import Callable | |
def _has_any(text: str, keywords: list[str]) -> bool:
    """Return True when *text* contains at least one of *keywords*.

    Matching is a plain, case-insensitive substring test, so a keyword also
    matches when it appears inside a longer word (``"promo"`` matches
    ``"promotion"``).
    """
    haystack = text.lower()
    for keyword in keywords:
        if keyword.lower() in haystack:
            return True
    return False
def _has_pattern(text: str, pattern: str) -> bool:
    """Return True when the regex *pattern* matches anywhere in *text*.

    The search is case-insensitive (``re.IGNORECASE``).
    """
    match = re.search(pattern, text, flags=re.IGNORECASE)
    return match is not None
def _grade_orders_drop(answer: str) -> float:
    """Score a final answer for the ``orders_drop`` task.

    Five independent checks are worth 0.20 each:

    1. the answer says order volume went down;
    2. it mentions a promotion / sale / discount / campaign;
    3. it says something ended, or cites March 1;
    4. it uses causal or linking language;
    5. it quantifies the change (order counts, a percentage, or "from N to M").

    Args:
        answer: The agent's free-text final answer.

    Returns:
        A score between 0.0 and 1.0.
    """
    score = 0.0

    if _has_any(answer, ["drop", "decrease", "decline", "fell", "fewer", "reduction", "lower"]):
        score += 0.20

    # The named-sale phrases all contain "sale"; they are kept for readability.
    if _has_any(answer, [
        "spring mega sale", "spring sale", "mega sale",
        "promotion", "promo", "sale", "discount", "campaign",
    ]):
        score += 0.20

    # Whole-word match: as bare substrings "over" fired on "overall" and
    # "discovered", and "ended" fired on "recommended" and "extended".
    # The lookahead keeps "March 1" / "March 01" from matching March 10-19.
    if _has_pattern(answer, r"\b(ended|expired|over|concluded|stopped)\b") or _has_pattern(
        answer, r"march\s*0?1(?!\d)"
    ):
        score += 0.20

    if _has_any(answer, [
        "caused", "because", "due to", "result of", "led to",
        "when the", "after the", "ending of", "end of the",
        "correlated", "explains",
    ]):
        score += 0.20

    if (
        _has_pattern(answer, r"\d+\s*(orders|transactions)")
        or _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"from\s+\d+.*to\s+\d+")
    ):
        score += 0.20

    return min(score, 1.0)
def _grade_returns_spike(answer: str) -> float:
    """Score a final answer for the ``returns_spike`` task.

    Five independent checks are worth 0.20 each:

    1. the headphones product is named;
    2. the West region is named;
    3. the AudioTech supplier is named;
    4. a defect / quality / abnormal-return explanation is given;
    5. the finding is quantified (percentage, unit counts) or compared.

    Args:
        answer: The agent's free-text final answer.

    Returns:
        A score between 0.0 and 1.0.
    """
    score = 0.0

    if _has_any(answer, ["wireless headphones", "headphones pro", "headphone"]):
        score += 0.20

    # Whole-word match: a bare substring test for "west" also gave the region
    # credit to "Midwest", "Northwest", "Southwest" and even "lowest".
    if _has_pattern(answer, r"\bwest(ern)?\b"):
        score += 0.20

    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20

    if _has_any(answer, [
        "defect", "defective", "faulty", "quality",
        "high return", "return rate", "abnormal",
        "stopped working", "battery issue", "poor audio",
    ]):
        score += 0.20

    if (
        _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"\d+\s*(returns|returned|units)")
        or _has_any(answer, ["return rate", "compared to"])
    ):
        score += 0.20

    return min(score, 1.0)
def _grade_customer_churn(answer: str) -> float:
    """Score a final answer for the ``customer_churn`` task.

    Five independent checks are worth 0.20 each:

    1. the decline is stated or quantified;
    2. the Enterprise segment is named;
    3. the Northeast region is named;
    4. a price change is given as a cause;
    5. an affected product is identified, by name or by product id
       (1, 2, 11, 15 or 19).

    Args:
        answer: The agent's free-text final answer.

    Returns:
        A score between 0.0 and 1.0.
    """
    score = 0.0

    if _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "decrease", "drop", "churn", "fewer active",
        "lost customers", "stopped ordering",
    ]):
        score += 0.20

    if _has_any(answer, ["enterprise"]):
        score += 0.20

    if _has_any(answer, ["northeast", "north east", "north-east"]):
        score += 0.20

    if _has_any(answer, [
        "price increase", "price change", "price hike", "pricing",
        "more expensive", "raised price", "cost increase",
    ]):
        score += 0.20

    # The id must directly follow "product" (optionally "product id",
    # "product_id =", "product #", "product_id in (") and be a whole number.
    # The previous pattern, "product.*(1|2|11|15|19)", accepted any 1 or 2
    # occurring anywhere later on the line, e.g. "products fell 12%".
    product_id_pattern = (
        r"\bproducts?(?:[ _]?ids?)?(?:\s+in)?\s*[#:=(]?\s*(?:1|2|11|15|19)\b"
    )
    if _has_any(answer, [
        "laptop pro", "desktop workstation", "office suite",
        "devtools", "external ssd",
    ]) or _has_pattern(answer, product_id_pattern):
        score += 0.20

    return min(score, 1.0)
def _grade_shipping_delay(answer: str) -> float:
    """Score a final answer for the ``shipping_delay`` task.

    Each of the five findings below that is present adds 0.20 to the score:
    the Midwest region, the QuickShip carrier, a delivery/shipping delay, a
    February start date, and support complaints tied to delivery or shipping.
    """
    names_region = _has_any(answer, ["midwest"])
    names_carrier = _has_any(answer, ["quickship", "quick ship"])
    names_delay = _has_any(answer, [
        "delivery delay", "late delivery", "delayed shipment",
        "shipping delay", "late shipment", "delivery time",
        "delayed delivery", "slow delivery",
    ])
    names_start_date = _has_pattern(answer, r"feb(ruary)?\s*(10|mid|middle)") or _has_any(answer, [
        "mid-february", "mid february", "around february",
        "starting in february", "beginning of february",
    ])
    # Support impact only counts when it is linked to delivery/shipping terms.
    mentions_support = _has_any(answer, [
        "support ticket", "complaint", "ticket volume",
        "customer satisfaction", "support request",
    ])
    mentions_logistics = _has_any(answer, [
        "delivery", "shipping", "carrier", "quickship",
    ])
    links_support_to_logistics = mentions_support and mentions_logistics

    score = 0.0
    for found in (
        names_region,
        names_carrier,
        names_delay,
        names_start_date,
        links_support_to_logistics,
    ):
        if found:
            score += 0.20
    return min(score, 1.0)
def _grade_revenue_paradox(answer: str) -> float:
    """Score a final answer for the ``revenue_paradox`` task.

    Each of the five findings below that is present adds 0.20 to the score:
    the discount/promotion effect, a product-mix shift, Enterprise customer
    loss, the cost of returns, and a quantified figure (dollars, a
    percentage, or "from X to Y").
    """
    names_sale = _has_any(answer, [
        "spring mega sale", "mega sale", "25%", "25 percent",
    ])
    ties_promo_to_margin = (
        _has_any(answer, ["promotion", "promo", "discount", "sale"])
        and _has_any(answer, ["margin", "profit", "cost"])
    )
    discount_effect = names_sale or ties_promo_to_margin

    mix_shift = _has_any(answer, [
        "product mix", "category mix", "mix shift", "shifted toward",
        "higher proportion", "more electronics", "low-margin",
        "composition changed",
    ])

    enterprise_loss = _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price increase", "price change", "price hike",
        "lost", "churn", "left", "fewer", "decline",
    ])

    returns_cost = _has_any(answer, ["return", "refund"]) and _has_any(answer, [
        "cost", "expense", "profit", "margin", "loss", "erode",
    ])

    quantified = (
        _has_pattern(answer, r"\$\s*[\d,]+")
        or _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"from\s+\$?[\d,]+.*to\s+\$?[\d,]+")
    )

    score = 0.0
    for found in (discount_effect, mix_shift, enterprise_loss, returns_cost, quantified):
        if found:
            score += 0.20
    return min(score, 1.0)
def _grade_supplier_quality(answer: str) -> float:
    """Score a final answer for the ``supplier_quality`` task.

    Each of the five findings below that is present adds 0.20 to the score:
    the AudioTech supplier, the headphones product, the Bluetooth speaker
    product, return/refund impact, and support-ticket or defect evidence.
    """
    names_supplier = _has_any(answer, ["audiotech", "audio tech"])
    names_headphones = _has_any(answer, ["wireless headphones", "headphones pro", "product 6"])
    names_speaker = _has_any(answer, ["bluetooth speaker", "product 7"])

    # Return impact: an explicit phrase, a percentage on the same line as
    # "return" (in either order), or any dollar amount.
    return_impact = (
        _has_any(answer, ["return rate", "refund", "return volume"])
        or _has_pattern(answer, r"\d+\s*%.*return")
        or _has_pattern(answer, r"return.*\d+\s*%")
        or _has_pattern(answer, r"\$\s*[\d,]+")
    )

    complaint_evidence = _has_any(answer, [
        "support ticket", "defect", "complaint", "product_defect",
        "quality issue", "customer complaint",
    ])

    score = 0.0
    for found in (
        names_supplier,
        names_headphones,
        names_speaker,
        return_impact,
        complaint_evidence,
    ):
        if found:
            score += 0.20
    return min(score, 1.0)
def _grade_inventory_stockout(answer: str) -> float:
    """Score a final answer for the ``inventory_stockout`` task.

    Five independent checks are worth 0.20 each:

    1. the West region is named;
    2. the monitor product is named;
    3. an inventory / stock problem is identified;
    4. the finding is tied to the promotion or its mid-February date;
    5. the impact is quantified (units, orders, sales, a percentage, or
       "from N to M").

    Args:
        answer: The agent's free-text final answer.

    Returns:
        A score between 0.0 and 1.0.
    """
    score = 0.0

    # Whole-word match: a bare substring test for "west" also gave the region
    # credit to "Midwest", "Northwest", "Southwest" and even "lowest".
    if _has_pattern(answer, r"\bwest(ern)?\b"):
        score += 0.20

    if _has_any(answer, ["monitor", "product 4", "monitor 27"]):
        score += 0.20

    if _has_any(answer, [
        "inventory", "stock", "out of stock", "stockout", "stock-out",
        "zero units", "no inventory", "warehouse",
    ]):
        score += 0.20

    if _has_any(answer, [
        "spring mega sale", "mega sale", "promo", "promotion",
        "february 15", "feb 15", "during the sale",
    ]):
        score += 0.20

    if (
        _has_pattern(answer, r"\d+\s*(units|orders|sales)")
        or _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"from\s+\d+.*to\s+\d+")
    ):
        score += 0.20

    return min(score, 1.0)
def _grade_fraud_detection(answer: str) -> float:
    """Score a final answer for the ``fraud_detection`` task.

    Five independent checks are worth 0.20 each:

    1. the Southeast region is named;
    2. the accounts are described as new / recently created;
    3. the orders are described as high-value (or the products are named);
    4. the scope is quantified (10-15 accounts, a dollar amount, or an
       order/transaction count);
    5. a behavioural pattern is described (clustered, coordinated, ...).

    Args:
        answer: The agent's free-text final answer.

    Returns:
        A score between 0.0 and 1.0.
    """
    score = 0.0

    if _has_any(answer, ["southeast"]):
        score += 0.20

    if _has_any(answer, [
        "new account", "recent signup", "recently created",
        "new customer", "account creation", "registered in feb",
        "signed up",
    ]):
        score += 0.20

    if _has_any(answer, [
        "high-value", "high value", "expensive", "laptop pro",
        "desktop workstation", "large order", "electronics",
    ]):
        score += 0.20

    # The lookbehind stops the 10-15 account check from matching the tail of
    # a larger number such as "215 accounts" or "110 customers".
    if (
        _has_pattern(answer, r"(?<!\d)1[0-5]\s*(account|customer|user)")
        or _has_pattern(answer, r"\$\s*[\d,]+")
        or _has_pattern(answer, r"\d+\s*(order|transaction)")
    ):
        score += 0.20

    if _has_any(answer, [
        "pattern", "cluster", "coordinated", "suspicious",
        "same product", "no return", "never returned",
        "concentrated", "anomal", "fraud ring",
    ]):
        score += 0.20

    return min(score, 1.0)
def _grade_repeat_purchase_decline(answer: str) -> float:
    """Score a final answer for the ``repeat_purchase_decline`` task.

    Each of the five findings below that is present adds 0.20 to the score:
    a repeat-purchase decline, Enterprise pricing/churn, a shipping or
    Midwest effect on repeat orders, a marketing shift between acquisition
    and retention, and a segment-level breakdown.
    """
    # 1. Repeat-purchase metric is named AND shown to have fallen.
    names_repeat_metric = _has_any(answer, [
        "repeat purchase", "repeat rate", "returning customer",
        "repeat buyer", "repurchase", "order frequency",
        "second order", "came back",
    ])
    shows_decline = _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "drop", "decrease", "fell", "collapsed",
    ])
    repeat_decline = names_repeat_metric and shows_decline

    # 2. Enterprise segment linked to pricing or churn language.
    enterprise_cause = _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price", "increase", "hike", "stopped", "left", "churn",
    ])

    # 3. Midwest / shipping linked to repeat-order language.
    names_logistics = _has_any(answer, ["midwest"]) or _has_any(answer, [
        "shipping", "delivery", "quickship",
    ])
    logistics_cause = names_logistics and _has_any(answer, [
        "repeat", "return", "reorder", "come back", "second order",
    ])

    # 4. Marketing spend linked to retention or acquisition language.
    marketing_cause = _has_any(answer, ["marketing", "acquisition", "spend"]) and _has_any(answer, [
        "retention", "email", "loyalty", "re-engage", "lapsed",
        "shifted", "new customer",
    ])

    # 5. A breakdown by segment, or a segment name followed by a percentage.
    segment_breakdown = _has_any(answer, [
        "segment", "cohort", "by region", "by segment",
        "enterprise vs", "consumer vs", "smb vs",
    ]) or _has_pattern(answer, r"(enterprise|smb|consumer).*\d+\s*%")

    score = 0.0
    for found in (
        repeat_decline,
        enterprise_cause,
        logistics_cause,
        marketing_cause,
        segment_breakdown,
    ):
        if found:
            score += 0.20
    return min(score, 1.0)
# Task catalogue, keyed by task id. Each entry carries the task's id,
# difficulty label, title, and the free-text brief in "description".
#
# NOTE(review): descriptions are plain string literals and nothing visible in
# this module applies %-formatting to them, so percent signs are written
# singly (a doubled "%%" would be shown literally). Confirm no caller formats
# these strings with the % operator.
TASKS: dict[str, dict] = {
    "orders_drop": {
        "id": "orders_drop",
        "difficulty": "easy",
        "title": "Weekly Orders Drop Investigation",
        "description": (
            "URGENT -- Our order volume dropped sharply in the first two weeks "
            "of March compared to the last two weeks of February. Leadership "
            "needs to know why.\n\n"
            "Investigate the database, identify the root cause of the drop, "
            "and submit a clear summary of your findings."
        ),
    },
    "returns_spike": {
        "id": "returns_spike",
        "difficulty": "medium",
        "title": "Product Returns Spike Investigation",
        "description": (
            "ALERT -- Our return rate has spiked significantly in recent weeks, "
            "with particular concentration in one geographic region. This is "
            "eating into margins.\n\n"
            "Use the database to identify which product(s) are driving the "
            "spike, which region is most affected, and what the likely root "
            "cause is. Include the supplier if relevant."
        ),
    },
    "customer_churn": {
        "id": "customer_churn",
        "difficulty": "hard",
        "title": "Customer Churn Root Cause Analysis",
        "description": (
            "CRITICAL -- Our monthly active customer count has declined "
            "significantly from January to March. The executive team wants a "
            "full root-cause analysis.\n\n"
            "Determine which customer segments and regions are most affected, "
            "quantify the decline, and identify the most likely causes. "
            "Check all available tables for clues."
        ),
    },
    "shipping_delay": {
        "id": "shipping_delay",
        "difficulty": "medium-hard",
        "title": "Customer Satisfaction Crisis Investigation",
        "description": (
            "ESCALATION -- Customer satisfaction scores have plummeted in one "
            "of our regions. The support team is overwhelmed with complaints "
            "and escalations are piling up.\n\n"
            "Investigate what operational issue is driving the complaints, "
            "identify the responsible party (carrier, warehouse, etc.), "
            "determine when the problem started, and quantify the impact. "
            "Cross-reference multiple data sources for a complete picture."
        ),
    },
    "revenue_paradox": {
        "id": "revenue_paradox",
        "difficulty": "hard",
        "title": "Revenue vs. Profit Paradox Investigation",
        "description": (
            "CRITICAL -- Revenue in February was our highest month ever, yet "
            "gross profit actually *decreased* compared to January. The CFO "
            "wants a full breakdown of why we are selling more but earning "
            "less.\n\n"
            "Analyze revenue, costs, margins, discounts, product mix, customer "
            "segments, and any other relevant factors. This is likely multi-"
            "causal -- identify ALL contributing factors and quantify their "
            "impact. Use the products.cost column to compute margins."
        ),
    },
    "supplier_quality": {
        "id": "supplier_quality",
        "difficulty": "medium",
        "title": "Supplier Quality Crisis Investigation",
        "description": (
            "ESCALATION -- The VP of Merchandising has received escalating "
            "complaints about product quality across multiple SKUs. Quality "
            "Assurance wants a supplier-level analysis.\n\n"
            "Determine which supplier(s) have systemic quality issues, which "
            "of their products are affected, and quantify the total business "
            "impact in returns, refunds, and support ticket volume. Include "
            "return rates by supplier to support a contract renegotiation."
        ),
    },
    "inventory_stockout": {
        "id": "inventory_stockout",
        "difficulty": "medium-hard",
        "title": "Regional Sales Underperformance Investigation",
        "description": (
            "INVESTIGATION -- Our West region was projected to be the top "
            "performer during the Spring Mega Sale based on historical trends "
            "and marketing investment, but actual sales came in significantly "
            "below the other regions.\n\n"
            "The Regional VP demands an explanation. Investigate what caused "
            "the West to underperform during our biggest promotional event. "
            "Check product-level sales, inventory data, and any operational "
            "issues that may have limited fulfillment."
        ),
    },
    "fraud_detection": {
        "id": "fraud_detection",
        "difficulty": "hard",
        "title": "Suspicious Order Pattern Investigation",
        "description": (
            "ALERT -- The Finance team has flagged a suspicious spike in "
            "high-value orders from recently created accounts. Several of "
            "these orders have already shipped.\n\n"
            "Investigate the pattern: identify the suspicious accounts, "
            "determine the scope of potential fraud, estimate the financial "
            "exposure, and describe the behavioral signatures that "
            "distinguish these accounts from legitimate customers. Look at "
            "signup dates, order values, product choices, and geographic "
            "concentration."
        ),
    },
    "repeat_purchase_decline": {
        "id": "repeat_purchase_decline",
        "difficulty": "hard",
        "title": "Customer Retention Crisis Investigation",
        "description": (
            "CRITICAL -- Monthly unique buyer count has held steady around "
            "100, but the Customer Success team reports that repeat purchase "
            "rates have collapsed. In January, roughly 40% of orders came "
            "from returning customers; by March, it appears to be under 20%."
            "\n\n"
            "The CEO asks: are we becoming a one-time-purchase business? "
            "Diagnose which customer segments and regions lost repeat buyers, "
            "identify the root causes, and determine whether our marketing "
            "spend strategy is masking a retention problem. Check the "
            "marketing_spend table for clues about acquisition vs. retention "
            "investment."
        ),
    },
}
# Grader lookup table: task id -> function that scores a final answer for
# that task. The keys are the same ids used in TASKS.
_GRADERS: dict[str, Callable[[str], float]] = dict(
    orders_drop=_grade_orders_drop,
    returns_spike=_grade_returns_spike,
    customer_churn=_grade_customer_churn,
    shipping_delay=_grade_shipping_delay,
    revenue_paradox=_grade_revenue_paradox,
    supplier_quality=_grade_supplier_quality,
    inventory_stockout=_grade_inventory_stockout,
    fraud_detection=_grade_fraud_detection,
    repeat_purchase_decline=_grade_repeat_purchase_decline,
)
def grade_answer(task_id: str, answer: str) -> float:
    """Grade *answer* for the task *task_id*.

    The task's grader is looked up in ``_GRADERS`` and its raw score is
    clamped to the range 0.05-0.95, so the result is never exactly 0 or 1.

    Args:
        task_id: Identifier of the task being graded.
        answer: The agent's free-text final answer.

    Returns:
        The clamped score, or 0.05 when *task_id* has no registered grader.
    """
    floor, ceiling = 0.05, 0.95
    try:
        grader = _GRADERS[task_id]
    except KeyError:
        # Unknown task: fall back to the minimum score instead of raising.
        return floor
    capped = min(ceiling, grader(answer))
    return max(floor, capped)