dataops-env / task.py
Graheet
Update DataOps env: semantic contract, inference, server, and docs polish
567645a
"""Task definitions for ``dataops-gym``.
This module defines the benchmark scenarios used by the OpenEnv environment.
Each public task family keeps the hackathon-facing `easy` / `medium` / `hard`
shape while internally supporting deterministic variants so the benchmark is
broader and less gameable.
"""
from __future__ import annotations
from copy import deepcopy
from typing import Any, Dict, List, TypedDict
TableRow = Dict[str, Any]
class TaskDefinition(TypedDict):
"""Typed structure returned by task factory functions."""
initial_table: List[TableRow]
hidden_issues: List[Dict[str, Any]]
constraints: List[str]
max_steps: int
goal: str
difficulty: str
required_columns: List[str]
expected_outcome: Dict[str, Any]
variant_id: str
class HiddenIssue(TypedDict, total=False):
"""Structured hidden issue description for a task table."""
type: str
rows: List[int]
row: int
column: str
constraint: str
description: str
def _pick_variant(variant: int | None, variants: List[TaskDefinition]) -> TaskDefinition:
"""Select a deterministic task variant with a stable default."""
index = 0 if variant is None else max(0, min(len(variants) - 1, int(variant)))
selected = deepcopy(variants[index])
selected["hidden_issues"] = _with_fixable_flags(selected.get("hidden_issues", []))
return selected
def _with_fixable_flags(hidden_issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Ensure each hidden issue carries an explicit ``fixable`` flag."""
enriched: List[Dict[str, Any]] = []
for issue in hidden_issues:
issue_copy = dict(issue)
if "fixable" not in issue_copy:
issue_type = issue_copy.get("type")
# Structural conflicts usually require judgment across rows.
issue_copy["fixable"] = issue_type not in {"duplicate", "conflict", "constraint_violation"}
enriched.append(issue_copy)
return enriched
def easy_cleaning_task(variant: int | None = None) -> TaskDefinition:
"""Create an easy multi-step cleaning task with duplicates and missing data."""
variants: List[TaskDefinition] = [
{
"goal": "Clean the dataset by removing duplicates and filling missing values.",
"difficulty": "easy",
"variant_id": "easy_customer_master",
"required_columns": ["name", "city", "email"],
"expected_outcome": {
"expected_row_count": 4,
"required_non_null_columns": ["name", "city", "email"],
"unique_by": ["customer_id"],
"exactly_one_of_rows": [[2, 3]],
"validation_rules": [
"Exactly one of rows 2 or 3 should remain after deduplication.",
"No remaining row should have null values in name, city, or email.",
"All remaining customer_id values should be unique.",
],
},
"initial_table": [
{"row_id": 1, "customer_id": "C001", "name": "Alice Wong", "city": "Seattle", "email": "alice@example.com"},
{"row_id": 2, "customer_id": "C002", "name": "Ben Ortiz", "city": None, "email": "ben@example.com"},
{"row_id": 3, "customer_id": "C002", "name": "Ben Ortiz", "city": None, "email": "ben@example.com"},
{"row_id": 4, "customer_id": "C003", "name": "Carla Singh", "city": "Austin", "email": None},
{"row_id": 5, "customer_id": "C004", "name": "Drew Park", "city": "Boston", "email": "drew@example.com"},
],
"hidden_issues": [
{
"type": "duplicate",
"rows": [2, 3],
"description": "Rows 2 and 3 are duplicates and only one should remain.",
},
{
"type": "missing_value",
"row": 2,
"column": "city",
"description": "Row 2 is missing a required city value.",
},
{
"type": "missing_value",
"row": 4,
"column": "email",
"description": "Row 4 is missing a required email value.",
},
],
"constraints": [
"Keep one representative row for each real customer.",
"Do not delete rows solely because they contain missing values.",
"Name, city, and email must be populated for every remaining row.",
],
"max_steps": 7,
},
{
"goal": "Clean the dataset by removing duplicates and filling missing values.",
"difficulty": "easy",
"variant_id": "easy_vendor_onboarding",
"required_columns": ["name", "city", "email"],
"expected_outcome": {
"expected_row_count": 4,
"required_non_null_columns": ["name", "city", "email"],
"unique_by": ["vendor_id"],
"exactly_one_of_rows": [[32, 33]],
"validation_rules": [
"Exactly one of rows 32 or 33 should remain after deduplication.",
"No remaining row should have null values in name, city, or email.",
"All remaining vendor_id values should be unique.",
],
},
"initial_table": [
{"row_id": 31, "vendor_id": "V001", "name": "Northwind Foods", "city": "Denver", "email": "ops@northwind.example.com"},
{"row_id": 32, "vendor_id": "V002", "name": "Blue Harbor Ltd", "city": "Miami", "email": "contact@blueharbor.example.com"},
{"row_id": 33, "vendor_id": "V002", "name": "Blue Harbor Ltd", "city": "Miami", "email": "contact@blueharbor.example.com"},
{"row_id": 34, "vendor_id": "V003", "name": "Atlas Office Supply", "city": None, "email": "service@atlas.example.com"},
{"row_id": 35, "vendor_id": "V004", "name": "Peak Systems", "city": "Portland", "email": None},
],
"hidden_issues": [
{
"type": "duplicate",
"rows": [32, 33],
"description": "Rows 32 and 33 are duplicates and only one should remain.",
},
{
"type": "missing_value",
"row": 34,
"column": "city",
"description": "Row 34 is missing a required city value.",
},
{
"type": "missing_value",
"row": 35,
"column": "email",
"description": "Row 35 is missing a required email value.",
},
],
"constraints": [
"Keep one representative row for each real vendor.",
"Do not delete rows solely because they contain missing values.",
"Name, city, and email must be populated for every remaining row.",
],
"max_steps": 7,
},
]
return _pick_variant(variant, variants)
def medium_normalization_task(variant: int | None = None) -> TaskDefinition:
"""Create a medium multi-step normalization task with several issue types."""
variants: List[TaskDefinition] = [
{
"goal": "Normalize the dataset by fixing casing, removing duplicates, and correcting invalid email formats.",
"difficulty": "medium",
"variant_id": "medium_customer_normalization",
"required_columns": ["name", "city", "email"],
"expected_outcome": {
"expected_row_count": 5,
"required_non_null_columns": ["name", "city", "email"],
"unique_by": ["customer_id"],
"normalized_columns": {"name": "title_case", "city": "title_case"},
"format_rules": {"email": "valid_email"},
"exactly_one_of_rows": [[11, 13]],
"validation_rules": [
"Exactly one of rows 11 or 13 should remain after deduplication.",
"All remaining emails should satisfy a valid email format.",
"Names and cities should follow a consistent human-readable casing convention.",
"All remaining customer_id values should be unique.",
],
},
"initial_table": [
{"row_id": 10, "customer_id": "C100", "name": "jane miller", "city": "new york", "email": "jane.miller@example.com"},
{"row_id": 11, "customer_id": "C101", "name": "OMAR HASSAN", "city": "CHICAGO", "email": "omar.hassan[at]example.com"},
{"row_id": 12, "customer_id": "C102", "name": "Priya Nair", "city": "San Jose", "email": "priya.nair@example.com"},
{"row_id": 13, "customer_id": "C101", "name": "OMAR HASSAN", "city": "CHICAGO", "email": "omar.hassan[at]example.com"},
{"row_id": 14, "customer_id": "C103", "name": "li wei", "city": "seattle", "email": "li.wei.example.com"},
{"row_id": 15, "customer_id": "C104", "name": "Maria Gomez", "city": "Austin", "email": "maria.gomez@example.com"},
],
"hidden_issues": [
{
"type": "duplicate",
"rows": [11, 13],
"description": "Rows 11 and 13 are duplicates and only one should remain.",
},
{
"type": "inconsistent_casing",
"rows": [10, 11, 14],
"column": "name",
"description": "Rows 10, 11, and 14 contain inconsistent casing in names.",
},
{
"type": "inconsistent_casing",
"rows": [10, 11, 14],
"column": "city",
"description": "Rows 10, 11, and 14 contain inconsistent casing in cities.",
},
{
"type": "invalid_format",
"row": 11,
"column": "email",
"description": "Row 11 contains an invalid email format.",
},
{
"type": "invalid_format",
"row": 14,
"column": "email",
"description": "Row 14 contains an invalid email format.",
},
],
"constraints": [
"Preserve the original entity identity of each remaining row.",
"Normalize names and cities to a consistent human-readable casing style.",
"Only repair emails that are actually invalid.",
"Do not introduce new duplicates while normalizing values.",
],
"max_steps": 9,
},
{
"goal": "Normalize the dataset by fixing casing, removing duplicates, and correcting invalid email formats.",
"difficulty": "medium",
"variant_id": "medium_partner_directory",
"required_columns": ["name", "city", "email"],
"expected_outcome": {
"expected_row_count": 5,
"required_non_null_columns": ["name", "city", "email"],
"unique_by": ["partner_id"],
"normalized_columns": {"name": "title_case", "city": "title_case"},
"format_rules": {"email": "valid_email"},
"exactly_one_of_rows": [[41, 43]],
"validation_rules": [
"Exactly one of rows 41 or 43 should remain after deduplication.",
"All remaining emails should satisfy a valid email format.",
"Names and cities should use consistent title case.",
"All remaining partner_id values should be unique.",
],
},
"initial_table": [
{"row_id": 40, "partner_id": "P100", "name": "delta analytics", "city": "san francisco", "email": "hello@delta.example.com"},
{"row_id": 41, "partner_id": "P101", "name": "LUCIA ROMERO", "city": "MADRID", "email": "lucia.romero at example.com"},
{"row_id": 42, "partner_id": "P102", "name": "Ken Ito", "city": "Tokyo", "email": "ken.ito@example.com"},
{"row_id": 43, "partner_id": "P101", "name": "LUCIA ROMERO", "city": "MADRID", "email": "lucia.romero at example.com"},
{"row_id": 44, "partner_id": "P103", "name": "amina ali", "city": "dubai", "email": "amina.ali.example.com"},
{"row_id": 45, "partner_id": "P104", "name": "Sofia Hart", "city": "London", "email": "sofia.hart@example.com"},
],
"hidden_issues": [
{
"type": "duplicate",
"rows": [41, 43],
"description": "Rows 41 and 43 are duplicates and only one should remain.",
},
{
"type": "inconsistent_casing",
"rows": [40, 41, 44],
"column": "name",
"description": "Rows 40, 41, and 44 contain inconsistent casing in names.",
},
{
"type": "inconsistent_casing",
"rows": [40, 41, 44],
"column": "city",
"description": "Rows 40, 41, and 44 contain inconsistent casing in cities.",
},
{
"type": "invalid_format",
"row": 41,
"column": "email",
"description": "Row 41 contains an invalid email format.",
},
{
"type": "invalid_format",
"row": 44,
"column": "email",
"description": "Row 44 contains an invalid email format.",
},
],
"constraints": [
"Preserve the original entity identity of each remaining row.",
"Normalize names and cities to a consistent human-readable casing style.",
"Only repair emails that are actually invalid.",
"Do not introduce new duplicates while normalizing values.",
],
"max_steps": 9,
},
]
return _pick_variant(variant, variants)
def hard_conflict_resolution_task(variant: int | None = None) -> TaskDefinition:
"""Create a hard multi-step conflict-resolution task with deceptive records."""
variants: List[TaskDefinition] = [
{
"goal": "Resolve conflicting records, enforce unique email constraints, fix invalid formats, and preserve valid but unusual data.",
"difficulty": "hard",
"variant_id": "hard_customer_conflicts",
"required_columns": ["name", "email", "phone", "status"],
"expected_outcome": {
"expected_row_count_range": {"min": 5, "max": 6},
"unique_by": ["email"],
"format_rules": {"email": "valid_email", "phone": "normalized_phone"},
"exactly_one_of_rows": [[21, 22], [23, 24], [26, 27]],
"must_preserve_valid_rows": [25, 28],
"validation_rules": [
"Exactly one of rows 21 or 22 should remain after deduplication.",
"Exactly one of rows 23 or 24 should remain after conflict resolution.",
"Exactly one of rows 26 or 27 should remain after enforcing email uniqueness.",
"No two remaining rows should share the same email address.",
"All remaining emails should satisfy a valid email format.",
"All remaining phone values should be normalized to a consistent valid format.",
"Rows 25 and 28 should remain logically unchanged because they are valid trap rows.",
],
},
"initial_table": [
{"row_id": 21, "customer_id": "C200", "name": "Nina Patel", "email": "nina.patel@example.com", "phone": "206-555-0101", "status": "active"},
{"row_id": 22, "customer_id": "C200", "name": "Nina Patel", "email": "nina.patel@example.com", "phone": "206-555-0101", "status": "active"},
{"row_id": 23, "customer_id": "C201", "name": "Evan Cole", "email": "evan.cole@example", "phone": "4155550102", "status": "active", "age": 250},
{"row_id": 24, "customer_id": "C201", "name": "Evan Cole", "email": "evan.cole@example.com", "phone": "(415) 555-0102", "status": "inactive", "age": 45},
{"row_id": 25, "customer_id": "C202", "name": "A. J. Brown", "email": "aj.brown@example.com", "phone": "+1-312-555-0103", "status": "active"},
{"row_id": 26, "customer_id": "C203", "name": "Marta Silva", "email": "shared@example.com", "phone": "646-555-0104", "status": "active"},
{"row_id": 27, "customer_id": "C204", "name": "Martin Silva", "email": "shared@example.com", "phone": "646-555-0105", "status": "active"},
{"row_id": 28, "customer_id": "C205", "name": "Q Xu", "email": "q.xu+vip@example.com", "phone": "917-555-0106", "status": "active"},
],
"hidden_issues": [
{
"type": "duplicate",
"rows": [21, 22],
"description": "Rows 21 and 22 are exact duplicates and only one should remain.",
},
{
"type": "conflict",
"rows": [23, 24],
"field": "age",
"values": [250, 45],
"fixable": False,
"description": "Rows 23 and 24 conflict for the same customer and must be reconciled into one trustworthy record.",
},
{
"type": "invalid_format",
"row": 23,
"column": "email",
"description": "Row 23 contains an invalid email format.",
},
{
"type": "invalid_format",
"row": 23,
"column": "phone",
"description": "Row 23 contains an invalid phone format.",
},
{
"type": "constraint_violation",
"constraint": "unique_email",
"rows": [26, 27],
"description": "Rows 26 and 27 violate the unique email constraint.",
},
{
"type": "valid_trap",
"row": 28,
"description": "Row 28 is valid even though the plus-address format may look suspicious.",
},
{
"type": "valid_trap",
"row": 25,
"description": "Row 25 is valid even though the name abbreviation may look inconsistent.",
},
],
"constraints": [
"Email values must be unique across the final table.",
"Every remaining row must represent a single coherent customer record.",
"Do not modify valid rows just because they look unusual.",
"Prefer correction and conflict resolution over unnecessary deletion.",
],
"max_steps": 14,
},
{
"goal": "Resolve conflicting records, enforce unique email constraints, fix invalid formats, and preserve valid but unusual data.",
"difficulty": "hard",
"variant_id": "hard_account_merges",
"required_columns": ["name", "email", "phone", "status"],
"expected_outcome": {
"expected_row_count_range": {"min": 5, "max": 6},
"unique_by": ["email"],
"format_rules": {"email": "valid_email", "phone": "normalized_phone"},
"exactly_one_of_rows": [[51, 52], [53, 54], [56, 57]],
"must_preserve_valid_rows": [55, 58],
"validation_rules": [
"Exactly one of rows 51 or 52 should remain after deduplication.",
"Exactly one of rows 53 or 54 should remain after conflict resolution.",
"Exactly one of rows 56 or 57 should remain after enforcing email uniqueness.",
"No two remaining rows should share the same email address.",
"All remaining emails should satisfy a valid email format.",
"All remaining phone values should be normalized to a consistent valid format.",
"Rows 55 and 58 should remain logically unchanged because they are valid trap rows.",
],
},
"initial_table": [
{"row_id": 51, "customer_id": "A900", "name": "Lena Brooks", "email": "lena.brooks@example.com", "phone": "212-555-0111", "status": "active"},
{"row_id": 52, "customer_id": "A900", "name": "Lena Brooks", "email": "lena.brooks@example.com", "phone": "212-555-0111", "status": "active"},
{"row_id": 53, "customer_id": "A901", "name": "Ravi Shah", "email": "ravi.shah example.com", "phone": "6465550112", "status": "active", "age": 250},
{"row_id": 54, "customer_id": "A901", "name": "Ravi Shah", "email": "ravi.shah@example.com", "phone": "646-555-0112", "status": "inactive", "age": 45},
{"row_id": 55, "customer_id": "A902", "name": "M. E. Klein", "email": "mek@example.com", "phone": "+1-303-555-0113", "status": "active"},
{"row_id": 56, "customer_id": "A903", "name": "Sana Noor", "email": "ops@example.com", "phone": "718-555-0114", "status": "active"},
{"row_id": 57, "customer_id": "A904", "name": "Sana N.", "email": "ops@example.com", "phone": "718-555-0115", "status": "active"},
{"row_id": 58, "customer_id": "A905", "name": "Bo Li", "email": "bo.li+archive@example.com", "phone": "415-555-0116", "status": "active"},
],
"hidden_issues": [
{
"type": "duplicate",
"rows": [51, 52],
"description": "Rows 51 and 52 are exact duplicates and only one should remain.",
},
{
"type": "conflict",
"rows": [53, 54],
"field": "age",
"values": [250, 45],
"fixable": False,
"description": "Rows 53 and 54 conflict for the same customer and must be reconciled into one trustworthy record.",
},
{
"type": "invalid_format",
"row": 53,
"column": "email",
"description": "Row 53 contains an invalid email format.",
},
{
"type": "invalid_format",
"row": 53,
"column": "phone",
"description": "Row 53 contains an invalid phone format.",
},
{
"type": "constraint_violation",
"constraint": "unique_email",
"rows": [56, 57],
"description": "Rows 56 and 57 violate the unique email constraint.",
},
{
"type": "valid_trap",
"row": 55,
"description": "Row 55 is valid even though the abbreviated name may look unusual.",
},
{
"type": "valid_trap",
"row": 58,
"description": "Row 58 is valid even though the plus-address format may look suspicious.",
},
],
"constraints": [
"Email values must be unique across the final table.",
"Every remaining row must represent a single coherent customer record.",
"Do not modify valid rows just because they look unusual.",
"Prefer correction and conflict resolution over unnecessary deletion.",
],
"max_steps": 14,
},
]
return _pick_variant(variant, variants)
easy_cleaning_task.variant_count = 2
medium_normalization_task.variant_count = 2
hard_conflict_resolution_task.variant_count = 2