sql-migration-env / seeds.py
Eishaan's picture
fixed metrics
874e22a
"""
Deterministic Adversarial Seed Data Engine.
> **Hackathon Judges Note:**
> This is not generic dummy data. Our seeds specifically inject malicious
> real-world SQL edge cases to pressure-test frontier LLM logic:
> - **O'Brien (Task 1):** Tests if the agent uses proper parameterization/escaping.
> - **Duplicate Emails (Task 2):** Tests `DISTINCT` vs standard `INSERT` logic.
> - **Orphaned FKs (Task 3):** Tests the agent's ability to safely `CASCADE` or audit-log invalid relations before dropping columns.
> - **NULL salary rows (Task 3):** Tests strict type constraints handling.
EVERY value in this file is a hardcoded constant. No datetime.now(),
no random(), no runtime generation. This guarantees deterministic
grader behavior across every execution.
"""
import sqlite3
from typing import Dict, List, Tuple
# =============================================================================
# TASK 1: Column Restructure (Easy)
# =============================================================================
# Agent must merge first_name + last_name into full_name without data loss.
# Adversarial: O'Brien (apostrophe), McDonald (capital mid-word).
TASK1_SOURCE_DDL = """
CREATE TABLE users (
id INTEGER PRIMARY KEY,
first_name TEXT NOT NULL,
last_name TEXT NOT NULL
);
"""
TASK1_SOURCE_DATA = [
(1, "John", "O'Brien"),
(2, "Mary", "McDonald"),
(3, "Alice", "Smith"),
(4, "Bob", "Jones"),
(5, "Carol", "White"),
]
TASK1_TARGET_DDL = """CREATE TABLE users (
id INTEGER PRIMARY KEY,
full_name TEXT NOT NULL
);"""
TASK1_EXPECTED_ROWS: List[Tuple] = [
(1, "John O'Brien"),
(2, "Mary McDonald"),
(3, "Alice Smith"),
(4, "Bob Jones"),
(5, "Carol White"),
]
def seed_task1(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 1: Column Restructure."""
conn.executescript(TASK1_SOURCE_DDL)
conn.executemany(
"INSERT INTO users (id, first_name, last_name) VALUES (?, ?, ?)",
TASK1_SOURCE_DATA,
)
conn.commit()
# =============================================================================
# TASK 2: Table Normalization (Medium)
# =============================================================================
# Agent must split flat purchases table into customers + orders with FK.
# Adversarial: alice@example.com appears 3 times (forces SELECT DISTINCT),
# "Laptop, 15-inch" has a comma (breaks naive CSV parsing).
TASK2_SOURCE_DDL = """
CREATE TABLE purchases (
purchase_id INTEGER PRIMARY KEY,
item_name TEXT NOT NULL,
price INTEGER NOT NULL,
customer_name TEXT NOT NULL,
customer_email TEXT NOT NULL
);
"""
TASK2_SOURCE_DATA = [
(1, "Laptop, 15-inch", 80000, "Alice Smith", "alice@example.com"),
(2, "Mouse", 2500, "Bob Jones", "bob@example.com"),
(3, "Keyboard", 4500, "Alice Smith", "alice@example.com"),
(4, "Monitor", 25000, "Carol White", "carol@example.com"),
(5, "Webcam", 3500, "Alice Smith", "alice@example.com"),
(6, "USB Hub", 1500, "Bob Jones", "bob@example.com"),
(7, "Headphones", 6000, "Carol White", "carol@example.com"),
]
TASK2_TARGET_DDL = """CREATE TABLE customers (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT NOT NULL UNIQUE
);
CREATE TABLE orders (
id INTEGER PRIMARY KEY,
customer_id INTEGER NOT NULL,
item_name TEXT NOT NULL,
price INTEGER NOT NULL,
FOREIGN KEY (customer_id) REFERENCES customers(id)
);"""
TASK2_EXPECTED_CUSTOMER_COUNT = 3
TASK2_EXPECTED_ORDER_COUNT = 7
def seed_task2(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 2: Table Normalization."""
conn.executescript(TASK2_SOURCE_DDL)
conn.executemany(
"INSERT INTO purchases (purchase_id, item_name, price, customer_name, customer_email) "
"VALUES (?, ?, ?, ?, ?)",
TASK2_SOURCE_DATA,
)
conn.commit()
# =============================================================================
# TASK 3: Cascade Migration (Hard)
# =============================================================================
# Agent must fix types, enforce FKs, and handle orphaned/NULL records.
# Adversarial: salary as "$50,000" strings, one NULL salary,
# two orphaned assets referencing nonexistent employees.
TASK3_SOURCE_DDL = """
CREATE TABLE companies (
id INTEGER PRIMARY KEY,
name TEXT
);
CREATE TABLE departments (
id INTEGER PRIMARY KEY,
company_id INTEGER,
name TEXT
);
CREATE TABLE employees (
id INTEGER PRIMARY KEY,
department_id INTEGER,
name TEXT,
salary TEXT
);
CREATE TABLE assets (
id INTEGER PRIMARY KEY,
employee_id INTEGER,
description TEXT
);
"""
TASK3_COMPANIES_DATA = [
(1, "Acme Corp"),
(2, "Globex Inc"),
]
TASK3_DEPARTMENTS_DATA = [
(1, 1, "Engineering"),
(2, 1, "Marketing"),
(3, 2, "Sales"),
]
TASK3_EMPLOYEES_DATA = [
(1, 1, "Dave Kumar", "$90000"),
(2, 1, "Eve Chen", "$75000"),
(3, 2, "Frank O'Neill", "$60000"),
(4, 3, "Grace Lee", "$85000"),
(5, 3, "Hal Patel", None), # NULL salary — violates target NOT NULL
]
TASK3_ASSETS_DATA = [
(1, 1, "MacBook Pro"),
(2, 2, "Dell Monitor"),
(3, 3, "Standing Desk"),
(4, 99, "Orphaned Laptop"), # employee_id=99 does not exist
(5, 100, "Orphaned Chair"), # employee_id=100 does not exist
]
TASK3_TARGET_DDL = """CREATE TABLE companies (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL
);
CREATE TABLE departments (
id INTEGER PRIMARY KEY,
company_id INTEGER NOT NULL,
name TEXT NOT NULL,
FOREIGN KEY (company_id) REFERENCES companies(id)
);
CREATE TABLE employees (
id INTEGER PRIMARY KEY,
department_id INTEGER NOT NULL,
name TEXT NOT NULL,
salary INTEGER NOT NULL,
FOREIGN KEY (department_id) REFERENCES departments(id)
);
CREATE TABLE assets (
id INTEGER PRIMARY KEY,
employee_id INTEGER NOT NULL,
description TEXT NOT NULL,
FOREIGN KEY (employee_id) REFERENCES employees(id)
);
CREATE TABLE audit_log (
id INTEGER PRIMARY KEY,
source_table TEXT NOT NULL,
original_row_json TEXT NOT NULL,
reason TEXT NOT NULL
);"""
# Expected audit_log entries: 1 NULL salary employee + 2 orphaned assets = 3 rows
TASK3_EXPECTED_AUDIT_COUNT = 3
TASK3_EXPECTED_AUDIT_ENTRIES = [
("employees", "null_salary"),
("assets", "orphaned_record"),
("assets", "orphaned_record"),
]
# Expected employee salaries after migration (Hal Patel removed)
TASK3_EXPECTED_SALARIES: Dict[int, int] = {
1: 90000,
2: 75000,
3: 60000,
4: 85000,
}
TASK3_EXPECTED_EMPLOYEE_COUNT = 4
def seed_task3(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 3: Cascade Migration."""
conn.executescript(TASK3_SOURCE_DDL)
conn.executemany(
"INSERT INTO companies (id, name) VALUES (?, ?)",
TASK3_COMPANIES_DATA,
)
conn.executemany(
"INSERT INTO departments (id, company_id, name) VALUES (?, ?, ?)",
TASK3_DEPARTMENTS_DATA,
)
conn.executemany(
"INSERT INTO employees (id, department_id, name, salary) VALUES (?, ?, ?, ?)",
TASK3_EMPLOYEES_DATA,
)
conn.executemany(
"INSERT INTO assets (id, employee_id, description) VALUES (?, ?, ?)",
TASK3_ASSETS_DATA,
)
conn.commit()
# =============================================================================
# TASK 4: Soft-Delete Restoration (Easy)
# =============================================================================
# Agent must restore deleted products from a deletion_log, add is_deleted/deleted_at columns.
# Adversarial: "O'Brien Desk" (apostrophe), stock=0 on Webcam (must NOT confuse with is_deleted).
TASK4_SOURCE_DDL = """
CREATE TABLE products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
price REAL NOT NULL,
stock INTEGER NOT NULL
);
CREATE TABLE deletion_log (
id INTEGER PRIMARY KEY,
product_id INTEGER NOT NULL,
product_name TEXT NOT NULL,
product_price REAL NOT NULL,
product_stock INTEGER NOT NULL,
deleted_at TEXT NOT NULL
);
"""
TASK4_PRODUCTS_DATA = [
(1, "Laptop", 999.99, 15),
(2, "O'Brien Desk", 249.99, 8),
(3, "Monitor", 399.99, 23),
(4, "Keyboard", 89.99, 45),
(5, "Mouse", 29.99, 102),
]
TASK4_DELETION_LOG_DATA = [
(1, 6, "Headphones", 149.99, 30, "2024-01-15"),
(2, 7, "Webcam", 79.99, 0, "2024-02-20"), # stock=0 but NOT is_deleted=1
(3, 8, "USB-C Hub", 49.99, 12, "2024-03-10"),
]
TASK4_TARGET_DDL = """CREATE TABLE products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
price REAL NOT NULL,
stock INTEGER NOT NULL,
is_deleted INTEGER NOT NULL DEFAULT 0,
deleted_at TEXT
);"""
TASK4_EXPECTED_ROW_COUNT = 8
TASK4_EXPECTED_ID_SUM = 36 # 1+2+3+4+5+6+7+8
TASK4_EXPECTED_DELETED_COUNT = 3 # ids 6,7,8
TASK4_EXPECTED_ACTIVE_COUNT = 5 # ids 1-5
def seed_task4(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 4: Soft-Delete Restoration."""
conn.executescript(TASK4_SOURCE_DDL)
conn.executemany(
"INSERT INTO products (id, name, price, stock) VALUES (?, ?, ?, ?)",
TASK4_PRODUCTS_DATA,
)
conn.executemany(
"INSERT INTO deletion_log (id, product_id, product_name, product_price, product_stock, deleted_at) "
"VALUES (?, ?, ?, ?, ?, ?)",
TASK4_DELETION_LOG_DATA,
)
conn.commit()
# =============================================================================
# TASK 5: Schema Version Merge (Medium)
# =============================================================================
# Agent must merge products_v1 (price as "$XX.XX" TEXT) and products_v2 (price as REAL)
# into a single products table. v2 wins on ID conflicts. Must add source column.
# Adversarial: id=101 high ID, NULL category, "$" price coercion, conflicting rows.
TASK5_SOURCE_DDL = """
CREATE TABLE products_v1 (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
price TEXT NOT NULL,
category TEXT,
supplier TEXT
);
CREATE TABLE products_v2 (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
unit_cost REAL NOT NULL,
category TEXT NOT NULL,
brand TEXT,
sku TEXT
);
"""
TASK5_V1_DATA = [
(1, "Widget A", "$12.50", "Electronics", "AcmeCo"),
(2, "Widget B", "$8.99", "Electronics", "AcmeCo"),
(3, "Gadget X", "$45.00", None, "TechCorp"),
(4, "Gadget Y", "$32.50", "Tools", "TechCorp"),
(5, "Doohickey", "$5.99", "Office", "SupplyPro"),
(101, "Legacy Item", "$99.99", "Electronics", "OldCo"),
]
TASK5_V2_DATA = [
(1, "Widget A", 12.50, "Electronics", "AcmeCo", "SKU-001"),
(2, "Widget B Updated", 9.99, "Electronics", "AcmeCo", "SKU-002"),
(6, "New Product F", 67.00, "Tools", "NewCorp", "SKU-006"),
(7, "New Product G", 23.50, "Office", "NewCorp", "SKU-007"),
(8, "New Product H", 11.00, "Electronics", "ImportCo","SKU-008"),
]
TASK5_TARGET_DDL = """CREATE TABLE products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
price REAL NOT NULL,
category TEXT,
supplier TEXT,
brand TEXT,
sku TEXT,
source TEXT NOT NULL
);"""
TASK5_EXPECTED_ROW_COUNT = 9
TASK5_EXPECTED_PRICE_SUM = round(12.50 + 9.99 + 45.00 + 32.50 + 5.99 + 99.99 + 67.00 + 23.50 + 11.00, 2)
TASK5_EXPECTED_BOTH_SOURCE_COUNT = 2 # ids 1 and 2
def seed_task5(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 5: Schema Version Merge."""
conn.executescript(TASK5_SOURCE_DDL)
conn.executemany(
"INSERT INTO products_v1 (id, name, price, category, supplier) VALUES (?, ?, ?, ?, ?)",
TASK5_V1_DATA,
)
conn.executemany(
"INSERT INTO products_v2 (id, name, unit_cost, category, brand, sku) VALUES (?, ?, ?, ?, ?, ?)",
TASK5_V2_DATA,
)
conn.commit()
# =============================================================================
# TASK 6: Multi-Entity Extraction (Medium — Hard End)
# =============================================================================
# Agent must decompose a sales_records god-table into 3NF (5 tables).
# Adversarial: leading whitespace email, empty customer email, comma in SKU.
TASK6_SOURCE_DDL = """
CREATE TABLE sales_records (
id INTEGER PRIMARY KEY,
rep_name TEXT NOT NULL,
rep_email TEXT NOT NULL,
rep_region TEXT NOT NULL,
customer_name TEXT NOT NULL,
customer_email TEXT NOT NULL,
customer_tier TEXT NOT NULL,
product_name TEXT NOT NULL,
product_sku TEXT NOT NULL,
product_category TEXT NOT NULL,
quantity INTEGER NOT NULL,
unit_price REAL NOT NULL,
discount_pct INTEGER NOT NULL DEFAULT 0,
sale_date TEXT NOT NULL
);
"""
TASK6_SOURCE_DATA = [
(1, "Alice Chen", " alice@company.com", "North", "Globex Corp", "globex@corp.com", "enterprise", "Widget Pro", "WIDGET-001", "Electronics", 5, 299.99, 10, "2024-01-10"),
(2, "Alice Chen", "alice@company.com", "North", "Initech LLC", "info@initech.com", "basic", "Widget Pro", "WIDGET-001", "Electronics", 2, 299.99, 0, "2024-01-15"),
(3, "Bob Martinez", "bob@company.com", "South", "Globex Corp", "globex@corp.com", "enterprise", "Gadget X", "GADGET-X01", "Hardware", 10, 89.99, 5, "2024-01-20"),
(4, "Bob Martinez", "bob@company.com", "South", "Umbrella Inc", "sales@umbrella.co", "premium", "Gadget X", "GADGET-X01", "Hardware", 3, 89.99, 0, "2024-02-01"),
(5, "Carol White", "carol@company.com", "East", "Initech LLC", "info@initech.com", "basic", "Tool Kit", "TOOLS,001", "Hardware", 1, 199.99, 0, "2024-02-05"),
(6, "Alice Chen", "alice@company.com", "North", "Pendant Corp", "", "free", "Widget Pro", "WIDGET-001", "Electronics", 7, 299.99, 15, "2024-02-10"),
(7, "Carol White", "carol@company.com", "East", "Globex Corp", "globex@corp.com", "enterprise", "Nano Device", "NANO-D01", "Electronics", 2, 549.99, 20, "2024-02-15"),
(8, "Bob Martinez", "bob@company.com", "South", "Umbrella Inc", "sales@umbrella.co", "premium", "Tool Kit", "TOOLS,001", "Hardware", 4, 199.99, 10, "2024-03-01"),
(9, "Alice Chen", "alice@company.com", "North", "Initech LLC", "info@initech.com", "basic", "Nano Device", "NANO-D01", "Electronics", 1, 549.99, 0, "2024-03-05"),
(10, "Carol White", "carol@company.com", "East", "Umbrella Inc", "sales@umbrella.co", "premium", "Cable Bundle","CABLE-5PK", "Accessories", 20, 14.99, 0, "2024-03-10"),
(11, "Bob Martinez", "bob@company.com", "South", "Globex Corp", "globex@corp.com", "enterprise", "Cable Bundle","CABLE-5PK", "Accessories", 15, 14.99, 5, "2024-03-15"),
(12, "Carol White", "carol@company.com", "East", "Pendant Corp", "orders@pendant.io", "free", "Gadget X", "GADGET-X01", "Hardware", 6, 89.99, 0, "2024-03-20"),
]
TASK6_TARGET_DDL = """CREATE TABLE salespersons (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT NOT NULL UNIQUE,
region TEXT NOT NULL
);
CREATE TABLE customers (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT NOT NULL UNIQUE,
tier TEXT NOT NULL
);
CREATE TABLE products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
sku TEXT NOT NULL UNIQUE,
category TEXT NOT NULL
);
CREATE TABLE sales (
id INTEGER PRIMARY KEY,
salesperson_id INTEGER NOT NULL,
customer_id INTEGER NOT NULL,
product_id INTEGER NOT NULL,
quantity INTEGER NOT NULL,
unit_price REAL NOT NULL,
discount_pct INTEGER NOT NULL DEFAULT 0,
sale_date TEXT NOT NULL,
FOREIGN KEY (salesperson_id) REFERENCES salespersons(id),
FOREIGN KEY (customer_id) REFERENCES customers(id),
FOREIGN KEY (product_id) REFERENCES products(id)
);
CREATE TABLE data_issues (
id INTEGER PRIMARY KEY,
source_table TEXT NOT NULL,
source_row_id INTEGER NOT NULL,
issue_type TEXT NOT NULL,
issue_detail TEXT NOT NULL
);"""
TASK6_EXPECTED_SALESPERSON_COUNT = 3
TASK6_EXPECTED_CUSTOMER_COUNT = 3 # Pendant Corp row 6 excluded (empty email)
TASK6_EXPECTED_PRODUCT_COUNT = 5
TASK6_EXPECTED_SALES_COUNT = 11 # row 6 excluded
TASK6_EXPECTED_DATA_ISSUES_COUNT = 1
def seed_task6(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 6: Multi-Entity Extraction."""
conn.executescript(TASK6_SOURCE_DDL)
conn.executemany(
"INSERT INTO sales_records (id, rep_name, rep_email, rep_region, "
"customer_name, customer_email, customer_tier, product_name, product_sku, "
"product_category, quantity, unit_price, discount_pct, sale_date) "
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
TASK6_SOURCE_DATA,
)
conn.commit()
# =============================================================================
# TASK 7: Dual-Source Consolidation (Hard)
# =============================================================================
# Agent must merge 6 source tables from two incompatible systems (Legacy CRM + Modern SaaS)
# into 4 unified target tables. Cross-system email dedup, currency coercion, orphan detection.
TASK7_LEGACY_CUSTOMERS_DDL = """
CREATE TABLE legacy_customers (
id INTEGER PRIMARY KEY,
full_name TEXT,
contact_email TEXT,
phone TEXT,
account_type TEXT,
join_date TEXT
);
"""
TASK7_LEGACY_ORDERS_DDL = """
CREATE TABLE legacy_orders (
id INTEGER PRIMARY KEY,
customer_id INTEGER,
product_code TEXT,
total_amount TEXT,
order_status TEXT,
order_date TEXT
);
"""
TASK7_LEGACY_PRODUCTS_DDL = """
CREATE TABLE legacy_products (
code TEXT PRIMARY KEY,
description TEXT,
unit_price TEXT
);
"""
TASK7_MODERN_USERS_DDL = """
CREATE TABLE modern_users (
uuid TEXT PRIMARY KEY,
display_name TEXT,
email_address TEXT,
subscription_tier INTEGER,
created_at TEXT
);
"""
TASK7_MODERN_TRANSACTIONS_DDL = """
CREATE TABLE modern_transactions (
id INTEGER PRIMARY KEY,
user_uuid TEXT,
item_sku TEXT,
amount REAL,
currency TEXT,
tx_status INTEGER,
created_at TEXT
);
"""
TASK7_MODERN_CATALOG_DDL = """
CREATE TABLE modern_catalog (
sku TEXT PRIMARY KEY,
title TEXT,
base_price REAL
);
"""
TASK7_LEGACY_CUSTOMERS_DATA = [
(1, "Alice Johnson", "alice@example.com", "+1-555-0101", "premium", "2021-03-15"),
(2, "Bob Chen", "bob@example.com", "+1-555-0102", "basic", "2022-07-01"),
(3, "Carol Davis", None, "+1-555-0103", "free", "2023-01-10"),
(4, "Dave Wilson", "dave@example.com", "+1-555-0104", "premium", "2021-11-20"),
(5, "Eve Martinez", "eve@example.com", "+1-555-0105", "free", "2023-06-05"),
]
TASK7_MODERN_USERS_DATA = [
("uuid-A1", "Alice J.", "alice@example.com", 3, "2021-03-15"),
("uuid-B2", "R. Bob Chen", "bob@example.com", 2, "2022-07-01"),
("uuid-F6", "Frank Lee", "frank@example.com", 4, "2022-09-30"),
("uuid-G7", "Grace Kim", "grace@example.com", 1, "2024-01-15"),
]
TASK7_LEGACY_ORDERS_DATA = [
(1, 1, "PROD-A", "$1,234.56", "delivered", "2022-01-10"),
(2, 2, "PROD-B", "$89.99", "shipped", "2022-03-15"),
(3, 4, "PROD-A", "$2,500.00", "delivered", "2022-05-20"),
(4, 3, "PROD-C", "$45.00", "pending", "2023-02-01"),
]
TASK7_LEGACY_PRODUCTS_DATA = [
("PROD-A", "Enterprise Widget", "$1,234.56"),
("PROD-B", "Basic Gadget", "$89.99"),
("PROD-C", "Starter Kit", "$45.00"),
]
TASK7_MODERN_TRANSACTIONS_DATA = [
(1, "uuid-A1", "SKU-001", 299.99, "USD", 3, "2023-06-01"),
(2, "uuid-B2", "SKU-002", 89.99, None, 2, "2023-07-15"),
(3, "uuid-F6", "SKU-001", 299.99, None, 3, "2023-08-20"),
(4, "uuid-DEAD", "SKU-003", 15.99, None, 1, "2023-09-01"), # orphan
(5, "uuid-G7", "SKU-002", 89.99, "USD", 4, "2023-10-10"),
(6, "uuid-A1", "SKU-003", 15.99, "EUR", 5, "2023-11-01"),
]
TASK7_MODERN_CATALOG_DATA = [
("SKU-001", "Pro Widget", 299.99),
("SKU-002", "Smart Gadget", 89.99),
("SKU-003", "Mini Accessory", 15.99),
]
TASK7_TARGET_DDL = """CREATE TABLE unified_customers (
id INTEGER PRIMARY KEY AUTOINCREMENT,
legacy_id INTEGER,
modern_uuid TEXT,
name TEXT,
email TEXT,
phone TEXT,
tier TEXT NOT NULL DEFAULT 'free',
source TEXT NOT NULL,
created_at TEXT
);
CREATE TABLE unified_products (
id INTEGER PRIMARY KEY AUTOINCREMENT,
code TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
price REAL NOT NULL,
source TEXT NOT NULL
);
CREATE TABLE unified_orders (
id INTEGER PRIMARY KEY AUTOINCREMENT,
customer_id INTEGER NOT NULL,
product_id INTEGER,
amount REAL NOT NULL,
currency TEXT NOT NULL DEFAULT 'USD',
status TEXT NOT NULL,
order_date TEXT,
source TEXT NOT NULL,
FOREIGN KEY (customer_id) REFERENCES unified_customers(id)
);
CREATE TABLE migration_issues (
id INTEGER PRIMARY KEY,
source_system TEXT NOT NULL,
source_table TEXT NOT NULL,
source_id TEXT NOT NULL,
issue_type TEXT NOT NULL,
resolution TEXT NOT NULL
);"""
TASK7_EXPECTED_UNIFIED_CUSTOMERS = 7
TASK7_EXPECTED_BOTH_SOURCE_COUNT = 2
TASK7_EXPECTED_UNIFIED_ORDERS = 9
TASK7_EXPECTED_MIGRATION_ISSUES = 2
# Tier mapping: 1→'free', 2→'basic', 3→'premium', 4→'enterprise'
TASK7_TIER_MAP = {1: "free", 2: "basic", 3: "premium", 4: "enterprise"}
# Status mapping: 1→'pending', 2→'processing', 3→'complete', 4→'failed', 5→'refunded'
TASK7_STATUS_MAP = {1: "pending", 2: "processing", 3: "complete", 4: "failed", 5: "refunded"}
def seed_task7(conn: sqlite3.Connection) -> None:
"""Seed the database for Task 7: Dual-Source Consolidation."""
conn.executescript(TASK7_LEGACY_CUSTOMERS_DDL)
conn.executescript(TASK7_LEGACY_ORDERS_DDL)
conn.executescript(TASK7_LEGACY_PRODUCTS_DDL)
conn.executescript(TASK7_MODERN_USERS_DDL)
conn.executescript(TASK7_MODERN_TRANSACTIONS_DDL)
conn.executescript(TASK7_MODERN_CATALOG_DDL)
conn.executemany("INSERT INTO legacy_customers VALUES (?, ?, ?, ?, ?, ?)", TASK7_LEGACY_CUSTOMERS_DATA)
conn.executemany("INSERT INTO legacy_orders VALUES (?, ?, ?, ?, ?, ?)", TASK7_LEGACY_ORDERS_DATA)
conn.executemany("INSERT INTO legacy_products VALUES (?, ?, ?)", TASK7_LEGACY_PRODUCTS_DATA)
conn.executemany("INSERT INTO modern_users VALUES (?, ?, ?, ?, ?)", TASK7_MODERN_USERS_DATA)
conn.executemany("INSERT INTO modern_transactions VALUES (?, ?, ?, ?, ?, ?, ?)", TASK7_MODERN_TRANSACTIONS_DATA)
conn.executemany("INSERT INTO modern_catalog VALUES (?, ?, ?)", TASK7_MODERN_CATALOG_DATA)
conn.commit()
# =============================================================================
# Golden Migration Functions
# =============================================================================
# These produce the CORRECT expected database state from any seed data.
# Used by the dynamic grader to compare against the agent's output.
# If seed data changes, the golden DB auto-updates — no hardcoded literals.
def golden_task1(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 1: Column Restructure."""
conn.execute("CREATE TABLE users_new (id INTEGER PRIMARY KEY, full_name TEXT NOT NULL)")
conn.execute(
"INSERT INTO users_new (id, full_name) "
"SELECT id, first_name || ' ' || last_name FROM users"
)
conn.execute("DROP TABLE users")
conn.execute("ALTER TABLE users_new RENAME TO users")
conn.commit()
def golden_task2(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 2: Table Normalization."""
conn.execute("PRAGMA foreign_keys = OFF")
conn.execute(
"CREATE TABLE customers ("
"id INTEGER PRIMARY KEY, name TEXT NOT NULL, email TEXT NOT NULL UNIQUE)"
)
conn.execute(
"INSERT INTO customers (name, email) "
"SELECT DISTINCT customer_name, customer_email FROM purchases"
)
conn.execute(
"CREATE TABLE orders ("
"id INTEGER PRIMARY KEY, customer_id INTEGER NOT NULL, "
"item_name TEXT NOT NULL, price INTEGER NOT NULL, "
"FOREIGN KEY (customer_id) REFERENCES customers(id))"
)
conn.execute(
"INSERT INTO orders (customer_id, item_name, price) "
"SELECT c.id, p.item_name, p.price "
"FROM purchases p JOIN customers c ON p.customer_email = c.email"
)
conn.execute("DROP TABLE purchases")
conn.execute("PRAGMA foreign_keys = ON")
conn.commit()
def golden_task3(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 3: Cascade Migration."""
conn.execute("PRAGMA foreign_keys = OFF")
# Create audit_log
conn.execute(
"CREATE TABLE audit_log (id INTEGER PRIMARY KEY, source_table TEXT NOT NULL, "
"original_row_json TEXT NOT NULL, reason TEXT NOT NULL)"
)
# Log orphaned assets
conn.execute(
"INSERT INTO audit_log (source_table, original_row_json, reason) "
"SELECT 'assets', '{\"id\":' || id || ',\"employee_id\":' || employee_id || '}', 'orphaned_record' "
"FROM assets WHERE employee_id NOT IN (SELECT id FROM employees)"
)
# Log NULL salary employees
conn.execute(
"INSERT INTO audit_log (source_table, original_row_json, reason) "
"SELECT 'employees', '{\"id\":' || id || ',\"name\":\"' || name || '\"}', 'null_salary' "
"FROM employees WHERE salary IS NULL"
)
# Rebuild companies
conn.execute("CREATE TABLE companies_new (id INTEGER PRIMARY KEY, name TEXT NOT NULL)")
conn.execute("INSERT INTO companies_new SELECT id, name FROM companies")
conn.execute("DROP TABLE companies")
conn.execute("ALTER TABLE companies_new RENAME TO companies")
# Rebuild departments
conn.execute(
"CREATE TABLE departments_new (id INTEGER PRIMARY KEY, company_id INTEGER NOT NULL, "
"name TEXT NOT NULL, FOREIGN KEY (company_id) REFERENCES companies(id))"
)
conn.execute("INSERT INTO departments_new SELECT id, company_id, name FROM departments")
conn.execute("DROP TABLE departments")
conn.execute("ALTER TABLE departments_new RENAME TO departments")
# Rebuild employees (remove NULL salary, coerce TEXT to INT)
conn.execute(
"CREATE TABLE employees_new (id INTEGER PRIMARY KEY, department_id INTEGER NOT NULL, "
"name TEXT NOT NULL, salary INTEGER NOT NULL, "
"FOREIGN KEY (department_id) REFERENCES departments(id))"
)
conn.execute(
"INSERT INTO employees_new (id, department_id, name, salary) "
"SELECT id, department_id, name, "
"CAST(REPLACE(REPLACE(salary, '$', ''), ',', '') AS INTEGER) "
"FROM employees WHERE salary IS NOT NULL"
)
conn.execute("DROP TABLE employees")
conn.execute("ALTER TABLE employees_new RENAME TO employees")
# Rebuild assets (remove orphans)
conn.execute(
"CREATE TABLE assets_new (id INTEGER PRIMARY KEY, employee_id INTEGER NOT NULL, "
"description TEXT NOT NULL, FOREIGN KEY (employee_id) REFERENCES employees(id))"
)
conn.execute(
"INSERT INTO assets_new SELECT id, employee_id, description FROM assets "
"WHERE employee_id IN (SELECT id FROM employees)"
)
conn.execute("DROP TABLE assets")
conn.execute("ALTER TABLE assets_new RENAME TO assets")
conn.execute("PRAGMA foreign_keys = ON")
conn.commit()
def golden_task4(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 4: Soft-Delete Restoration."""
conn.execute("PRAGMA foreign_keys = OFF")
# Create new table with extra columns
conn.execute(
"CREATE TABLE products_new (id INTEGER PRIMARY KEY, name TEXT NOT NULL, "
"price REAL NOT NULL, stock INTEGER NOT NULL, "
"is_deleted INTEGER NOT NULL DEFAULT 0, deleted_at TEXT)"
)
# Copy existing products as active
conn.execute(
"INSERT INTO products_new (id, name, price, stock, is_deleted, deleted_at) "
"SELECT id, name, price, stock, 0, NULL FROM products"
)
# Restore deleted products from log
conn.execute(
"INSERT INTO products_new (id, name, price, stock, is_deleted, deleted_at) "
"SELECT product_id, product_name, product_price, product_stock, 1, deleted_at "
"FROM deletion_log"
)
conn.execute("DROP TABLE products")
conn.execute("ALTER TABLE products_new RENAME TO products")
conn.execute("DROP TABLE deletion_log")
conn.execute("PRAGMA foreign_keys = ON")
conn.commit()
def golden_task5(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 5: Schema Version Merge."""
conn.execute("PRAGMA foreign_keys = OFF")
conn.execute(
"CREATE TABLE products (id INTEGER PRIMARY KEY, name TEXT NOT NULL, "
"price REAL NOT NULL, category TEXT, supplier TEXT, brand TEXT, "
"sku TEXT, source TEXT NOT NULL)"
)
# Insert v1-only rows
conn.execute(
"INSERT INTO products (id, name, price, category, supplier, brand, sku, source) "
"SELECT id, name, CAST(REPLACE(REPLACE(price, '$', ''), ',', '') AS REAL), "
"category, supplier, NULL, NULL, 'v1' "
"FROM products_v1 WHERE id NOT IN (SELECT id FROM products_v2)"
)
# Insert v2-only rows
conn.execute(
"INSERT INTO products (id, name, price, category, supplier, brand, sku, source) "
"SELECT id, name, unit_cost, category, NULL, brand, sku, 'v2' "
"FROM products_v2 WHERE id NOT IN (SELECT id FROM products_v1)"
)
# Insert conflict rows (v2 wins for name/price)
conn.execute(
"INSERT INTO products (id, name, price, category, supplier, brand, sku, source) "
"SELECT v2.id, v2.name, v2.unit_cost, v2.category, v1.supplier, v2.brand, v2.sku, 'both' "
"FROM products_v2 v2 JOIN products_v1 v1 ON v2.id = v1.id"
)
conn.execute("DROP TABLE products_v1")
conn.execute("DROP TABLE products_v2")
conn.execute("PRAGMA foreign_keys = ON")
conn.commit()
def golden_task6(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 6: Multi-Entity Extraction."""
conn.execute("PRAGMA foreign_keys = OFF")
# Create target tables
conn.execute(
"CREATE TABLE salespersons (id INTEGER PRIMARY KEY, name TEXT NOT NULL, "
"email TEXT NOT NULL UNIQUE, region TEXT NOT NULL)"
)
conn.execute(
"CREATE TABLE customers (id INTEGER PRIMARY KEY, name TEXT NOT NULL, "
"email TEXT NOT NULL UNIQUE, tier TEXT NOT NULL)"
)
conn.execute(
"CREATE TABLE products (id INTEGER PRIMARY KEY, name TEXT NOT NULL, "
"sku TEXT NOT NULL UNIQUE, category TEXT NOT NULL)"
)
conn.execute(
"CREATE TABLE sales (id INTEGER PRIMARY KEY, salesperson_id INTEGER NOT NULL, "
"customer_id INTEGER NOT NULL, product_id INTEGER NOT NULL, "
"quantity INTEGER NOT NULL, unit_price REAL NOT NULL, "
"discount_pct INTEGER NOT NULL DEFAULT 0, sale_date TEXT NOT NULL, "
"FOREIGN KEY (salesperson_id) REFERENCES salespersons(id), "
"FOREIGN KEY (customer_id) REFERENCES customers(id), "
"FOREIGN KEY (product_id) REFERENCES products(id))"
)
conn.execute(
"CREATE TABLE data_issues (id INTEGER PRIMARY KEY, source_table TEXT NOT NULL, "
"source_row_id INTEGER NOT NULL, issue_type TEXT NOT NULL, "
"issue_detail TEXT NOT NULL)"
)
# Populate salespersons (TRIM email)
conn.execute(
"INSERT INTO salespersons (name, email, region) "
"SELECT DISTINCT rep_name, TRIM(rep_email), rep_region FROM sales_records"
)
# Populate customers (exclude empty email rows)
conn.execute(
"INSERT INTO customers (name, email, tier) "
"SELECT DISTINCT customer_name, customer_email, customer_tier "
"FROM sales_records WHERE customer_email IS NOT NULL AND customer_email != ''"
)
# Populate products
conn.execute(
"INSERT INTO products (name, sku, category) "
"SELECT DISTINCT product_name, product_sku, product_category FROM sales_records"
)
# Populate sales (exclude rows with empty customer email)
conn.execute(
"INSERT INTO sales (salesperson_id, customer_id, product_id, quantity, "
"unit_price, discount_pct, sale_date) "
"SELECT sp.id, c.id, p.id, sr.quantity, sr.unit_price, sr.discount_pct, sr.sale_date "
"FROM sales_records sr "
"JOIN salespersons sp ON TRIM(sr.rep_email) = sp.email "
"JOIN customers c ON sr.customer_email = c.email "
"JOIN products p ON sr.product_sku = p.sku "
"WHERE sr.customer_email IS NOT NULL AND sr.customer_email != ''"
)
# Log data issues (empty email)
conn.execute(
"INSERT INTO data_issues (source_table, source_row_id, issue_type, issue_detail) "
"SELECT 'sales_records', id, 'empty_email', "
"'Customer email is empty for: ' || customer_name "
"FROM sales_records WHERE customer_email IS NULL OR customer_email = ''"
)
conn.execute("DROP TABLE sales_records")
conn.execute("PRAGMA foreign_keys = ON")
conn.commit()
def golden_task7(conn: sqlite3.Connection) -> None:
"""Golden migration for Task 7: Dual-Source Consolidation."""
conn.execute("PRAGMA foreign_keys = OFF")
# Create unified_customers
conn.execute(
"CREATE TABLE unified_customers (id INTEGER PRIMARY KEY AUTOINCREMENT, "
"legacy_id INTEGER, modern_uuid TEXT, name TEXT, email TEXT, phone TEXT, "
"tier TEXT NOT NULL DEFAULT 'free', source TEXT NOT NULL, created_at TEXT)"
)
# Insert legacy-only customers (no email match in modern)
conn.execute(
"INSERT INTO unified_customers (legacy_id, modern_uuid, name, email, phone, tier, source, created_at) "
"SELECT lc.id, NULL, lc.full_name, lc.contact_email, lc.phone, lc.account_type, 'legacy', lc.join_date "
"FROM legacy_customers lc "
"WHERE lc.contact_email IS NULL OR lc.contact_email NOT IN (SELECT email_address FROM modern_users WHERE email_address IS NOT NULL)"
)
# Insert modern-only users (no email match in legacy)
conn.execute(
"INSERT INTO unified_customers (legacy_id, modern_uuid, name, email, phone, tier, source, created_at) "
"SELECT NULL, mu.uuid, mu.display_name, mu.email_address, NULL, "
"CASE mu.subscription_tier "
" WHEN 1 THEN 'free' WHEN 2 THEN 'basic' WHEN 3 THEN 'premium' WHEN 4 THEN 'enterprise' "
" ELSE 'free' END, "
"'modern', mu.created_at "
"FROM modern_users mu "
"WHERE mu.email_address NOT IN (SELECT contact_email FROM legacy_customers WHERE contact_email IS NOT NULL)"
)
# Insert matched (both) customers — legacy name + modern tier
conn.execute(
"INSERT INTO unified_customers (legacy_id, modern_uuid, name, email, phone, tier, source, created_at) "
"SELECT lc.id, mu.uuid, lc.full_name, lc.contact_email, lc.phone, "
"CASE mu.subscription_tier "
" WHEN 1 THEN 'free' WHEN 2 THEN 'basic' WHEN 3 THEN 'premium' WHEN 4 THEN 'enterprise' "
" ELSE 'free' END, "
"'both', lc.join_date "
"FROM legacy_customers lc "
"JOIN modern_users mu ON lc.contact_email = mu.email_address "
"WHERE lc.contact_email IS NOT NULL"
)
# Create unified_products
conn.execute(
"CREATE TABLE unified_products (id INTEGER PRIMARY KEY AUTOINCREMENT, "
"code TEXT NOT NULL UNIQUE, title TEXT NOT NULL, price REAL NOT NULL, "
"source TEXT NOT NULL)"
)
# Legacy products
conn.execute(
"INSERT INTO unified_products (code, title, price, source) "
"SELECT code, description, "
"CAST(REPLACE(REPLACE(unit_price, '$', ''), ',', '') AS REAL), 'legacy' "
"FROM legacy_products"
)
# Modern products (no code overlap expected)
conn.execute(
"INSERT INTO unified_products (code, title, price, source) "
"SELECT sku, title, base_price, 'modern' "
"FROM modern_catalog"
)
# Create migration_issues
conn.execute(
"CREATE TABLE migration_issues (id INTEGER PRIMARY KEY, "
"source_system TEXT NOT NULL, source_table TEXT NOT NULL, "
"source_id TEXT NOT NULL, issue_type TEXT NOT NULL, "
"resolution TEXT NOT NULL)"
)
# Log NULL email customer
conn.execute(
"INSERT INTO migration_issues (source_system, source_table, source_id, issue_type, resolution) "
"SELECT 'legacy', 'legacy_customers', CAST(id AS TEXT), 'null_email', "
"'Imported without email' "
"FROM legacy_customers WHERE contact_email IS NULL"
)
# Log orphaned transactions
conn.execute(
"INSERT INTO migration_issues (source_system, source_table, source_id, issue_type, resolution) "
"SELECT 'modern', 'modern_transactions', CAST(id AS TEXT), 'orphaned_record', "
"'User UUID not found: ' || user_uuid "
"FROM modern_transactions WHERE user_uuid NOT IN (SELECT uuid FROM modern_users)"
)
# Create unified_orders
conn.execute(
"CREATE TABLE unified_orders (id INTEGER PRIMARY KEY AUTOINCREMENT, "
"customer_id INTEGER NOT NULL, product_id INTEGER, amount REAL NOT NULL, "
"currency TEXT NOT NULL DEFAULT 'USD', status TEXT NOT NULL, "
"order_date TEXT, source TEXT NOT NULL, "
"FOREIGN KEY (customer_id) REFERENCES unified_customers(id))"
)
# Legacy orders
conn.execute(
"INSERT INTO unified_orders (customer_id, product_id, amount, currency, status, order_date, source) "
"SELECT uc.id, up.id, "
"CAST(REPLACE(REPLACE(lo.total_amount, '$', ''), ',', '') AS REAL), "
"'USD', lo.order_status, lo.order_date, 'legacy' "
"FROM legacy_orders lo "
"JOIN legacy_customers lc ON lo.customer_id = lc.id "
"JOIN unified_customers uc ON (uc.legacy_id = lc.id) "
"LEFT JOIN unified_products up ON lo.product_code = up.code"
)
# Modern transactions (exclude orphans)
conn.execute(
"INSERT INTO unified_orders (customer_id, product_id, amount, currency, status, order_date, source) "
"SELECT uc.id, up.id, mt.amount, "
"COALESCE(mt.currency, 'USD'), "
"CASE mt.tx_status "
" WHEN 1 THEN 'pending' WHEN 2 THEN 'processing' WHEN 3 THEN 'complete' "
" WHEN 4 THEN 'failed' WHEN 5 THEN 'refunded' ELSE 'unknown' END, "
"mt.created_at, 'modern' "
"FROM modern_transactions mt "
"JOIN modern_users mu ON mt.user_uuid = mu.uuid "
"JOIN unified_customers uc ON (uc.modern_uuid = mu.uuid OR uc.email = mu.email_address) "
"LEFT JOIN unified_products up ON mt.item_sku = up.code"
)
# Clean up source tables
conn.execute("DROP TABLE legacy_customers")
conn.execute("DROP TABLE legacy_orders")
conn.execute("DROP TABLE legacy_products")
conn.execute("DROP TABLE modern_users")
conn.execute("DROP TABLE modern_transactions")
conn.execute("DROP TABLE modern_catalog")
conn.execute("PRAGMA foreign_keys = ON")
conn.commit()
# =============================================================================
# TASK 8: Data Poisoning & Quarantine Routing (Extreme)
# =============================================================================
TASK8_TARGET_DDL = """
CREATE TABLE inventory (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
price REAL NOT NULL,
sku TEXT UNIQUE
);
CREATE TABLE inventory_quarantine (
id INTEGER PRIMARY KEY,
raw_name TEXT,
raw_price TEXT,
raw_sku TEXT,
error_reason TEXT
);
""".strip()
def seed_task8(conn):
conn.execute("CREATE TABLE staging_data (id INTEGER, item TEXT, cost TEXT, sku_code TEXT)")
data = [
(1, "Oscilloscope", "1500.00", "OSC-001"),
(2, "Multimeter", " 75.50 ", "MUL-002"),
(3, "Soldering Iron", "$45.00", "SLD-003"),
(4, "Lead Solder", "N/A", "LSD-004"),
(5, "DC Power Supply", "299.99", "PWR-005"),
(6, "Unknown Device", "INVALID", "UNK-006"),
(7, "Wire Strippers", "$ 12.50", "WRE-007"),
]
conn.executemany("INSERT INTO staging_data VALUES (?,?,?,?)", data)
conn.commit()
def golden_task8(conn):
conn.execute("CREATE TABLE inventory (id INTEGER PRIMARY KEY, name TEXT NOT NULL, price REAL NOT NULL, sku TEXT UNIQUE)")
conn.execute("CREATE TABLE inventory_quarantine (id INTEGER PRIMARY KEY, raw_name TEXT, raw_price TEXT, raw_sku TEXT, error_reason TEXT)")
# Process staging_data
cursor = conn.execute("SELECT id, item, cost, sku_code FROM staging_data")
for row in cursor.fetchall():
rid, name, cost, sku = row
clean_cost = cost.replace("$", "").strip()
try:
price = float(clean_cost)
conn.execute("INSERT INTO inventory (id, name, price, sku) VALUES (?,?,?,?)", (rid, name, price, sku))
except ValueError:
conn.execute("INSERT INTO inventory_quarantine (raw_name, raw_price, raw_sku, error_reason) VALUES (?,?,?,?)",
(name, cost, sku, "invalid_numeric_format"))
conn.commit()
# =============================================================================
# Task Registry
# =============================================================================
TASKS = {
"column-restructure": {
"seed_fn": seed_task1,
"golden_fn": golden_task1,
"target_ddl": TASK1_TARGET_DDL,
"description": "Merge first_name and last_name into a single full_name column (concatenated with a space) without data loss. Apostrophes in names (e.g., O'Brien) must be preserved.",
"difficulty": "easy",
"max_steps": 10,
},
"soft-delete-restoration": {
"seed_fn": seed_task4,
"golden_fn": golden_task4,
"target_ddl": TASK4_TARGET_DDL,
"description": "Restore deleted products from the deletion_log table back into the products table. Use product_id from deletion_log (NOT the log's id column) as the primary key. Add is_deleted (1) and deleted_at values from log. Original rows stay as is_deleted=0, deleted_at=NULL.",
"difficulty": "easy",
"max_steps": 10,
},
"table-normalization": {
"seed_fn": seed_task2,
"golden_fn": golden_task2,
"target_ddl": TASK2_TARGET_DDL,
"description": "Normalize a flat purchases table into customers and orders tables linked by customer_id (FK). Ensure customers are distinct by email.",
"difficulty": "medium",
"max_steps": 15,
},
"schema-version-merge": {
"seed_fn": seed_task5,
"golden_fn": golden_task5,
"target_ddl": TASK5_TARGET_DDL,
"description": "Merge products_v1 (Legacy) and products_v2 (Modern) with ID collision logic: Modern (v2) wins. Coerce v1 price strings ($) to REAL.",
"difficulty": "medium",
"max_steps": 15,
},
"multi-entity-extraction": {
"seed_fn": seed_task6,
"golden_fn": golden_task6,
"target_ddl": TASK6_TARGET_DDL,
"description": "Decompose sales_records into 3NF: salespersons, customers, products, and sales. Route rows with missing emails to data_issues.",
"difficulty": "medium",
"max_steps": 15,
},
"cascade-migration": {
"seed_fn": seed_task3,
"golden_fn": golden_task3,
"target_ddl": TASK3_TARGET_DDL,
"description": "Multi-table FK cascade with type coercion for salary and orphan logging for assets.",
"difficulty": "hard",
"max_steps": 20,
},
"dual-source-consolidation": {
"seed_fn": seed_task7,
"golden_fn": golden_task7,
"target_ddl": TASK7_TARGET_DDL,
"description": "Consolidate Legacy CRM and Modern SaaS data with cross-system email deduping and complex state/type mapping.",
"difficulty": "hard",
"max_steps": 20,
},
"data-poisoning-quarantine": {
"seed_fn": seed_task8,
"golden_fn": golden_task8,
"target_ddl": TASK8_TARGET_DDL,
"description": "The ultimate technical test: Migrate inventory from a 'poisoned' staging table. Cleanse raw price strings and route un-coerceable rows (like 'N/A') to a quarantine table while maintaining strict schema integrity.",
"difficulty": "extreme",
"max_steps": 15,
},
}