Spaces:

Angelgupta
/

mlops-openenv

Sleeping

App Files Files Community

mlops-openenv / artifact_generator.py

trretretret

Deploy ML pipeline debugging environment to HF Spaces

7e782aa 12 days ago

raw

history blame contribute delete

47.6 kB

	"""
	Artifact Generator for MLOps Pipeline Debugger

	Generates a full set of realistic ML training artifacts for a given bug scenario.
	Each artifact is internally consistent — config matches logs, dataset stats match
	preprocessing code — except for the one planted fault.

	Bug types supported:
	Task 1 (easy):
	- exploding_lr : learning_rate too large → loss diverges to NaN
	- wrong_optimizer : SGD with momentum=0.99 on non-convex problem
	- batch_size_overflow: batch_size > dataset size → trivial overfitting signal

	Task 2 (medium):
	- data_leakage_scaler : StandardScaler fit on full dataset before split
	- data_leakage_overlap : train/val split with random_state=None → overlap
	- wrong_split_ratio : test_size=0.8 instead of 0.2 → trains on 20%, evaluates on 80%

	Task 3 (hard):
	- label_encoder_mismatch : train/eval use different LabelEncoder.fit() orderings
	- silent_metric_swap : val and test metric names swapped in eval code
	- tokenizer_version_drift: training uses tokenizer v2, eval still uses v1 (different vocab)
	"""

	from __future__ import annotations

	import json
	import random
	import textwrap
	from dataclasses import dataclass, field
	from typing import Dict, Tuple

	import numpy as np


	# ─── Bug Specifications ───────────────────────────────────────────────────────

	@dataclass
	class BugSpec:
	    """Ground-truth description of one planted fault."""

	    # Identifier of the fault.
	    bug_type: str
	    # Maps to failure_category in Action.
	    category: str
	    # root_cause_file
	    file: str
	    # root_cause_field
	    field: str
	    # Free-text description of the reference fix.
	    gold_fix: str
	    # easy / medium / hard
	    task_difficulty: str


	# Every supported fault, keyed by its own ``bug_type`` and listed in order of
	# difficulty (easy, then medium, then hard).
	BUG_CATALOGUE: Dict[str, BugSpec] = {
	    spec.bug_type: spec
	    for spec in (
	        # ── EASY ──────────────────────────────────────────────────────────
	        BugSpec(
	            bug_type="exploding_lr",
	            category="config_error",
	            file="config.yaml",
	            field="optimizer.learning_rate",
	            gold_fix="Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
	            task_difficulty="easy",
	        ),
	        BugSpec(
	            bug_type="wrong_optimizer",
	            category="config_error",
	            file="config.yaml",
	            field="optimizer.momentum",
	            gold_fix="Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
	            task_difficulty="easy",
	        ),
	        BugSpec(
	            bug_type="batch_size_overflow",
	            category="config_error",
	            file="config.yaml",
	            field="training.batch_size",
	            gold_fix="Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
	            task_difficulty="easy",
	        ),
	        # ── MEDIUM ────────────────────────────────────────────────────────
	        BugSpec(
	            bug_type="data_leakage_scaler",
	            category="data_leakage",
	            file="preprocessing.py",
	            field="StandardScaler.fit_transform",
	            gold_fix="Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
	            task_difficulty="medium",
	        ),
	        BugSpec(
	            bug_type="data_leakage_overlap",
	            category="data_leakage",
	            file="preprocessing.py",
	            field="train_test_split.random_state",
	            gold_fix="Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
	            task_difficulty="medium",
	        ),
	        BugSpec(
	            bug_type="wrong_split_ratio",
	            category="preprocessing_bug",
	            file="preprocessing.py",
	            field="train_test_split.test_size",
	            gold_fix="Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
	            task_difficulty="medium",
	        ),
	        # ── HARD ──────────────────────────────────────────────────────────
	        BugSpec(
	            bug_type="label_encoder_mismatch",
	            category="label_mismatch",
	            file="preprocessing.py",
	            field="LabelEncoder.fit_order",
	            gold_fix="Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
	            task_difficulty="hard",
	        ),
	        BugSpec(
	            bug_type="silent_metric_swap",
	            category="evaluation_bug",
	            file="eval_results.json",
	            field="metrics.val_accuracy",
	            gold_fix="Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
	            task_difficulty="hard",
	        ),
	        BugSpec(
	            bug_type="tokenizer_version_drift",
	            category="evaluation_bug",
	            file="preprocessing.py",
	            field="tokenizer.version",
	            gold_fix="Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
	            task_difficulty="hard",
	        ),
	    )
	}

	# Bug types available at each task difficulty.
	TASK_BUG_POOLS = dict(
	    easy=[
	        "exploding_lr",
	        "wrong_optimizer",
	        "batch_size_overflow",
	    ],
	    medium=[
	        "data_leakage_scaler",
	        "data_leakage_overlap",
	        "wrong_split_ratio",
	    ],
	    hard=[
	        "label_encoder_mismatch",
	        "silent_metric_swap",
	        "tokenizer_version_drift",
	    ],
	)


	# ─── Model / Dataset Configs (variety pool) ───────────────────────────────────

	# Pool of model/dataset pairings a generated run can be built around.
	MODEL_CONFIGS = [
	    {
	        "name": "ResNet-50",
	        "type": "image_classification",
	        "params": "25.6M",
	        "dataset": "ImageNet-subset-10k",
	        "num_classes": 10,
	        "input": "224x224 RGB",
	    },
	    {
	        "name": "BERT-base-uncased",
	        "type": "text_classification",
	        "params": "110M",
	        "dataset": "SST-2",
	        "num_classes": 2,
	        "input": "tokenized sequences, max_len=128",
	    },
	    {
	        "name": "EfficientNet-B3",
	        "type": "image_classification",
	        "params": "12.2M",
	        "dataset": "CIFAR-100",
	        "num_classes": 100,
	        "input": "300x300 RGB",
	    },
	    {
	        "name": "DistilBERT",
	        "type": "sentiment_analysis",
	        "params": "66M",
	        "dataset": "IMDB-reviews",
	        "num_classes": 3,
	        "input": "tokenized sequences, max_len=256",
	    },
	    {
	        "name": "MobileNetV3-Large",
	        "type": "image_classification",
	        "params": "5.4M",
	        "dataset": "Oxford-102-Flowers",
	        "num_classes": 102,
	        "input": "224x224 RGB",
	    },
	]

	# Optimizer names a run may be configured with.
	OPTIMIZERS = [
	    "AdamW",
	    "SGD",
	    "RMSprop",
	    "Adam",
	]
	# Learning-rate scheduler names a run may be configured with.
	SCHEDULERS = [
	    "cosine_annealing",
	    "step_lr",
	    "reduce_on_plateau",
	    "linear_warmup",
	]


	# ─── Artifact Generators ──────────────────────────────────────────────────────

	class ArtifactGenerator:
	    """
	    Generates all 6 training artifacts for a given (bug_type, seed) pair.

	    All artifacts are internally consistent except for the planted fault.
	    Every random choice is drawn from ``self.rng`` / ``self.np_rng``, so the
	    output for a given (bug_type, seed) is reproducible as long as the
	    generator methods run in the order used by ``generate_all``.
	    """

	    # Loss value above which the exploding-LR run is reported as NaN.
	    # Starting from 2.302 and growing by 4.5-9x per epoch, this is always
	    # crossed within the (at most) 5 epochs that scenario simulates.
	    _DIVERGENCE_THRESHOLD = 1e3

	    def __init__(self, bug_type: str, seed: int):
	        """
	        Pick the model, optimizer, scheduler and clean hyper-parameters.

	        Args:
	            bug_type: Key into ``BUG_CATALOGUE``.
	            seed: Seed for both random generators; also written into artifacts.

	        Raises:
	            KeyError: If ``bug_type`` is not in ``BUG_CATALOGUE``.
	        """
	        self.bug = BUG_CATALOGUE[bug_type]
	        self.seed = seed
	        self.rng = random.Random(seed)
	        self.np_rng = np.random.RandomState(seed)

	        # Pick a model config deterministically
	        self.model_cfg = self.rng.choice(MODEL_CONFIGS)
	        self.optimizer_name = self.rng.choice(OPTIMIZERS)
	        self.scheduler_name = self.rng.choice(SCHEDULERS)
	        self.run_id = f"run_{seed:04d}_{bug_type[:6]}"

	        # Normal hyperparams
	        self.lr = self.rng.choice([1e-5, 3e-5, 1e-4, 3e-4])
	        self.batch_size = self.rng.choice([16, 32, 64])
	        self.epochs = self.rng.randint(8, 20)
	        self.weight_decay = self.rng.choice([0.01, 0.001, 1e-4])
	        self.momentum = 0.9
	        self.train_samples = self.rng.randint(8000, 15000)
	        self.val_samples = int(self.train_samples * 0.2)
	        self.test_samples = int(self.train_samples * 0.15)

	        # Values actually written to config.yaml. _gen_config() overwrites
	        # them when the planted fault is a config error, so that train.log
	        # reports the same numbers as the config.
	        self._effective_lr = self.lr
	        self._effective_batch_size = self.batch_size

	    def generate_all(self) -> Dict[str, str]:
	        """
	        Return every artifact as a ``{filename: content}`` mapping.

	        The generators share one random stream, so the call order below is
	        part of the reproducibility contract.
	        """
	        return {
	            "config.yaml": self._gen_config(),
	            "train.log": self._gen_train_log(),
	            "dataset_stats.json": self._gen_dataset_stats(),
	            "preprocessing.py": self._gen_preprocessing(),
	            "eval_results.json": self._gen_eval_results(),
	            "model_card.json": self._gen_model_card(),
	        }

	    # ── config.yaml ──────────────────────────────────────────────────────────

	    def _gen_config(self) -> str:
	        """
	        Render config.yaml, planting the fault for the config-error bugs.

	        Side effects: may set ``self.optimizer_name`` to "SGD" and records the
	        learning rate / batch size written to the file so the training log
	        can report the same values.
	        """
	        lr = self.lr
	        batch_size = self.batch_size
	        momentum = self.momentum

	        if self.bug.bug_type == "exploding_lr":
	            lr = self.rng.choice([50.0, 10.0, 25.0])
	        elif self.bug.bug_type == "wrong_optimizer":
	            momentum = 0.99
	            self.optimizer_name = "SGD"
	        elif self.bug.bug_type == "batch_size_overflow":
	            batch_size = self.rng.choice([2048, 4096, 8192])

	        # Remember what the config says so train.log agrees with it.
	        self._effective_lr = lr
	        self._effective_batch_size = batch_size

	        return textwrap.dedent(f"""\
	            # Training Configuration
	            # Run ID: {self.run_id}
	            # Generated: 2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(0,23):02d}:{self.rng.randint(0,59):02d}:00Z

	            model:
	              architecture: {self.model_cfg['name']}
	              num_classes: {self.model_cfg['num_classes']}
	              pretrained: true
	              pretrained_source: "timm/torchvision"
	              dropout: {self.rng.choice([0.1, 0.2, 0.3])}
	              freeze_backbone_epochs: {self.rng.randint(0, 3)}

	            training:
	              epochs: {self.epochs}
	              batch_size: {batch_size}
	              num_workers: {self.rng.choice([4, 8])}
	              pin_memory: true
	              mixed_precision: {str(self.rng.choice([True, False])).lower()}
	              gradient_clip_norm: {self.rng.choice([1.0, 5.0, "null"])}
	              early_stopping_patience: {self.rng.randint(3, 7)}
	              seed: {self.seed}

	            optimizer:
	              name: {self.optimizer_name}
	              learning_rate: {lr}
	              weight_decay: {self.weight_decay}
	              momentum: {momentum}
	              betas: [0.9, 0.999]

	            scheduler:
	              name: {self.scheduler_name}
	              warmup_epochs: {self.rng.randint(0, 3)}
	              min_lr: 1.0e-7
	              t_max: {self.epochs}

	            data:
	              dataset: {self.model_cfg['dataset']}
	              input_size: "{self.model_cfg['input']}"
	              train_split: 0.8
	              val_split: 0.1
	              test_split: 0.1
	              augmentation:
	                random_crop: true
	                horizontal_flip: {str(self.rng.choice([True, False])).lower()}
	                color_jitter: {self.rng.choice([0.2, 0.4])}
	                normalize_mean: [0.485, 0.456, 0.406]
	                normalize_std: [0.229, 0.224, 0.225]

	            logging:
	              log_interval: 50
	              save_best_only: true
	              checkpoint_dir: "./checkpoints/{self.run_id}"
	              wandb_project: "mlops-debug-bench"
	            """)

	    # ── train.log ────────────────────────────────────────────────────────────

	    def _gen_train_log(self) -> str:
	        """
	        Render train.log with a loss/accuracy trajectory typical of the bug.

	        The learning rate and batch size shown are the ones written to
	        config.yaml (falling back to the clean values if the config has not
	        been generated yet).
	        """
	        lr = self._effective_lr
	        batch_size = self._effective_batch_size
	        separator = "[INFO ] " + "─" * 30

	        lines = []
	        lines.append(f"[INFO 2024-03-{self.rng.randint(1,28):02d} {self.rng.randint(6,10):02d}:00:00] Starting training run: {self.run_id}")
	        lines.append(f"[INFO ] Model: {self.model_cfg['name']} | Params: {self.model_cfg['params']}")
	        lines.append(f"[INFO ] Dataset: {self.model_cfg['dataset']} | Train: {self.train_samples:,} | Val: {self.val_samples:,}")
	        lines.append("[INFO ] Device: cuda:0 | Mixed precision: fp16")
	        lines.append(f"[INFO ] Optimizer: {self.optimizer_name} | LR: {lr} | Batch: {batch_size}")
	        lines.append(separator)

	        bug = self.bug.bug_type

	        if bug == "exploding_lr":
	            # Loss explodes rapidly
	            loss = 2.302
	            for ep in range(1, min(self.epochs + 1, 6)):
	                acc = max(0.0, 0.12 - ep * 0.02)
	                val_loss = loss * self.rng.uniform(1.1, 1.5)
	                val_acc = max(0.0, acc - 0.05)
	                lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
	                             f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
	                             f"lr={lr:.2e} grad_norm={loss * 18.3:.2f} "
	                             f"time={self.rng.randint(45,90)}s")
	                if ep == 1:
	                    lines.append(f"[WARN ] Gradient norm unusually high: {loss * 18.3:.2f} (threshold: 10.0)")
	                last_logged_loss = loss
	                loss = loss * self.rng.uniform(4.5, 9.0)
	                if loss > self._DIVERGENCE_THRESHOLD:
	                    lines.append(f"[EPOCH {ep+1:03d}] train_loss=nan train_acc=0.1000 val_loss=nan val_acc=0.1000 "
	                                 f"lr={lr:.2e} grad_norm=nan time={self.rng.randint(45,90)}s")
	                    lines.append(f"[ERROR ] Loss is NaN at epoch {ep+1}, step {self.rng.randint(100,300)}. Training halted.")
	                    lines.append(f"[ERROR ] Last finite loss: {last_logged_loss:.2f}. Gradient explosion detected.")
	                    break

	        elif bug == "wrong_optimizer":
	            # Loss oscillates wildly, never converges
	            loss = 2.302
	            for ep in range(1, self.epochs + 1):
	                delta = self.rng.uniform(-0.8, 1.2)
	                loss = max(1.8, loss + delta)
	                acc = self.rng.uniform(0.10, 0.25)
	                val_loss = loss + self.rng.uniform(-0.3, 0.8)
	                val_acc = self.rng.uniform(0.09, 0.22)
	                lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
	                             f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
	                             f"lr={lr:.2e} grad_norm={self.rng.uniform(8.2, 45.1):.2f} "
	                             f"time={self.rng.randint(45,90)}s")
	                if ep % 3 == 0:
	                    lines.append(f"[WARN ] Loss oscillation detected over last 3 epochs: {loss+0.4:.3f} → {loss-0.5:.3f} → {loss:.3f}")

	        elif bug == "batch_size_overflow":
	            # Val accuracy hits 100% immediately — model memorizes tiny effective dataset
	            for ep in range(1, self.epochs + 1):
	                train_loss = max(0.001, 2.302 * (0.05 ** ep))
	                train_acc = min(1.0, 0.3 + ep * 0.09)
	                val_acc = 0.999 if ep >= 2 else 0.85
	                val_loss = 0.001 if ep >= 2 else 0.45
	                lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
	                             f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
	                             f"lr={lr:.2e} grad_norm={self.rng.uniform(0.1,0.9):.3f} "
	                             f"time={self.rng.randint(3,8)}s")
	            # Steps are derived from the configured batch size, not a fixed 4096.
	            lines.append(f"[WARN ] Effective steps per epoch: {max(1, self.train_samples // batch_size)}. Dataset may be smaller than batch size.")

	        elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
	            # Val accuracy suspiciously high from epoch 1
	            for ep in range(1, self.epochs + 1):
	                train_loss = max(0.01, 0.45 - ep * 0.02)
	                train_acc = min(0.98, 0.72 + ep * 0.015)
	                val_acc = min(0.999, 0.984 + self.rng.uniform(-0.002, 0.002))
	                val_loss = max(0.001, 0.04 - ep * 0.001)
	                lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
	                             f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
	                             f"lr={lr:.2e} grad_norm={self.rng.uniform(0.1,1.2):.3f} "
	                             f"time={self.rng.randint(45,90)}s")
	            lines.append("[INFO ] Best model saved at epoch 2: val_acc=0.9841")
	            lines.append("[WARN ] Val accuracy reached 98.4% at epoch 1 — verify no data leakage.")

	        elif bug in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
	            # Training looks completely normal — the bug is silent
	            best_val = 0.0
	            for ep in range(1, self.epochs + 1):
	                train_loss = max(0.08, 1.8 * (0.72 ** ep) + self.rng.uniform(-0.02, 0.02))
	                train_acc = min(0.96, 0.42 + ep * 0.032 + self.rng.uniform(-0.01, 0.01))
	                val_loss = train_loss * self.rng.uniform(1.05, 1.15)
	                val_acc = train_acc - self.rng.uniform(0.02, 0.06)
	                best_val = max(best_val, val_acc)
	                lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
	                             f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
	                             f"lr={lr:.2e} grad_norm={self.rng.uniform(0.3, 2.1):.3f} "
	                             f"time={self.rng.randint(60,120)}s")
	            lines.append(f"[INFO ] Training complete. Best val_acc={best_val:.4f} at epoch {self.rng.randint(self.epochs-3, self.epochs)}")
	            lines.append(f"[INFO ] Checkpoint saved: ./checkpoints/{self.run_id}/best_model.pt")

	        lines.append(separator)
	        lines.append(f"[INFO ] Run {self.run_id} finished.")
	        return "\n".join(lines)

	    # ── dataset_stats.json ───────────────────────────────────────────────────

	    def _gen_dataset_stats(self) -> str:
	        """
	        Render dataset_stats.json (split sizes, class counts, feature stats).

	        ``data_leakage_overlap`` reports a non-zero train/val overlap and a
	        null seed; ``wrong_split_ratio`` reports train and test sizes swapped.
	        """
	        n_classes = self.model_cfg["num_classes"]
	        train_n = self.train_samples
	        val_n = self.val_samples
	        test_n = self.test_samples

	        overlap_count = 0
	        if self.bug.bug_type == "data_leakage_overlap":
	            overlap_count = self.rng.randint(int(val_n * 0.15), int(val_n * 0.30))
	        elif self.bug.bug_type == "wrong_split_ratio":
	            # Train and test flipped
	            train_n, test_n = test_n, train_n

	        # Class distribution (roughly uniform with jitter)
	        def class_dist(total, n_cls):
	            base = total // n_cls
	            jitter = int(base * 0.15)  # +/-15% around the uniform share
	            counts = {str(i): base + self.rng.randint(-jitter, jitter)
	                      for i in range(min(n_cls, 10))}
	            if n_cls > 10:
	                # Only the first 10 classes are listed explicitly.
	                counts["..."] = f"{n_cls - 10} more classes"
	            return counts

	        stats = {
	            "dataset": self.model_cfg["dataset"],
	            "num_classes": n_classes,
	            "splits": {
	                "train": {
	                    "n_samples": train_n,
	                    "class_distribution": class_dist(train_n, n_classes),
	                },
	                "val": {
	                    "n_samples": val_n,
	                    "class_distribution": class_dist(val_n, n_classes),
	                    "overlap_with_train": overlap_count,
	                },
	                "test": {
	                    "n_samples": test_n,
	                    "class_distribution": class_dist(test_n, n_classes),
	                },
	            },
	            "feature_statistics": {
	                "mean": round(self.np_rng.uniform(0.45, 0.55), 4),
	                "std": round(self.np_rng.uniform(0.22, 0.28), 4),
	                "min": 0.0,
	                "max": 1.0,
	                "null_count": 0,
	            },
	            "preprocessing_applied": [
	                "resize",
	                "normalize",
	                "label_encode",
	                "train_val_test_split",
	            ],
	            "random_seed_used": self.seed if self.bug.bug_type != "data_leakage_overlap" else None,
	        }
	        return json.dumps(stats, indent=2)

	    # ── preprocessing.py ─────────────────────────────────────────────────────

	    def _gen_preprocessing(self) -> str:
	        """
	        Render preprocessing.py as Python source text.

	        Medium and hard bugs get a script containing the planted fault; every
	        other bug type gets the clean reference pipeline.
	        """
	        bug = self.bug.bug_type

	        if bug == "data_leakage_scaler":
	            return textwrap.dedent(f"""\
	                \"\"\"
	                Data preprocessing pipeline for {self.model_cfg['dataset']}
	                Run ID: {self.run_id}
	                \"\"\"
	                import numpy as np
	                import pandas as pd
	                from sklearn.preprocessing import StandardScaler, LabelEncoder
	                from sklearn.model_selection import train_test_split


	                def load_raw_data(data_dir: str):
	                    \"\"\"Load features and labels from disk.\"\"\"
	                    X = np.load(f"{{data_dir}}/features.npy")
	                    y = np.load(f"{{data_dir}}/labels.npy")
	                    return X, y


	                def preprocess(data_dir: str, seed: int = {self.seed}):
	                    X, y = load_raw_data(data_dir)

	                    # Encode labels
	                    le = LabelEncoder()
	                    y_encoded = le.fit_transform(y)

	                    # ── BUG: Scaler fit on full dataset BEFORE split ──────────
	                    scaler = StandardScaler()
	                    X_normalized = scaler.fit_transform(X) # sees val/test data during fit!
	                    # ─────────────────────────────────────────────────────────

	                    X_train, X_temp, y_train, y_temp = train_test_split(
	                        X_normalized, y_encoded, test_size=0.2, random_state=seed
	                    )
	                    X_val, X_test, y_val, y_test = train_test_split(
	                        X_temp, y_temp, test_size=0.5, random_state=seed
	                    )

	                    return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le


	                def get_transforms(split: str):
	                    \"\"\"Get augmentation transforms for a given split.\"\"\"
	                    if split == "train":
	                        return [
	                            ("random_horizontal_flip", {{"p": 0.5}}),
	                            ("random_crop", {{"size": 224, "padding": 4}}),
	                            ("color_jitter", {{"brightness": 0.2, "contrast": 0.2}}),
	                            ("normalize", {{"mean": [0.485, 0.456, 0.406],
	                                           "std": [0.229, 0.224, 0.225]}}),
	                        ]
	                    return [
	                        ("center_crop", {{"size": 224}}),
	                        ("normalize", {{"mean": [0.485, 0.456, 0.406],
	                                       "std": [0.229, 0.224, 0.225]}}),
	                    ]
	                """)

	        if bug == "data_leakage_overlap":
	            return textwrap.dedent(f"""\
	                \"\"\"
	                Data preprocessing pipeline for {self.model_cfg['dataset']}
	                Run ID: {self.run_id}
	                \"\"\"
	                import numpy as np
	                from sklearn.preprocessing import StandardScaler, LabelEncoder
	                from sklearn.model_selection import train_test_split


	                def load_raw_data(data_dir: str):
	                    X = np.load(f"{{data_dir}}/features.npy")
	                    y = np.load(f"{{data_dir}}/labels.npy")
	                    return X, y


	                def preprocess(data_dir: str):
	                    X, y = load_raw_data(data_dir)

	                    le = LabelEncoder()
	                    y_encoded = le.fit_transform(y)

	                    # First split: train vs temp
	                    # ── BUG: random_state=None → non-reproducible, overlapping splits ──
	                    X_train, X_temp, y_train, y_temp = train_test_split(
	                        X, y_encoded, test_size=0.2, random_state=None # ← should be fixed seed
	                    )
	                    # Second split: val vs test (ALSO non-deterministic)
	                    X_val, X_test, y_val, y_test = train_test_split(
	                        X_temp, y_temp, test_size=0.5, random_state=None # ← should be fixed seed
	                    )
	                    # ─────────────────────────────────────────────────────────

	                    scaler = StandardScaler()
	                    X_train = scaler.fit_transform(X_train)
	                    X_val = scaler.transform(X_val)
	                    X_test = scaler.transform(X_test)

	                    return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
	                """)

	        if bug == "wrong_split_ratio":
	            return textwrap.dedent(f"""\
	                \"\"\"
	                Data preprocessing pipeline for {self.model_cfg['dataset']}
	                Run ID: {self.run_id}
	                \"\"\"
	                import numpy as np
	                from sklearn.preprocessing import StandardScaler, LabelEncoder
	                from sklearn.model_selection import train_test_split


	                def preprocess(data_dir: str, seed: int = {self.seed}):
	                    X = np.load(f"{{data_dir}}/features.npy")
	                    y = np.load(f"{{data_dir}}/labels.npy")

	                    le = LabelEncoder()
	                    y_encoded = le.fit_transform(y)

	                    # ── BUG: test_size=0.8 — trains on 20%, evaluates on 80% ──
	                    X_train, X_test, y_train, y_test = train_test_split(
	                        X, y_encoded, test_size=0.8, random_state=seed # ← should be 0.2
	                    )
	                    X_val, X_test, y_val, y_test = train_test_split(
	                        X_test, y_test, test_size=0.5, random_state=seed
	                    )
	                    # ──────────────────────────────────────────────────────────

	                    scaler = StandardScaler()
	                    X_train = scaler.fit_transform(X_train)
	                    X_val = scaler.transform(X_val)
	                    X_test = scaler.transform(X_test)

	                    return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
	                """)

	        if bug == "label_encoder_mismatch":
	            if self.model_cfg["num_classes"] <= 10:
	                classes = ["cat", "dog", "bird"]
	            else:
	                classes = [f"class_{i}" for i in range(min(self.model_cfg["num_classes"], 5))]
	            classes_shuffled = classes.copy()
	            self.rng.shuffle(classes_shuffled)
	            if classes_shuffled == classes:
	                # A shuffle can return the original order, which would erase
	                # the planted fault; rotate by one so the two lists differ.
	                classes_shuffled = classes[1:] + classes[:1]
	            return textwrap.dedent(f"""\
	                \"\"\"
	                Data preprocessing pipeline for {self.model_cfg['dataset']}
	                Run ID: {self.run_id}

	                WARNING: Training and evaluation pipelines are defined separately.
	                Ensure they use identical label encoding.
	                \"\"\"
	                import numpy as np
	                from sklearn.preprocessing import LabelEncoder
	                from sklearn.model_selection import train_test_split


	                # ── Training pipeline ─────────────────────────────────────────
	                def build_train_pipeline(data_dir: str, seed: int = {self.seed}):
	                    X = np.load(f"{{data_dir}}/train_features.npy")
	                    y_raw = np.load(f"{{data_dir}}/train_labels.npy", allow_pickle=True)

	                    # LabelEncoder fitted on training class order
	                    le_train = LabelEncoder()
	                    le_train.fit({classes}) # alphabetical order: {sorted(classes)}
	                    y = le_train.transform(y_raw)

	                    X_train, X_val, y_train, y_val = train_test_split(
	                        X, y, test_size=0.2, random_state=seed
	                    )
	                    return (X_train, y_train), (X_val, y_val), le_train


	                # ── Evaluation pipeline ───────────────────────────────────────
	                def build_eval_pipeline(data_dir: str):
	                    X_test = np.load(f"{{data_dir}}/test_features.npy")
	                    y_raw = np.load(f"{{data_dir}}/test_labels.npy", allow_pickle=True)

	                    # ── BUG: Different LabelEncoder instance with DIFFERENT fit order ──
	                    le_eval = LabelEncoder()
	                    le_eval.fit({classes_shuffled}) # ← shuffled order: {classes_shuffled}
	                    y_test = le_eval.transform(y_raw)
	                    # ─────────────────────────────────────────────────────────

	                    return X_test, y_test, le_eval
	                """)

	        if bug == "silent_metric_swap":
	            # The two values are not used by the template. The draws are kept
	            # so the shared random stream (and therefore eval_results.json)
	            # is unchanged.
	            self.rng.uniform(0.84, 0.91)
	            self.rng.uniform(0.31, 0.39)
	            return textwrap.dedent(f"""\
	                \"\"\"
	                Evaluation script for {self.model_cfg['dataset']}
	                Run ID: {self.run_id}
	                \"\"\"
	                import torch
	                import json


	                def evaluate(model, val_loader, test_loader, device="cuda"):
	                    model.eval()
	                    results = {{}}

	                    with torch.no_grad():
	                        # Evaluate on validation set
	                        val_correct, val_total = 0, 0
	                        for X, y in val_loader:
	                            preds = model(X.to(device)).argmax(dim=1)
	                            val_correct += (preds == y.to(device)).sum().item()
	                            val_total += y.size(0)
	                        val_acc = val_correct / val_total

	                        # Evaluate on test set
	                        test_correct, test_total = 0, 0
	                        for X, y in test_loader:
	                            preds = model(X.to(device)).argmax(dim=1)
	                            test_correct += (preds == y.to(device)).sum().item()
	                            test_total += y.size(0)
	                        test_acc = test_correct / test_total

	                    # ── BUG: val and test accuracy assignments are swapped ──
	                    results["val_accuracy"] = test_acc # ← should be val_acc
	                    results["test_accuracy"] = val_acc # ← should be test_acc
	                    # ──────────────────────────────────────────────────────

	                    results["val_loss"] = round(1 - val_acc + 0.12, 4)
	                    results["test_loss"] = round(1 - test_acc + 0.09, 4)
	                    return results
	                """)

	        if bug == "tokenizer_version_drift":
	            return textwrap.dedent(f"""\
	                \"\"\"
	                Text preprocessing pipeline for {self.model_cfg['dataset']}
	                Run ID: {self.run_id}
	                \"\"\"
	                from transformers import AutoTokenizer


	                TOKENIZER_V1 = "bert-base-uncased" # vocab size: 30,522
	                TOKENIZER_V2 = "bert-base-uncased-v2-fixed" # vocab size: 30,522 + 847 domain tokens


	                # ── Training pipeline ─────────────────────────────────────────
	                def get_train_tokenizer():
	                    \"\"\"Tokenizer used during training.\"\"\"
	                    # Updated to v2 for domain-specific vocabulary
	                    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V2)
	                    return tokenizer


	                # ── Evaluation pipeline ───────────────────────────────────────
	                def get_eval_tokenizer():
	                    \"\"\"Tokenizer used during evaluation and inference.\"\"\"
	                    # ── BUG: Still using v1 — 847 tokens map to [UNK] during eval ──
	                    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V1) # ← should be TOKENIZER_V2
	                    return tokenizer
	                    # ─────────────────────────────────────────────────────────


	                def tokenize_batch(texts, tokenizer, max_length: int = 128):
	                    return tokenizer(
	                        texts,
	                        padding="max_length",
	                        truncation=True,
	                        max_length=max_length,
	                        return_tensors="pt",
	                    )
	                """)

	        # Default normal preprocessing (for config-error bugs, preprocessing is clean)
	        return textwrap.dedent(f"""\
	            \"\"\"
	            Data preprocessing pipeline for {self.model_cfg['dataset']}
	            Run ID: {self.run_id}
	            \"\"\"
	            import numpy as np
	            from sklearn.preprocessing import StandardScaler, LabelEncoder
	            from sklearn.model_selection import train_test_split


	            def preprocess(data_dir: str, seed: int = {self.seed}):
	                X = np.load(f"{{data_dir}}/features.npy")
	                y = np.load(f"{{data_dir}}/labels.npy")

	                le = LabelEncoder()
	                y_encoded = le.fit_transform(y)

	                X_train, X_temp, y_train, y_temp = train_test_split(
	                    X, y_encoded, test_size=0.2, random_state=seed
	                )
	                X_val, X_test, y_val, y_test = train_test_split(
	                    X_temp, y_temp, test_size=0.5, random_state=seed
	                )

	                # Correct: fit only on training data
	                scaler = StandardScaler()
	                X_train = scaler.fit_transform(X_train)
	                X_val = scaler.transform(X_val)
	                X_test = scaler.transform(X_test)

	                return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
	            """)

	    # ── eval_results.json ────────────────────────────────────────────────────

	    def _gen_eval_results(self) -> str:
	        """
	        Render eval_results.json with val/test metrics typical of the bug.

	        For ``silent_metric_swap`` the val and test accuracies are written
	        under each other's key on purpose.
	        """
	        bug = self.bug.bug_type

	        if bug in ("exploding_lr", "wrong_optimizer"):
	            val_acc = round(self.rng.uniform(0.09, 0.13), 4)
	            test_acc = round(self.rng.uniform(0.09, 0.13), 4)
	            val_loss = 999999.9 if bug == "exploding_lr" else round(self.rng.uniform(2.1, 2.4), 4)
	            test_loss = val_loss
	        elif bug == "batch_size_overflow":
	            val_acc = 0.9990
	            test_acc = round(self.rng.uniform(0.11, 0.15), 4)  # massive train/test gap
	            val_loss, test_loss = 0.0003, round(self.rng.uniform(1.8, 2.3), 4)
	        elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
	            val_acc = round(self.rng.uniform(0.982, 0.998), 4)
	            test_acc = round(self.rng.uniform(0.61, 0.73), 4)  # test is much worse (no leakage)
	            val_loss = round(self.rng.uniform(0.004, 0.015), 4)
	            test_loss = round(self.rng.uniform(0.42, 0.68), 4)
	        elif bug == "label_encoder_mismatch":
	            val_acc = round(self.rng.uniform(0.84, 0.91), 4)
	            test_acc = round(self.rng.uniform(0.30, 0.38), 4)  # near random for 3-class
	            val_loss = round(1 - val_acc + self.rng.uniform(0.05, 0.15), 4)
	            test_loss = round(1 - test_acc + self.rng.uniform(0.05, 0.15), 4)
	        elif bug == "silent_metric_swap":
	            real_val = round(self.rng.uniform(0.84, 0.91), 4)
	            real_test = round(self.rng.uniform(0.31, 0.39), 4)
	            # Swapped in output
	            val_acc = real_test
	            test_acc = real_val
	            val_loss = round(1 - real_test + 0.09, 4)
	            test_loss = round(1 - real_val + 0.12, 4)
	        elif bug == "tokenizer_version_drift":
	            val_acc = round(self.rng.uniform(0.83, 0.88), 4)
	            test_acc = round(self.rng.uniform(0.28, 0.36), 4)
	            val_loss = round(1 - val_acc + self.rng.uniform(0.05, 0.12), 4)
	            test_loss = round(1 - test_acc + self.rng.uniform(0.05, 0.12), 4)
	        else:
	            val_acc = round(self.rng.uniform(0.78, 0.91), 4)
	            test_acc = round(val_acc - self.rng.uniform(0.02, 0.05), 4)
	            val_loss = round(1 - val_acc + 0.1, 4)
	            test_loss = round(1 - test_acc + 0.1, 4)

	        # Dict entries are evaluated top to bottom; the order of the random
	        # draws below (final_epoch, then timestamp) is therefore fixed.
	        result = {
	            "run_id": self.run_id,
	            "final_epoch": self.epochs if bug not in ("exploding_lr",) else self.rng.randint(2, 5),
	            "metrics": {
	                "val_loss": val_loss,
	                "val_accuracy": val_acc,
	                "test_loss": test_loss,
	                "test_accuracy": test_acc,
	            },
	            "best_checkpoint": f"./checkpoints/{self.run_id}/best_model.pt",
	            "evaluation_timestamp": f"2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(10,22):02d}:{self.rng.randint(0,59):02d}:00Z",
	            "hardware": {"gpu": "A100-40GB", "cuda": "12.1"},
	        }
	        return json.dumps(result, indent=2)

	    # ── model_card.json ──────────────────────────────────────────────────────

	    def _gen_model_card(self) -> str:
	        """
	        Render model_card.json.

	        The tokenizer entry is "v1" for ``tokenizer_version_drift`` and "v2"
	        otherwise, and "N/A" when the model name does not contain "bert".
	        """
	        bug = self.bug.bug_type
	        tokenizer_ver = "v1" if bug == "tokenizer_version_drift" else "v2"

	        card = {
	            "model_id": f"{self.run_id}",
	            "architecture": self.model_cfg["name"],
	            "task": self.model_cfg["type"],
	            "num_parameters": self.model_cfg["params"],
	            "dataset": self.model_cfg["dataset"],
	            "num_classes": self.model_cfg["num_classes"],
	            "framework": "PyTorch 2.2.0",
	            "training_config": {
	                "optimizer": self.optimizer_name,
	                "scheduler": self.scheduler_name,
	                "epochs": self.epochs,
	            },
	            "preprocessing": {
	                "label_encoder": "sklearn.LabelEncoder",
	                "tokenizer": tokenizer_ver if "bert" in self.model_cfg["name"].lower() else "N/A",
	                "normalizer": "StandardScaler (fit on training split)",
	            },
	            "authors": ["ml-platform-team"],
	            "license": "Apache-2.0",
	        }
	        return json.dumps(card, indent=2)


	# ─── Sanity Check Engine ──────────────────────────────────────────────────────

	def run_sanity_check(check_type: str, bug_type: str, artifacts: Dict[str, str],
	                     rng: random.Random) -> Dict:
	    """
	    Run the diagnostic named ``check_type`` and return its report as a dict.

	    The report depends on ``bug_type``: checks that target the planted fault
	    return a failing/anomalous result, all others return a passing one.
	    Numeric values in the report are drawn from ``rng``.

	    Args:
	        check_type: Name of the diagnostic; unknown names yield an "UNKNOWN" report.
	        bug_type: Key into ``BUG_CATALOGUE``.
	        artifacts: Generated artifacts by filename (not read by any current check).
	        rng: Random source for the numeric values in the report.

	    Raises:
	        KeyError: If ``bug_type`` is not in ``BUG_CATALOGUE``.
	    """
	    # Lookup kept only for validation: an unknown bug_type raises KeyError.
	    _ = BUG_CATALOGUE[bug_type]

	    if check_type == "label_consistency":
	        if bug_type != "label_encoder_mismatch":
	            return {"check": "label_consistency", "result": "PASS",
	                    "details": "Train and eval label mappings are identical. No mismatch detected."}
	        return {
	            "check": "label_consistency",
	            "result": "FAIL",
	            "details": "Training LabelEncoder class order: ['bird', 'cat', 'dog'] (index 0=bird, 1=cat, 2=dog). "
	                       "Evaluation LabelEncoder class order: ['cat', 'dog', 'bird'] (index 0=cat, 1=dog, 2=bird). "
	                       "Mismatch detected — 2 of 3 class indices differ between pipelines.",
	            "affected_classes": 2,
	            "recommendation": "Use a single LabelEncoder instance across both pipelines.",
	        }

	    if check_type == "data_leakage":
	        if bug_type not in ("data_leakage_overlap", "data_leakage_scaler"):
	            return {"check": "data_leakage", "result": "PASS",
	                    "sample_overlap": 0, "scaler_fitted_on_full_dataset": False,
	                    "details": "No data leakage detected between train and val splits."}
	        shared_samples = rng.randint(180, 450) if bug_type == "data_leakage_overlap" else 0
	        scaler_saw_everything = bug_type == "data_leakage_scaler"
	        findings = []
	        if shared_samples > 0:
	            findings.append(f"Found {shared_samples} samples present in both train and val splits. ")
	        if scaler_saw_everything:
	            findings.append("StandardScaler.fit_transform() called on full dataset before split — "
	                            "validation statistics contaminated by training distribution.")
	        return {
	            "check": "data_leakage",
	            "result": "FAIL",
	            "sample_overlap": shared_samples,
	            "scaler_fitted_on_full_dataset": scaler_saw_everything,
	            "details": "".join(findings),
	        }

	    if check_type == "gradient_norms":
	        if bug_type == "exploding_lr":
	            return {
	                "check": "gradient_norms",
	                "result": "ANOMALY",
	                "epoch_1_norm": round(rng.uniform(840.0, 2100.0), 2),
	                "expected_range": "0.1 – 10.0",
	                "details": "Gradient norms exceeded safe threshold by 100–200×. "
	                           "Indicates learning rate is too large — gradients are not being controlled.",
	            }
	        # Mean is drawn before max, matching the key order of the report.
	        typical_norm = round(rng.uniform(0.3, 2.1), 3)
	        peak_norm = round(rng.uniform(2.1, 4.5), 3)
	        return {"check": "gradient_norms", "result": "NORMAL",
	                "mean_norm": typical_norm,
	                "max_norm": peak_norm,
	                "details": "Gradient norms are within expected range throughout training."}

	    if check_type == "metric_gap_analysis":
	        if bug_type not in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
	            return {"check": "metric_gap_analysis", "result": "NORMAL",
	                    "details": "Val/test metric gap is within normal bounds."}
	        val_acc = round(rng.uniform(0.84, 0.91), 4)
	        test_acc = round(rng.uniform(0.28, 0.38), 4)
	        return {
	            "check": "metric_gap_analysis",
	            "result": "ANOMALY",
	            "val_accuracy": val_acc,
	            "test_accuracy": test_acc,
	            "gap": round(val_acc - test_acc, 4),
	            "expected_max_gap": 0.08,
	            "details": f"Val/test accuracy gap is {val_acc - test_acc:.3f} — far exceeds expected max of 0.08. "
	                       f"This magnitude of gap (>{val_acc - test_acc:.0%}) strongly suggests an evaluation pipeline bug "
	                       f"rather than overfitting — the model generalises well to the val set but fails on test data.",
	        }

	    if check_type == "encoder_version_match":
	        if bug_type != "tokenizer_version_drift":
	            return {"check": "encoder_version_match", "result": "PASS",
	                    "details": "Training and evaluation use identical tokenizer versions."}
	        return {
	            "check": "encoder_version_match",
	            "result": "MISMATCH",
	            "training_tokenizer": "bert-base-uncased-v2-fixed",
	            "eval_tokenizer": "bert-base-uncased",
	            "vocab_diff": 847,
	            "details": "Training uses tokenizer v2 (30,522 + 847 domain tokens). "
	                       "Evaluation uses tokenizer v1 (30,522 tokens). "
	                       "847 domain-specific tokens will map to [UNK] during evaluation — "
	                       "causing silent degradation on domain-specific test inputs.",
	        }

	    if check_type == "class_balance":
	        n_classes = 10
	        counts = {}
	        for label in range(n_classes):
	            counts[str(label)] = rng.randint(780, 1020)
	        imbalance_ratio = max(counts.values()) / max(1, min(counts.values()))
	        balanced = imbalance_ratio < 1.5
	        return {
	            "check": "class_balance",
	            "result": "PASS" if balanced else "WARN",
	            "class_counts": counts,
	            "imbalance_ratio": round(imbalance_ratio, 3),
	            "details": f"Max/min class ratio: {imbalance_ratio:.2f}. "
	                       f"{'Within acceptable range.' if balanced else 'Moderate imbalance — consider weighted loss.'}",
	        }

	    if check_type == "loss_trajectory":
	        if bug_type == "exploding_lr":
	            return {
	                "check": "loss_trajectory",
	                "result": "ANOMALY",
	                "pattern": "exponential_divergence",
	                "loss_values": [2.31, 18.42, 847.2, "nan"],
	                "details": "Loss follows exponential growth pattern rather than convergence. "
	                           "This is a strong indicator of learning rate being orders of magnitude too large.",
	            }
	        if bug_type == "wrong_optimizer":
	            return {
	                "check": "loss_trajectory",
	                "result": "ANOMALY",
	                "pattern": "oscillating_no_convergence",
	                "details": "Loss oscillates without converging over all epochs. "
	                           "Characteristic of excessive momentum causing the optimizer to overshoot minima repeatedly.",
	            }
	        return {"check": "loss_trajectory", "result": "NORMAL",
	                "pattern": "smooth_convergence",
	                "details": "Loss follows expected convergence curve."}

	    if check_type == "feature_statistics":
	        if bug_type != "data_leakage_scaler":
	            return {"check": "feature_statistics", "result": "PASS",
	                    "details": "Train and val feature distributions are within expected divergence bounds."}
	        return {
	            "check": "feature_statistics",
	            "result": "WARN",
	            "train_mean": 0.0, "train_std": 1.0,
	            "val_mean": 0.0, "val_std": 1.0,
	            "details": "Train and val feature statistics are identical after normalization — "
	                       "this is expected if scaler was fit on the full dataset (including val). "
	                       "If scaler was fit only on train, a slight distributional shift is normal. "
	                       "Zero shift suggests the scaler saw val data during fitting.",
	        }

	    # No branch matched: report the unrecognised check name back to the caller.
	    return {"check": check_type, "result": "UNKNOWN",
	            "details": f"Unknown sanity check type: {check_type}"}