Upload src

2957021 verified 3 days ago

30.6 kB

	"""
	MINDI 1.5 Vision-Coder — Trainer

	Production-ready 3-phase training loop optimized for AMD MI300X (192GB VRAM).
	Streams training data from disk (4.18GB train.jsonl) to avoid RAM exhaustion.

	Phases:
	Phase 1 (steps 0–5000): LoRA only, LR 2e-4, batch 16
	Phase 2 (steps 5000–7500): Vision bridge only, LR 1e-5, batch 8
	Phase 3 (steps 7500–10000): All trainable, LR 5e-5, batch 12

	MI300X specifics:
	- ROCm presents as CUDA to PyTorch (torch.cuda.* works)
	- bf16 (NOT fp16) for AMD stability
	- torch.compile() optional (works on ROCm)
	- Gradient checkpointing enabled
	- DataLoader: num_workers=4, pin_memory=True, prefetch_factor=2
	"""

	from __future__ import annotations

	import json
	import math
	import sys
	import time
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Any, Iterator, Optional

	# Force line-buffered stdout so logs are visible promptly under `docker exec -d`.
	# sys.stdout can be None (detached launchers) or a replacement stream without
	# TextIOWrapper's API (e.g. io.StringIO), so probe for the API before using it.
	if (
	    hasattr(sys.stdout, "reconfigure")
	    and not getattr(sys.stdout, "line_buffering", True)
	):
	    sys.stdout.reconfigure(line_buffering=True)

	import torch
	import torch.nn as nn
	from PIL import Image
	from torch.optim import AdamW
	from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
	from torch.utils.data import DataLoader, IterableDataset

	# ── Configuration ─────────────────────────────────────────────────────

	# Base directory for the default data/checkpoint/log paths: three directory
	# levels above this file's resolved location.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent


	@dataclass
	class PhaseConfig:
	    """
	    Settings describing one phase of the training schedule.

	    Attributes:
	        name: Label for the phase.
	        start_step: Step at which the phase begins.
	        end_step: Step at which the phase ends.
	        learning_rate: Learning rate for the phase.
	        batch_size: Batch size for the phase.
	        gradient_accumulation_steps: Gradient accumulation factor (default 4).
	        lora: Train the LoRA component.
	        vision_projection: Train the vision projection component.
	        fusion: Train the fusion component.
	        data_type: "text" (code-only), "vision" (image+code) or "mixed" (both).
	    """

	    name: str
	    start_step: int
	    end_step: int
	    learning_rate: float
	    batch_size: int
	    gradient_accumulation_steps: int = 4

	    # Which components are switched on for this phase (all off by default).
	    lora: bool = False
	    vision_projection: bool = False
	    fusion: bool = False

	    # Data selector: "text" = code-only, "vision" = image+code, "mixed" = both.
	    data_type: str = "text"


	@dataclass
	class TrainingConfig:
	    """
	    Top-level settings for a complete multi-phase training run.

	    Path defaults are resolved relative to PROJECT_ROOT when an instance is
	    created; the default phase list is a fresh list per instance.
	    """

	    # ── Data paths ──
	    train_file: Path = field(
	        default_factory=lambda: PROJECT_ROOT / "data" / "processed" / "train.jsonl"
	    )
	    val_file: Path = field(
	        default_factory=lambda: PROJECT_ROOT / "data" / "processed" / "val.jsonl"
	    )
	    vision_train_file: Path = field(
	        default_factory=lambda: PROJECT_ROOT / "data" / "websight" / "train.jsonl"
	    )
	    vision_val_file: Path = field(
	        default_factory=lambda: PROJECT_ROOT / "data" / "websight" / "val.jsonl"
	    )

	    # ── Output locations ──
	    output_dir: Path = field(
	        default_factory=lambda: PROJECT_ROOT / "checkpoints" / "training"
	    )
	    log_dir: Path = field(
	        default_factory=lambda: PROJECT_ROOT / "logs" / "training"
	    )

	    # ── Model ──
	    max_seq_length: int = 8192
	    use_compile: bool = False
	    gradient_checkpointing: bool = True

	    # ── Hardware (MI300X defaults) ──
	    dtype: str = "bf16"
	    num_workers: int = 4
	    pin_memory: bool = True
	    prefetch_factor: int = 2

	    # ── Optimisation ──
	    weight_decay: float = 0.01
	    warmup_ratio: float = 0.03
	    max_grad_norm: float = 1.0
	    seed: int = 42

	    # ── Logging / evaluation / checkpoint cadence (in optimizer steps) ──
	    log_every_n_steps: int = 10
	    eval_every_n_steps: int = 250
	    save_every_n_steps: int = 500

	    # ── Phase schedule ──
	    phases: list[PhaseConfig] = field(
	        default_factory=lambda: [
	            PhaseConfig(
	                name="phase1_lora",
	                start_step=0,
	                end_step=5000,
	                learning_rate=2e-4,
	                batch_size=16,
	                lora=True,
	                vision_projection=False,
	                fusion=False,
	                data_type="text",
	            ),
	            PhaseConfig(
	                name="phase2_vision_bridge",
	                start_step=5000,
	                end_step=7500,
	                learning_rate=1e-5,
	                batch_size=8,
	                lora=False,
	                vision_projection=True,
	                fusion=True,
	                data_type="vision",
	            ),
	            PhaseConfig(
	                name="phase3_all",
	                start_step=7500,
	                end_step=10000,
	                learning_rate=5e-5,
	                batch_size=12,
	                lora=True,
	                vision_projection=True,
	                fusion=True,
	                data_type="mixed",
	            ),
	        ]
	    )

	    @property
	    def total_steps(self) -> int:
	        """Return the last phase's ``end_step``, or 0 when no phases are configured."""
	        if not self.phases:
	            return 0
	        return self.phases[-1].end_step

	    @property
	    def torch_dtype(self) -> torch.dtype:
	        """Map the ``dtype`` string to a torch dtype; unknown names raise KeyError."""
	        dtype_by_name = {
	            "bf16": torch.bfloat16,
	            "fp16": torch.float16,
	            "fp32": torch.float32,
	        }
	        return dtype_by_name[self.dtype]


	# ── Streaming Dataset ─────────────────────────────────────────────────

	class StreamingJSONLDataset(IterableDataset):
	    """
	    Streams JSONL training data from disk line by line.
	    Tokenizes on-the-fly so the file never has to be loaded into RAM.
	    Supports optional image loading for vision-code pairs.

	    When iterated inside DataLoader worker processes, each worker reads only
	    its own share of the records (record index modulo worker count), so a
	    sample is produced once per pass instead of once per worker.

	    Expected JSONL format:
	        {"id": "...", "type": "...", "source": "...",
	         "image_path": "data/websight/images/ws_0000001.jpg",  (optional)
	         "messages": [{"role": "system", "content": "..."},
	                      {"role": "user", "content": "..."},
	                      {"role": "assistant", "content": "..."}],
	         "metadata": {...}}
	    """

	    def __init__(
	        self,
	        file_path: Path,
	        tokenizer: Any,
	        max_length: int = 8192,
	        shuffle_buffer: int = 10000,
	        seed: int = 42,
	        project_root: Optional[Path] = None,
	    ) -> None:
	        """
	        Args:
	            file_path: JSONL file to stream.
	            tokenizer: Callable tokenizer; called with ``max_length``,
	                ``truncation``, ``padding`` and ``return_tensors`` keywords.
	            max_length: Every example is truncated/padded to this length.
	            shuffle_buffer: Number of records shuffled together at a time.
	            seed: Base seed for the shuffle RNG.
	            project_root: Base directory for relative ``image_path`` values
	                (defaults to PROJECT_ROOT).

	        Raises:
	            FileNotFoundError: If ``file_path`` does not exist.
	        """
	        self.file_path = Path(file_path)
	        self.tokenizer = tokenizer
	        self.max_length = max_length
	        self.shuffle_buffer = shuffle_buffer
	        self.seed = seed
	        self.project_root = Path(project_root) if project_root else PROJECT_ROOT

	        if not self.file_path.exists():
	            raise FileNotFoundError(f"Training data not found: {self.file_path}")

	    @staticmethod
	    def _worker_shard() -> tuple[int, int]:
	        """Return ``(worker_id, num_workers)``; ``(0, 1)`` outside a DataLoader worker."""
	        info = torch.utils.data.get_worker_info()
	        if info is None:
	            return 0, 1
	        return info.id, info.num_workers

	    def _format_messages(self, messages: list[dict[str, str]]) -> str:
	        """Format chat messages into a single training string."""
	        # Use the tokenizer's chat template if available
	        if hasattr(self.tokenizer, "apply_chat_template"):
	            return self.tokenizer.apply_chat_template(
	                messages, tokenize=False, add_generation_prompt=False
	            )
	        # Fallback: one "<|role|>" header line followed by the content per message.
	        parts = [
	            f"<|{msg.get('role', 'user')}|>\n{msg.get('content', '')}"
	            for msg in messages
	        ]
	        return "\n".join(parts)

	    def _tokenize(self, text: str) -> Optional[dict[str, torch.Tensor]]:
	        """Tokenize text (padded to ``max_length``) and build training labels."""
	        encoded = self.tokenizer(
	            text,
	            max_length=self.max_length,
	            truncation=True,
	            padding="max_length",
	            return_tensors="pt",
	        )
	        # The tokenizer returns a leading batch dimension of 1; drop it.
	        input_ids = encoded["input_ids"].squeeze(0)
	        attention_mask = encoded["attention_mask"].squeeze(0)

	        # Labels = input_ids, with padding positions masked as -100
	        labels = input_ids.clone()
	        labels[attention_mask == 0] = -100

	        return {
	            "input_ids": input_ids,
	            "attention_mask": attention_mask,
	            "labels": labels,
	        }

	    def _line_iterator(self) -> Iterator[dict]:
	        """Yield this worker's share of the parsed, non-blank JSONL records."""
	        worker_id, num_workers = self._worker_shard()
	        record_index = 0
	        with open(self.file_path, "r", encoding="utf-8") as f:
	            for line in f:
	                line = line.strip()
	                if not line:
	                    continue
	                owned = record_index % num_workers == worker_id
	                record_index += 1
	                # Parse only the records this worker is responsible for.
	                if owned:
	                    yield json.loads(line)

	    def _shuffled_iterator(self) -> Iterator[dict]:
	        """
	        Chunked shuffle for streaming data.

	        Records are collected into blocks of ``shuffle_buffer`` items; each
	        block is shuffled and emitted before the next one is read, so
	        shuffling is local to a block.
	        """
	        import random

	        # Offset the seed per worker so shards are not permuted identically;
	        # worker 0 / single-process iteration keeps the plain seed.
	        worker_id, _ = self._worker_shard()
	        rng = random.Random(self.seed + worker_id)
	        buffer: list[dict] = []

	        for item in self._line_iterator():
	            buffer.append(item)
	            if len(buffer) >= self.shuffle_buffer:
	                rng.shuffle(buffer)
	                yield from buffer
	                buffer.clear()

	        # Flush remaining items
	        if buffer:
	            rng.shuffle(buffer)
	            yield from buffer

	    def _load_image(self, image_path: str) -> Optional[Image.Image]:
	        """
	        Load an image given a path relative to ``project_root``.

	        Returns:
	            An RGB image, or None when the file is missing or cannot be read.
	        """
	        try:
	            full_path = self.project_root / image_path
	            if not full_path.exists():
	                return None
	            # The context manager closes the file handle; convert() returns
	            # an independent, fully loaded copy.
	            with Image.open(str(full_path)) as img:
	                return img.convert("RGB")
	        except Exception as exc:
	            # Best effort: one bad image must not stop a long run, but say so.
	            print(f"[StreamingJSONLDataset] Could not load image {image_path!r}: {exc}")
	            return None

	    def __iter__(self) -> Iterator[dict[str, Any]]:
	        """Yield tokenized examples; each carries an ``image`` entry (image or None)."""
	        for example in self._shuffled_iterator():
	            messages = example.get("messages", [])
	            if not messages:
	                continue
	            tokenized = self._tokenize(self._format_messages(messages))
	            if tokenized is None:
	                continue
	            # Load image if path present
	            image_path = example.get("image_path")
	            tokenized["image"] = self._load_image(image_path) if image_path else None
	            yield tokenized

	    def count_lines(self) -> int:
	        """Count total lines, blank ones included (for progress estimation)."""
	        with open(self.file_path, "r", encoding="utf-8") as f:
	            return sum(1 for _ in f)


	# ── Trainer ───────────────────────────────────────────────────────────

	class MINDITrainer:
	    """
	    3-phase trainer for MINDI 1.5 Vision-Coder.

	    Optimized for AMD MI300X 192GB:
	    - bf16 mixed precision
	    - Gradient checkpointing
	    - Streaming data from disk
	    - Optional torch.compile()
	    - Phase-based component freezing/unfreezing

	    Resuming: after ``resume_from_checkpoint`` the next ``train()`` call skips
	    phases that are already finished and continues the interrupted phase from
	    the restored step. Optimizer state is not part of a checkpoint, so the
	    optimizer starts fresh; only the LR schedule is fast-forwarded.
	    """

	    def __init__(
	        self,
	        model: nn.Module,
	        config: TrainingConfig,
	    ) -> None:
	        """
	        Initialize the trainer.

	        Args:
	            model: MINDI15 model instance (already initialized).
	            config: Training configuration.
	        """
	        self.model = model
	        self.config = config
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.global_step = 0
	        self.best_val_loss = float("inf")
	        # Set by resume_from_checkpoint(); consumed by the next trained phase.
	        self._resume_pending = False

	        # Create output directories
	        self.config.output_dir.mkdir(parents=True, exist_ok=True)
	        self.config.log_dir.mkdir(parents=True, exist_ok=True)

	        # Gradient checkpointing
	        if config.gradient_checkpointing:
	            base_model = self.model.architecture.get_model()
	            if hasattr(base_model, "gradient_checkpointing_enable"):
	                base_model.gradient_checkpointing_enable()
	                print("[MINDITrainer] Gradient checkpointing enabled")
	            # Required for PEFT/LoRA + gradient checkpointing without torch.compile
	            if hasattr(self.model.llm, "enable_input_require_grads"):
	                self.model.llm.enable_input_require_grads()
	            else:
	                def _make_inputs_require_grad(module, input, output):
	                    output.requires_grad_(True)

	                self.model.llm.get_input_embeddings().register_forward_hook(
	                    _make_inputs_require_grad
	                )

	        # Optional torch.compile (works on ROCm)
	        if config.use_compile:
	            print("[MINDITrainer] Compiling model with torch.compile() ...")
	            self.model.llm = torch.compile(self.model.llm)
	            self.model.architecture.peft_model = self.model.llm
	            print("[MINDITrainer] Compilation complete")

	        # Autocast settings (no GradScaler is used)
	        self.use_amp = config.dtype in ("bf16", "fp16")
	        self.amp_dtype = config.torch_dtype

	        # Training log
	        self.log_file = config.log_dir / "training_log.jsonl"
	        self.metrics_history: list[dict] = []

	        print(f"[MINDITrainer] Device: {self.device}")
	        print(f"[MINDITrainer] Dtype: {config.dtype}")
	        print(f"[MINDITrainer] Total steps: {config.total_steps}")
	        print(f"[MINDITrainer] Phases: {len(config.phases)}")

	    # ── Small private helpers ─────────────────────────────────────────

	    @staticmethod
	    def _json_safe(value: Any) -> Any:
	        """Recursively replace non-finite floats with None so the output is strict JSON."""
	        if isinstance(value, float) and not math.isfinite(value):
	            return None
	        if isinstance(value, dict):
	            return {key: MINDITrainer._json_safe(item) for key, item in value.items()}
	        if isinstance(value, (list, tuple)):
	            return [MINDITrainer._json_safe(item) for item in value]
	        return value

	    @staticmethod
	    def _first_image(images: Optional[list]) -> Any:
	        """Return the first non-None entry of ``images`` (None if there is none)."""
	        if not images:
	            return None
	        return next((img for img in images if img is not None), None)

	    def _autocast(self) -> torch.autocast:
	        """Autocast context for the device the tensors actually live on."""
	        return torch.autocast(
	            device_type=self.device, dtype=self.amp_dtype, enabled=self.use_amp
	        )

	    def _take_resume_offset(self, phase: PhaseConfig, phase_steps: int) -> int:
	        """
	        Return how many steps of ``phase`` a restored checkpoint already covers.

	        Returns 0 unless a resume is pending; the pending flag is cleared so
	        only the first phase trained after a resume is offset.
	        """
	        if not getattr(self, "_resume_pending", False):
	            return 0
	        self._resume_pending = False
	        return min(max(self.global_step - phase.start_step, 0), phase_steps)

	    # ── Builders ──────────────────────────────────────────────────────

	    def _build_optimizer(self, phase: PhaseConfig) -> AdamW:
	        """
	        Build optimizer for the current phase (only trainable params).

	        Raises:
	            RuntimeError: If the model has no parameter with ``requires_grad``.
	        """
	        params = [p for p in self.model.parameters() if p.requires_grad]
	        if not params:
	            raise RuntimeError(f"No trainable parameters in phase '{phase.name}'")
	        return AdamW(
	            params,
	            lr=phase.learning_rate,
	            weight_decay=self.config.weight_decay,
	            betas=(0.9, 0.95),
	        )

	    def _build_scheduler(
	        self, optimizer: AdamW, phase: PhaseConfig
	    ) -> torch.optim.lr_scheduler.LRScheduler:
	        """Build LR scheduler: linear warmup + cosine decay to 10% of the phase LR."""
	        phase_steps = phase.end_step - phase.start_step
	        # Both segments get at least one step so neither scheduler is degenerate.
	        warmup_steps = max(1, int(phase_steps * self.config.warmup_ratio))
	        decay_steps = max(1, phase_steps - warmup_steps)

	        warmup = LinearLR(
	            optimizer,
	            start_factor=0.01,
	            end_factor=1.0,
	            total_iters=warmup_steps,
	        )
	        cosine = CosineAnnealingLR(
	            optimizer,
	            T_max=decay_steps,
	            eta_min=phase.learning_rate * 0.1,
	        )
	        return SequentialLR(
	            optimizer,
	            schedulers=[warmup, cosine],
	            milestones=[warmup_steps],
	        )

	    def _build_dataloader(
	        self, file_path: Path, batch_size: int, shuffle_buffer: int = 10000
	    ) -> DataLoader:
	        """Build a streaming DataLoader over ``file_path``."""
	        dataset = StreamingJSONLDataset(
	            file_path=file_path,
	            tokenizer=self.model.tokenizer,
	            max_length=self.config.max_seq_length,
	            shuffle_buffer=shuffle_buffer,
	            seed=self.config.seed,
	        )

	        def _collate_fn(batch):
	            """Custom collate: stack tensors, keep images as list."""
	            return {
	                "input_ids": torch.stack([b["input_ids"] for b in batch]),
	                "attention_mask": torch.stack([b["attention_mask"] for b in batch]),
	                "labels": torch.stack([b["labels"] for b in batch]),
	                "images": [b.get("image") for b in batch],
	            }

	        workers = self.config.num_workers
	        return DataLoader(
	            dataset,
	            batch_size=batch_size,
	            num_workers=workers,
	            # Pinned memory only helps host-to-GPU copies; skip it on CPU-only hosts.
	            pin_memory=self.config.pin_memory and self.device == "cuda",
	            # prefetch_factor is only meaningful with worker processes.
	            prefetch_factor=self.config.prefetch_factor if workers > 0 else None,
	            drop_last=True,
	            collate_fn=_collate_fn,
	        )

	    # ── Logging / evaluation / checkpoints ────────────────────────────

	    def _log_metrics(self, metrics: dict) -> None:
	        """
	        Append metrics to the in-memory history and to the JSONL log file.

	        Non-finite floats (e.g. ``best_val_loss`` while it is still ``inf``)
	        are written as ``null`` so every log line stays valid JSON.
	        """
	        self.metrics_history.append(metrics)
	        line = json.dumps(self._json_safe(metrics), allow_nan=False)
	        with open(self.log_file, "a", encoding="utf-8") as f:
	            f.write(line + "\n")

	    @torch.no_grad()
	    def evaluate(self, val_loader: DataLoader, max_batches: int = 50) -> float:
	        """
	        Run validation and return average loss.

	        Args:
	            val_loader: Validation DataLoader.
	            max_batches: Maximum batches to evaluate (for speed).

	        Returns:
	            Average validation loss over the batches that produced a loss
	            (0.0 when none did).
	        """
	        self.model.eval()
	        total_loss = 0.0
	        count = 0

	        try:
	            for batch_idx, batch in enumerate(val_loader):
	                if batch_idx >= max_batches:
	                    break

	                input_ids = batch["input_ids"].to(self.device)
	                attention_mask = batch["attention_mask"].to(self.device)
	                labels = batch["labels"].to(self.device)
	                # The model is given a single image per batch.
	                image = self._first_image(batch.get("images"))

	                with self._autocast():
	                    result = self.model(
	                        input_ids=input_ids,
	                        attention_mask=attention_mask,
	                        labels=labels,
	                        image=image,
	                    )

	                if result["loss"] is not None:
	                    total_loss += result["loss"].item()
	                    count += 1
	        finally:
	            # Always hand the model back in training mode, even on error.
	            self.model.train()

	        return total_loss / max(count, 1)

	    def _save_checkpoint(self, phase_name: str, step: int, val_loss: float) -> Path:
	        """Save model weights plus trainer state; return the checkpoint directory."""
	        ckpt_dir = self.config.output_dir / f"{phase_name}_step{step}"
	        ckpt_dir.mkdir(parents=True, exist_ok=True)

	        # Save model weights
	        self.model.save(ckpt_dir)

	        # Save trainer state
	        state = {
	            "global_step": self.global_step,
	            "phase": phase_name,
	            "step_in_phase": step,
	            "val_loss": val_loss,
	            "best_val_loss": self.best_val_loss,
	        }
	        torch.save(state, ckpt_dir / "trainer_state.pt")

	        print(f"[MINDITrainer] Checkpoint saved: {ckpt_dir}")
	        return ckpt_dir

	    # ── Training ──────────────────────────────────────────────────────

	    def train_phase(self, phase: PhaseConfig) -> dict:
	        """
	        Execute a single training phase.

	        Args:
	            phase: Phase configuration.

	        Returns:
	            Dict with phase training metrics.

	        Raises:
	            RuntimeError: If the phase has no trainable parameters or the
	                training DataLoader yields no batches.
	        """
	        print()
	        print("=" * 60)
	        print(f" Phase: {phase.name}")
	        print(f" Steps: {phase.start_step} → {phase.end_step}")
	        print(f" LR: {phase.learning_rate} | Batch: {phase.batch_size}")
	        print(f" Components: LoRA={phase.lora}, Vision={phase.vision_projection}, "
	              f"Fusion={phase.fusion}")
	        print(f" Data: {phase.data_type}")
	        print("=" * 60)

	        # Set trainable components
	        self.model.set_trainable_components(
	            lora=phase.lora,
	            vision_projection=phase.vision_projection,
	            fusion=phase.fusion,
	        )

	        # Build optimizer and scheduler for this phase
	        optimizer = self._build_optimizer(phase)
	        scheduler = self._build_scheduler(optimizer, phase)

	        # Select data files based on phase data_type
	        if phase.data_type == "vision":
	            train_file = self.config.vision_train_file
	            val_file = self.config.vision_val_file
	        else:
	            # "text" or "mixed" — use main data (mixed has images inline)
	            train_file = self.config.train_file
	            val_file = self.config.val_file

	        # Build data loaders
	        train_loader = self._build_dataloader(train_file, phase.batch_size)
	        val_loader = self._build_dataloader(
	            val_file,
	            batch_size=max(phase.batch_size // 2, 1),
	            shuffle_buffer=1000,
	        )

	        self.model.train()
	        phase_steps = phase.end_step - phase.start_step

	        # When resuming, continue from the restored position and put the LR
	        # schedule where it would have been at that step.
	        resumed_steps = self._take_resume_offset(phase, phase_steps)
	        if resumed_steps > 0:
	            import warnings

	            with warnings.catch_warnings():
	                # Stepping the scheduler before the optimizer is intentional here.
	                warnings.simplefilter("ignore", UserWarning)
	                for _ in range(resumed_steps):
	                    scheduler.step()
	            print(f" [{phase.name}] Resuming at step {resumed_steps}/{phase_steps}")

	        step_in_phase = resumed_steps
	        accum_loss = 0.0
	        accum_count = 0
	        phase_start_time = time.time()

	        train_iter = iter(train_loader)

	        while step_in_phase < phase_steps:
	            # Get next batch (restart iterator if exhausted = new epoch)
	            try:
	                batch = next(train_iter)
	            except StopIteration:
	                train_iter = iter(train_loader)
	                try:
	                    batch = next(train_iter)
	                except StopIteration:
	                    raise RuntimeError(
	                        f"Training data yields no batches for phase '{phase.name}': "
	                        f"{train_file}"
	                    ) from None

	            input_ids = batch["input_ids"].to(self.device)
	            attention_mask = batch["attention_mask"].to(self.device)
	            labels = batch["labels"].to(self.device)

	            # Pick first non-None image in batch (model processes one image at a time)
	            image = self._first_image(batch.get("images"))

	            # Forward pass with mixed precision
	            with self._autocast():
	                result = self.model(
	                    input_ids=input_ids,
	                    attention_mask=attention_mask,
	                    labels=labels,
	                    image=image,
	                )
	                loss = result["loss"]

	            if loss is None:
	                continue

	            # Scale loss for gradient accumulation
	            loss = loss / phase.gradient_accumulation_steps

	            # Backward pass
	            loss.backward()
	            # Track the unscaled loss for reporting.
	            accum_loss += loss.item() * phase.gradient_accumulation_steps
	            accum_count += 1

	            # Optimizer step (every gradient_accumulation_steps micro-batches)
	            if accum_count % phase.gradient_accumulation_steps != 0:
	                continue

	            # Gradient clipping
	            torch.nn.utils.clip_grad_norm_(
	                [p for p in self.model.parameters() if p.requires_grad],
	                self.config.max_grad_norm,
	            )
	            optimizer.step()
	            scheduler.step()
	            optimizer.zero_grad()

	            step_in_phase += 1
	            self.global_step += 1
	            avg_loss = accum_loss / phase.gradient_accumulation_steps
	            accum_loss = 0.0

	            # Logging
	            if step_in_phase % self.config.log_every_n_steps == 0:
	                elapsed = time.time() - phase_start_time
	                # Speed is measured over steps run in this session only.
	                session_steps = step_in_phase - resumed_steps
	                steps_per_sec = session_steps / elapsed if elapsed > 0 else 0.0
	                eta_sec = (
	                    (phase_steps - step_in_phase) / steps_per_sec
	                    if steps_per_sec > 0
	                    else 0.0
	                )
	                current_lr = optimizer.param_groups[0]["lr"]

	                self._log_metrics({
	                    "phase": phase.name,
	                    "global_step": self.global_step,
	                    "step_in_phase": step_in_phase,
	                    "loss": round(avg_loss, 4),
	                    "lr": current_lr,
	                    "steps_per_sec": round(steps_per_sec, 3),
	                    "eta_minutes": round(eta_sec / 60, 1),
	                    "elapsed_minutes": round(elapsed / 60, 1),
	                })
	                print(f" [{phase.name}] step {step_in_phase}/{phase_steps} | "
	                      f"loss={avg_loss:.4f} | "
	                      f"lr={current_lr:.2e} | "
	                      f"speed={steps_per_sec:.2f} steps/s | "
	                      f"ETA={eta_sec / 60:.1f}min")

	            # Evaluation
	            if step_in_phase % self.config.eval_every_n_steps == 0:
	                val_loss = self.evaluate(val_loader)
	                print(f" [{phase.name}] EVAL step {step_in_phase} | val_loss={val_loss:.4f}")
	                self._log_metrics({
	                    "phase": phase.name,
	                    "global_step": self.global_step,
	                    "val_loss": round(val_loss, 4),
	                    "type": "eval",
	                })

	                # Save best model
	                if val_loss < self.best_val_loss:
	                    self.best_val_loss = val_loss
	                    self._save_checkpoint(phase.name, step_in_phase, val_loss)
	                    print(f" [{phase.name}] New best val_loss: {val_loss:.4f}")

	            # Periodic save
	            if step_in_phase % self.config.save_every_n_steps == 0:
	                self._save_checkpoint(phase.name, step_in_phase, self.best_val_loss)

	        # End-of-phase save
	        phase_elapsed = time.time() - phase_start_time
	        self._save_checkpoint(phase.name, step_in_phase, self.best_val_loss)

	        phase_summary = {
	            "phase": phase.name,
	            "total_steps": step_in_phase,
	            "elapsed_minutes": round(phase_elapsed / 60, 1),
	            "best_val_loss": round(self.best_val_loss, 4),
	            "type": "phase_complete",
	        }
	        self._log_metrics(phase_summary)
	        print(f"\n [{phase.name}] Complete — {step_in_phase} steps in "
	              f"{phase_elapsed / 60:.1f} min")

	        return phase_summary

	    def train(self) -> dict:
	        """
	        Run all configured training phases sequentially.

	        After ``resume_from_checkpoint``, phases whose ``end_step`` has
	        already been reached are skipped.

	        Returns:
	            Dict with complete training summary.
	        """
	        print()
	        print("=" * 60)
	        print(" MINDI 1.5 — Training Start")
	        print(f" Total phases: {len(self.config.phases)}")
	        print(f" Total steps: {self.config.total_steps}")
	        print(f" Device: {self.device}")
	        print(f" Dtype: {self.config.dtype}")
	        print(f" Output: {self.config.output_dir}")
	        print("=" * 60)

	        torch.manual_seed(self.config.seed)
	        if torch.cuda.is_available():
	            torch.cuda.manual_seed_all(self.config.seed)

	        training_start = time.time()
	        phase_summaries = []

	        for phase in self.config.phases:
	            already_done = (
	                getattr(self, "_resume_pending", False)
	                and self.global_step >= phase.end_step
	            )
	            if already_done:
	                print(f"[MINDITrainer] Skipping completed phase: {phase.name}")
	                continue
	            phase_summaries.append(self.train_phase(phase))

	        total_elapsed = time.time() - training_start

	        # Final save
	        final_dir = self.config.output_dir / "final"
	        final_dir.mkdir(parents=True, exist_ok=True)
	        self.model.save(final_dir)

	        training_summary = {
	            "total_steps": self.global_step,
	            "total_minutes": round(total_elapsed / 60, 1),
	            "best_val_loss": round(self.best_val_loss, 4),
	            "phases": phase_summaries,
	            "type": "training_complete",
	        }
	        self._log_metrics(training_summary)

	        print()
	        print("=" * 60)
	        print(" MINDI 1.5 — Training Complete")
	        print(f" Total steps: {self.global_step}")
	        print(f" Total time: {total_elapsed / 60:.1f} minutes")
	        print(f" Best val loss: {self.best_val_loss:.4f}")
	        print(f" Final saved to: {final_dir}")
	        print("=" * 60)

	        return training_summary

	    def resume_from_checkpoint(self, checkpoint_dir: Path) -> None:
	        """
	        Resume training from a checkpoint.

	        Restores model weights, ``global_step`` and ``best_val_loss``, and
	        marks the trainer so the next ``train()`` call continues from the
	        restored step instead of starting over.

	        Args:
	            checkpoint_dir: Directory containing saved checkpoint.

	        Raises:
	            FileNotFoundError: If ``trainer_state.pt`` is missing.
	        """
	        checkpoint_dir = Path(checkpoint_dir)
	        state_file = checkpoint_dir / "trainer_state.pt"

	        if not state_file.exists():
	            raise FileNotFoundError(f"Trainer state not found: {state_file}")

	        # Load model weights
	        self.model.load(checkpoint_dir)

	        # Load trainer state (plain Python values, so weights_only is sufficient)
	        state = torch.load(state_file, map_location=self.device, weights_only=True)
	        self.global_step = state["global_step"]
	        self.best_val_loss = state["best_val_loss"]
	        self._resume_pending = True

	        print(f"[MINDITrainer] Resumed from step {self.global_step} "
	              f"(val_loss={self.best_val_loss:.4f})")


	# ── Test block ────────────────────────────────────────────────────────
	if __name__ == "__main__":
	    # Lightweight self-test: checks config defaults and, when the training
	    # file is present, streams a few examples. No model is built or trained.
	    print("=" * 60)
	    print(" MINDI 1.5 — Trainer Test")
	    print("=" * 60)
	    print()

	    # ── Test 1: Config defaults ──────────────────────────────────
	    print(" Test 1: TrainingConfig defaults")
	    config = TrainingConfig()
	    assert config.total_steps == 10000
	    assert config.dtype == "bf16"
	    assert config.torch_dtype == torch.bfloat16
	    assert len(config.phases) == 3
	    assert config.gradient_checkpointing is True
	    assert config.num_workers == 4
	    assert config.pin_memory is True
	    assert config.prefetch_factor == 2
	    print(f" Total steps: {config.total_steps}")
	    print(f" Dtype: {config.dtype}")
	    print(f" Phases: {[p.name for p in config.phases]}")
	    print(" ✓ Config defaults correct")

	    # ── Test 2: Phase configs ────────────────────────────────────
	    print("\n Test 2: Phase configurations")
	    p1, p2, p3 = config.phases
	    assert p1.name == "phase1_lora"
	    assert p1.batch_size == 16
	    assert p1.learning_rate == 2e-4
	    assert p1.lora is True and p1.vision_projection is False and p1.fusion is False

	    assert p2.name == "phase2_vision_bridge"
	    assert p2.batch_size == 8
	    assert p2.learning_rate == 1e-5
	    assert p2.lora is False and p2.vision_projection is True and p2.fusion is True

	    assert p3.name == "phase3_all"
	    assert p3.batch_size == 12
	    assert p3.learning_rate == 5e-5
	    assert p3.lora is True and p3.vision_projection is True and p3.fusion is True
	    print(" Phase 1: LoRA only, batch=16, lr=2e-4 ✓")
	    print(" Phase 2: Vision bridge, batch=8, lr=1e-5 ✓")
	    print(" Phase 3: All, batch=12, lr=5e-5 ✓")

	    # ── Test 3: Streaming dataset (if data exists) ───────────────
	    print("\n Test 3: StreamingJSONLDataset")
	    train_path = config.train_file
	    if train_path.exists():
	        from transformers import AutoTokenizer

	        tok = AutoTokenizer.from_pretrained(
	            str(PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"),
	            trust_remote_code=True,
	        )
	        dataset = StreamingJSONLDataset(
	            file_path=train_path,
	            tokenizer=tok,
	            max_length=512,  # small for test
	            shuffle_buffer=100,
	        )
	        count = 0
	        # Remember the shape explicitly: the loop variable is undefined
	        # after the loop when the dataset yields nothing.
	        last_shape = None
	        for item in dataset:
	            assert "input_ids" in item
	            assert "attention_mask" in item
	            assert "labels" in item
	            assert item["input_ids"].shape[0] == 512
	            last_shape = item["input_ids"].shape
	            count += 1
	            if count >= 5:
	                break
	        print(f" Streamed {count} examples, shape={last_shape} ✓")
	    else:
	        print(f" [SKIP] Train file not found: {train_path}")

	    # ── Test 4: PhaseConfig step ranges ──────────────────────────
	    print("\n Test 4: Phase step continuity")
	    for prev, curr in zip(config.phases, config.phases[1:]):
	        assert prev.end_step == curr.start_step, (
	            f"Gap between {prev.name} and {curr.name}"
	        )
	    print(" All phases are contiguous ✓")

	    # ── Test 5: Gradient accumulation ────────────────────────────
	    print("\n Test 5: Gradient accumulation steps")
	    for phase in config.phases:
	        assert phase.gradient_accumulation_steps == 4
	    print(" All phases: grad_accum=4 ✓")

	    print("\n ✓ All trainer tests passed!")
	    print("=" * 60)