| | """ |
| | Fallback simulation loop for auto-healing validation failures |
| | """ |
| |
|
| | import re |
| | import torch |
| | from dataclasses import dataclass |
| | from typing import Dict, List, Optional, Tuple, Any |
| | from rich.console import Console |
| | from rich.table import Table |
| |
|
| | from .dryrun import dry_run, DryRunResult |
| | from .matrix import get_gpu_info, precision_supported, has_bitsandbytes |
| | from training.autodetect import suggested_lora_targets |
| |
|
| | console = Console() |
| |
|
| |
|
@dataclass
class ConfigCandidate:
    """A single candidate training configuration to be validated by a dry run.

    Instances are treated as immutable snapshots by the fallback loop: each
    fallback step builds a fresh candidate rather than mutating the old one.
    """
    model: str                                  # HF model id or local path
    precision: str                              # e.g. "bf16" / "fp16" / "fp32" / "qlora4bit" (values used by the fallback logic)
    seq_len: int                                # maximum sequence length for the dry run
    batch_size: int                             # per-step batch size
    lora: bool                                  # whether LoRA adapters are enabled
    lora_targets: Optional[List[str]] = None    # explicit LoRA target modules; None = use defaults
    gradient_checkpointing: bool = False        # trade compute for memory when True
    dataset: str = "wikitext"                   # dataset identifier written into the YAML patch
    text_field: Optional[str] = None            # dataset text column; omitted from YAML when None
| |
|
| |
|
@dataclass
class FallbackAttempt:
    """Represents a single fallback attempt"""
    # One record per dry-run iteration of the auto-heal loop; collected in
    # FallbackSimulator.attempts for later inspection/reporting.
    attempt_num: int  # 1-based position in the loop
    config: ConfigCandidate  # configuration that was dry-run for this attempt
    result: DryRunResult  # outcome returned by dry_run() for this config
    strategy: str  # label shown in the table ("Initial attempt" / "Fallback #N")
    notes: str  # explanation of the fallback applied after a failure (empty until/unless one is chosen)
| |
|
| |
|
class FallbackSimulator:
    """Runs the auto-heal loop for training configurations.

    Each iteration dry-runs the current candidate config; on failure the error
    is classified and a fallback strategy produces the next candidate, until a
    config passes, strategies are exhausted, or ``max_attempts`` is reached.
    """

    def __init__(self):
        # Probe the GPU once; if probing itself raises, substitute a permissive
        # stub so the simulator can still run (the dry runs will surface any
        # real hardware problems).
        try:
            self.gpu = get_gpu_info()
        except Exception:
            self.gpu = type('GpuInfo', (), {
                'available': True,
                'name': 'Unknown GPU',
                'total_bytes': 0,
                'free_bytes': 0,
                'cc_major': 7,
                'cc_minor': 0,
                'bf16_supported': True
            })()
        # History of every attempt (pass or fail), in order.
        self.attempts: List["FallbackAttempt"] = []

    def reset_gpu_state(self):
        """Reset GPU state to clear any CUDA errors"""
        # Best effort only: a failed attempt may leave CUDA in a sticky error
        # state; never let cleanup itself abort the loop.
        try:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        except Exception:
            pass

    def classify_error(self, error: str) -> str:
        """Classify an error message into one of:
        "oom", "precision", "seq_len", "lora", or "unknown".

        Matching is substring-based on the lower-cased message; order matters
        (e.g. OOM is checked before precision).
        """
        error_lower = error.lower()

        if "out of memory" in error_lower or "oom" in error_lower:
            return "oom"
        elif "bf16" in error_lower and "not supported" in error_lower:
            return "precision"
        elif "fp16" in error_lower and "not supported" in error_lower:
            return "precision"
        elif "4-bit" in error_lower and "not supported" in error_lower:
            return "precision"
        elif "bitsandbytes" in error_lower:
            return "precision"
        elif "seq_len" in error_lower and "model limit" in error_lower:
            return "seq_len"
        elif "position" in error_lower and "embedding" in error_lower:
            return "seq_len"
        elif "lora" in error_lower and "target" in error_lower:
            return "lora"
        elif "cuda error" in error_lower and "assert" in error_lower:
            # Device-side asserts most commonly come from out-of-range position
            # ids, so treat them as a sequence-length problem.
            return "seq_len"
        else:
            return "unknown"

    def apply_fallback_strategy(self, config: "ConfigCandidate", error_type: str) -> Optional["ConfigCandidate"]:
        """Return the next candidate config for ``error_type``, or None when no
        further fallback applies.

        The input config is never mutated; each strategy builds a copy with a
        single knob changed (dataclasses.replace).
        """
        from dataclasses import replace

        if error_type == "precision":
            # Step down the precision ladder toward something the hardware
            # (or installed libraries) can actually run.
            if config.precision == "bf16" and not self.gpu.bf16_supported:
                return replace(config, precision="fp16")
            if config.precision == "qlora4bit" and not has_bitsandbytes():
                return replace(config, precision="fp16")
            # Without a GPU at all, fall back to CPU-friendly fp32.
            if config.precision in ("bf16", "fp16") and not self.gpu.available:
                return replace(config, precision="fp32")

        elif error_type == "oom":
            # Cheapest-first memory reductions: batch size, then gradient
            # checkpointing, then sequence length, then precision.
            if config.batch_size > 1:
                return replace(config, batch_size=max(1, config.batch_size // 2))
            if not config.gradient_checkpointing:
                return replace(config, gradient_checkpointing=True)
            if config.seq_len > 512:
                return replace(config, seq_len=max(512, config.seq_len // 2))
            if config.precision in ("bf16", "fp32"):
                return replace(config, precision="fp16")
            if config.precision == "fp16" and has_bitsandbytes() and self.gpu.available:
                return replace(config, precision="qlora4bit")

        elif error_type == "seq_len":
            # Step down to common context sizes rather than halving blindly.
            if config.seq_len > 1024:
                return replace(config, seq_len=1024)
            if config.seq_len > 512:
                return replace(config, seq_len=512)

        elif error_type == "lora":
            # Unknown target modules: retry with the near-universal attention
            # projections.
            if config.lora and config.lora_targets:
                return replace(config, lora_targets=["q_proj", "v_proj"])

        return None

    def _describe_change(self, error_type: str, old: "ConfigCandidate", new: "ConfigCandidate") -> str:
        """Build a note describing the knob the fallback actually turned.

        (The previous implementation assumed e.g. that every OOM fallback
        reduced the batch size, which mislabeled checkpointing/seq_len/precision
        fallbacks.)
        """
        if error_type == "oom":
            if new.batch_size != old.batch_size:
                return f"OOM detected, reducing batch size to {new.batch_size}"
            if new.gradient_checkpointing and not old.gradient_checkpointing:
                return "OOM detected, enabling gradient checkpointing"
            if new.seq_len != old.seq_len:
                return f"OOM detected, reducing sequence length to {new.seq_len}"
            if new.precision != old.precision:
                return f"OOM detected, switching precision to {new.precision}"
            return "OOM detected"
        if error_type == "precision":
            return f"Precision {old.precision} not supported, switching to {new.precision}"
        if error_type == "seq_len":
            return f"Sequence length {old.seq_len} too long, reducing to {new.seq_len}"
        if error_type == "lora":
            return "LoRA target modules not found, using defaults"
        return ""

    def simulate_fallbacks(self, initial_config: "ConfigCandidate", max_attempts: int = 10) -> Tuple[bool, Optional["ConfigCandidate"]]:
        """Run dry-run attempts with auto-healing fallbacks.

        Returns ``(True, working_config)`` on the first passing attempt, or
        ``(False, None)`` when strategies are exhausted or ``max_attempts``
        is reached. Every attempt is recorded in ``self.attempts``.
        """
        current_config = initial_config
        attempt_num = 0

        console.print(f"\n[bold blue]🔄 Starting Auto-Heal Simulation Loop[/bold blue]")
        console.print(f"[dim]Max attempts: {max_attempts}[/dim]\n")

        attempts_table = Table(title="Fallback Simulation Attempts")
        attempts_table.add_column("Attempt", style="cyan", width=8)
        attempts_table.add_column("Precision", style="white", width=10)
        attempts_table.add_column("Seq Len", style="white", width=8)
        attempts_table.add_column("Batch", style="white", width=6)
        attempts_table.add_column("LoRA", style="white", width=6)
        attempts_table.add_column("Grad Check", style="white", width=10)
        attempts_table.add_column("Result", style="white", width=8)
        attempts_table.add_column("Strategy", style="yellow", width=20)

        while attempt_num < max_attempts:
            attempt_num += 1

            # Clear any sticky CUDA error state left by the previous attempt.
            self.reset_gpu_state()

            # NOTE(review): gradient_checkpointing is not forwarded to
            # dry_run, so the OOM fallback that enables it cannot change the
            # dry-run outcome — confirm dry_run's signature and pass it
            # through if supported.
            result = dry_run(
                model_id_or_path=current_config.model,
                precision=current_config.precision,
                seq_len=current_config.seq_len,
                batch_size=current_config.batch_size,
                lora=current_config.lora,
                lora_targets=current_config.lora_targets,
            )

            strategy = "Initial attempt" if attempt_num == 1 else f"Fallback #{attempt_num - 1}"

            attempt = FallbackAttempt(
                attempt_num=attempt_num,
                config=current_config,
                result=result,
                strategy=strategy,
                notes=""
            )
            self.attempts.append(attempt)

            result_text = "✅ PASS" if result.ok else "❌ FAIL"
            attempts_table.add_row(
                str(attempt_num),
                current_config.precision,
                str(current_config.seq_len),
                str(current_config.batch_size),
                "Yes" if current_config.lora else "No",
                "Yes" if current_config.gradient_checkpointing else "No",
                result_text,
                strategy
            )

            if result.ok:
                console.print(attempts_table)
                console.print(f"\n[bold green]✅ SUCCESS![/bold green] Auto-healing found working configuration at attempt {attempt_num}")
                return True, current_config

            error_type = self.classify_error(result.error or "unknown")
            next_config = self.apply_fallback_strategy(current_config, error_type)

            if next_config is None:
                console.print(attempts_table)
                console.print(f"\n[bold red]❌ FAILED[/bold red] No more fallback strategies available")
                return False, None

            # Record what the fallback actually changed, based on the real
            # diff between the two configs.
            attempt.notes = self._describe_change(error_type, current_config, next_config)
            current_config = next_config

        console.print(attempts_table)
        console.print(f"\n[bold red]❌ FAILED[/bold red] Max attempts ({max_attempts}) reached")
        return False, None

    def generate_yaml_config(self, config: "ConfigCandidate") -> str:
        """Generate a YAML-style config block for the working configuration.

        Optional keys (lora_targets, text_field) are emitted only when set.
        """
        yaml_lines = [
            "# AUTO-HEALED CONFIG PATCH",
            f"model: {config.model}",
            f"precision: {config.precision}",
            f"seq_len: {config.seq_len}",
            f"batch_size: {config.batch_size}",
            f"lora: {str(config.lora).lower()}",
            f"gradient_checkpointing: {str(config.gradient_checkpointing).lower()}",
            f"dataset: {config.dataset}",
        ]

        if config.lora_targets:
            yaml_lines.append(f"lora_targets: {config.lora_targets}")

        if config.text_field:
            yaml_lines.append(f"text_field: {config.text_field}")

        return "\n".join(yaml_lines)
| |
|