"""
Model training script for financial LLM fine-tuning
"""

import json
import math
from datetime import datetime

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    default_data_collator,
    BitsAndBytesConfig,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training


def setup_model_and_tokenizer(config):
    """Set up the model and tokenizer, optionally with bitsandbytes quantization."""

    # Enable TF32 matmuls where supported (Ampere+ GPUs) for faster training
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("✅ TF32 enabled for faster matmul")
    except Exception:
        pass

    # Report GPU memory before loading the model
    torch.cuda.empty_cache()
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    allocated_memory = torch.cuda.memory_allocated() / 1e9
    free_memory = total_memory - allocated_memory

    print("📊 A100 Memory Status:")
    print(f"   Total: {total_memory:.1f} GB")
    print(f"   Free: {free_memory:.1f} GB")

    if free_memory < 15:
        print("⚠️ Warning: Low GPU memory, consider clearing cache")

    # Default to 8-bit quantization unless the config requests otherwise
    quantization = config.get("quantization")
    if quantization is None:
        quantization = "8bit"
    print(f"⚙️ Quantization mode: {quantization}")

    bnb_config = None
    if quantization == "4bit":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8bit":
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    print(f"Loading tokenizer: {config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print(f"Loading model: {config['model_name']}")
    model_kwargs = dict(
        device_map={"": 0},
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    # Choose an attention implementation: prefer flash-attn when requested or
    # available, otherwise fall back to PyTorch SDPA
    attn_pref = config.get("attn_impl")
    chosen_attn = None
    if attn_pref == "flash":
        try:
            import flash_attn  # noqa: F401
            chosen_attn = "flash_attention_2"
        except Exception:
            print("⚠️ flash-attn not available; falling back to SDPA")
            chosen_attn = "sdpa"
    elif attn_pref == "sdpa":
        chosen_attn = "sdpa"
    elif attn_pref == "eager":
        chosen_attn = "eager"
    else:
        # No explicit preference: auto-detect flash-attn
        try:
            import flash_attn  # noqa: F401
            chosen_attn = "flash_attention_2"
        except Exception:
            chosen_attn = "sdpa"

    model_kwargs["attn_implementation"] = chosen_attn
    print(f"✅ Attention implementation: {chosen_attn}")
    if bnb_config is not None:
        model_kwargs["quantization_config"] = bnb_config

    model = AutoModelForCausalLM.from_pretrained(
        config['model_name'],
        **model_kwargs,
    )

    # The KV cache is not needed during training and conflicts with gradient checkpointing
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    if getattr(model.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

    try:
        if config.get('gradient_checkpointing', True):
            model.gradient_checkpointing_enable()
            print("✅ Model gradient checkpointing enabled")
    except Exception:
        pass

    # Report GPU memory after the model is loaded
    allocated_after = torch.cuda.memory_allocated() / 1e9
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    usage_percent = (allocated_after / total_memory) * 100

    print("Model loaded successfully!")
    print(f"Model parameters: {model.num_parameters():,}")
    print(f"📊 GPU Memory after loading: {allocated_after:.1f}/{total_memory:.1f} GB ({usage_percent:.1f}%)")

    if usage_percent > 85:
        print("⚠️ Warning: High GPU memory usage! Consider reducing batch size.")
    else:
        print("✅ GPU memory usage looks good for training!")

    return model, tokenizer


def setup_lora(model, config):
    """Attach LoRA adapters for parameter-efficient fine-tuning."""

    # Pick LoRA target modules based on the base architecture
    if "DialoGPT" in config['model_name']:
        target_modules = ["c_attn", "c_proj"]
    elif "llama" in config['model_name'].lower():
        # Llama-style models: adapt all attention and MLP projections
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ]
    else:
        # Conservative default for other architectures
        target_modules = ["q_proj", "v_proj"]

    # If the base model was loaded in 8-bit/4-bit, prepare it for k-bit training
    # (casts layer norms, enables input gradients) before wrapping it with LoRA.
    if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
        model = prepare_model_for_kbit_training(model)

    lora_r = int(config.get('lora_r', 16))
    lora_alpha = int(config.get('lora_alpha', 32))
    lora_dropout = float(config.get('lora_dropout', 0.1))

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("LoRA configuration applied successfully!")
    print(f"Target modules: {target_modules}")
    print(f"LoRA params: r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
    return model


def tokenize_dataset(dataset, tokenizer, config):
    """Tokenize the dataset for causal language modeling."""

    def tokenize_function(examples):
        """Tokenize a batch of texts to fixed-length sequences."""
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=config['max_length'],
            return_tensors=None,
            add_special_tokens=True,
        )

        # Labels are a copy of input_ids; padding positions are set to -100 so
        # they are ignored by the cross-entropy loss.
        tokenized["labels"] = [
            [(token if mask == 1 else -100) for token, mask in zip(ids, attn)]
            for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]

        return tokenized

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Tokenizing",
    )

    print("Tokenization complete!")

    # Sanity-check one tokenized example
    sample = tokenized_dataset["train"][0]
    print(f"✅ Sample tokenized input_ids length: {len(sample['input_ids'])}")
    print(f"✅ Sample tokenized labels length: {len(sample['labels'])}")
    print(f"✅ Max length setting: {config['max_length']}")

    return tokenized_dataset


def setup_training(model, tokenizer, tokenized_dataset, config):
    """Set up training arguments and the Trainer."""

    # Examples are already padded to a fixed length, so the default collator suffices
    data_collator = default_data_collator

    import transformers
    transformers_version = transformers.__version__
    print(f"🔧 Transformers version: {transformers_version}")

    # Newer transformers versions renamed `evaluation_strategy` to `eval_strategy`
    use_eval_strategy = hasattr(TrainingArguments, '__dataclass_fields__') and \
        'eval_strategy' in TrainingArguments.__dataclass_fields__
    eval_param_name = "eval_strategy" if use_eval_strategy else "evaluation_strategy"

    training_args_dict = {
        "output_dir": config['output_dir'],
        "per_device_train_batch_size": config['train_batch_size'],
        "per_device_eval_batch_size": config['eval_batch_size'],
        "gradient_accumulation_steps": config['gradient_accumulation_steps'],
        "num_train_epochs": config['num_epochs'],
        "learning_rate": config['learning_rate'],
        "logging_steps": config.get('logging_steps', 25),
        eval_param_name: "steps",
        "eval_steps": config.get('eval_steps', 50),
        "save_steps": config.get('save_steps', config.get('eval_steps', 100)),
        "save_total_limit": 2,
        "remove_unused_columns": False,
        "push_to_hub": False,
        "report_to": "none",  # disable external loggers (None would enable all installed integrations)
        "load_best_model_at_end": True,
        "group_by_length": True,
        "warmup_ratio": config.get('warmup_ratio', 0.03),
        "weight_decay": config.get('weight_decay', 0.01),
        "max_grad_norm": config.get('max_grad_norm', 1.0),
        "lr_scheduler_type": "cosine",
        "dataloader_num_workers": config.get('dataloader_num_workers', 2),
        "dataloader_pin_memory": True,
        "skip_memory_metrics": True,
        "log_level": "warning",
        "include_inputs_for_metrics": False,
        "prediction_loss_only": True,
        "gradient_checkpointing": config.get('gradient_checkpointing', True),
    }

    # load_best_model_at_end requires checkpoints to land on evaluation steps,
    # so align save_steps with eval_steps unless explicitly disabled.
    if config.get('align_save_with_eval', True):
        training_args_dict["save_steps"] = training_args_dict.get("eval_steps", training_args_dict.get("save_steps", 100))

    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if use_bf16:
        training_args_dict["bf16"] = True
        training_args_dict["fp16"] = False
        print("✅ Using bf16 precision")
    else:
        training_args_dict["fp16"] = True
        print("✅ Using fp16 precision")

    print(f"✅ Using {eval_param_name} parameter for evaluation")
    training_args = TrainingArguments(**training_args_dict)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )

    print("Trainer initialized!")
    print(f"Training samples: {len(tokenized_dataset['train'])}")
    print(f"Validation samples: {len(tokenized_dataset['validation'])}")

    # Spot-check that tokenized examples have consistent lengths
    print("🔍 Validating data shapes...")
    train_sample = tokenized_dataset["train"][0]
    val_sample = tokenized_dataset["validation"][0]

    print(f"✅ Train sample - input_ids: {len(train_sample['input_ids'])}, labels: {len(train_sample['labels'])}")
    print(f"✅ Val sample - input_ids: {len(val_sample['input_ids'])}, labels: {len(val_sample['labels'])}")

    for i in range(min(3, len(tokenized_dataset['train']))):
        sample = tokenized_dataset['train'][i]
        if len(sample['input_ids']) != config['max_length']:
            print(f"⚠️ Warning: Sample {i} has inconsistent length: {len(sample['input_ids'])} != {config['max_length']}")
        if len(sample['input_ids']) != len(sample['labels']):
            print(f"⚠️ Warning: Sample {i} input_ids and labels length mismatch: {len(sample['input_ids'])} != {len(sample['labels'])}")

    print("✅ Data validation complete!")

    return trainer


def save_model_and_config(model, tokenizer, trainer, config):
    """Save the trained model, tokenizer, and run configuration, then evaluate."""

    print("Saving model...")

    trainer.save_model(config['save_dir'])
    tokenizer.save_pretrained(config['save_dir'])

    # Persist the full run configuration alongside the model
    config_data = {
        "base_model": config['model_name'],
        "dataset": config['dataset_name'],
        "dataset_config": config['dataset_config'],
        "training_config": config,
        "lora_config": {
            "r": config['lora_r'],
            "alpha": config['lora_alpha'],
            "dropout": config['lora_dropout']
        },
        "training_date": datetime.now().isoformat()
    }

    with open(f"{config['save_dir']}/training_config.json", "w") as f:
        json.dump(config_data, f, indent=2, default=str)

    print(f"Model saved to {config['save_dir']}")

    # Final evaluation on the validation split
    print("Evaluating model on validation set...")
    test_results = trainer.evaluate()

    with open(f"{config['save_dir']}/test_results.json", "w") as f:
        json.dump(test_results, f, indent=2)

    print(f"Evaluation complete! Results saved to {config['save_dir']}/test_results.json")

    return test_results


def run_training(config, processed_dataset):
    """Run the complete training pipeline."""

    print("🚀 Starting financial LLM fine-tuning...")
    print(f"Base model: {config['model_name']}")
    print(f"Dataset: {config['dataset_name']}")
    print(f"Training samples: {len(processed_dataset['train'])}")

    # 1. Load the base model and tokenizer
    model, tokenizer = setup_model_and_tokenizer(config)

    # 2. Attach LoRA adapters
    model = setup_lora(model, config)

    # 3. Tokenize the processed dataset
    tokenized_dataset = tokenize_dataset(processed_dataset, tokenizer, config)

    # 4. Build the Trainer
    trainer = setup_training(model, tokenizer, tokenized_dataset, config)

    # 5. Train
    print("Starting training...")
    print(f"Training will run for {config['num_epochs']} epochs")
    print(f"Effective batch size: {config['train_batch_size'] * config['gradient_accumulation_steps']}")

    trainer.train()

    print("Training completed!")

    # 6. Save artifacts and run the final evaluation
    test_results = save_model_and_config(model, tokenizer, trainer, config)

    print("🎉 Fine-tuning complete! 🎉")
    print(f"✅ Model saved to: {config['save_dir']}")
    eval_loss = test_results.get('eval_loss')
    if eval_loss is not None:
        print(f"✅ Validation loss: {eval_loss:.4f} (perplexity: {math.exp(eval_loss):.2f})")
    else:
        print("✅ Validation loss: N/A")

    return model, tokenizer, trainer


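# Illustrative helper, not part of the original pipeline: a minimal sketch of how the
# adapter saved by run_training() could be reloaded for inference. It assumes the LoRA
# weights and tokenizer were written to config['save_dir'] by trainer.save_model() and
# tokenizer.save_pretrained(), and uses peft's PeftModel.from_pretrained on top of the base model.
def load_finetuned_for_inference(config):
    """Reload the base model with the trained LoRA adapter (illustrative sketch)."""
    from peft import PeftModel

    tokenizer = AutoTokenizer.from_pretrained(config['save_dir'])
    base_model = AutoModelForCausalLM.from_pretrained(
        config['model_name'],
        torch_dtype=torch.bfloat16,
        device_map={"": 0},
    )
    # Attach the trained adapter weights on top of the base model
    model = PeftModel.from_pretrained(base_model, config['save_dir'])
    model.eval()
    return model, tokenizer
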
|
if __name__ == "__main__":
    # Minimal configuration for a smoke test of the pipeline
    test_config = {
        "model_name": "microsoft/DialoGPT-medium",
        "dataset_name": "Josephgflowers/Finance-Instruct-500k",
        "dataset_config": "default",
        "max_length": 512,
        "train_batch_size": 2,
        "eval_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "learning_rate": 2e-4,
        "num_epochs": 1,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "output_dir": "./test-financial-lora",
        "save_dir": "./test-financial-final",
        "quantization": "8bit",
        "save_steps": 100,
        "eval_steps": 50,
        "logging_steps": 25,
        "gradient_checkpointing": True,
        "dataloader_num_workers": 2,
    }

    print("Testing training pipeline...")

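    # Illustrative invocation, commented out so the smoke test stays lightweight.
    # This is a sketch that assumes a preprocessing step (defined elsewhere, not in
    # this file) turns the raw dataset into a DatasetDict with "train"/"validation"
    # splits and a single "text" column; `preprocess` below is a hypothetical name.
    #
    # from datasets import load_dataset
    # raw_dataset = load_dataset(test_config["dataset_name"])
    # processed_dataset = preprocess(raw_dataset)  # hypothetical helper
    # model, tokenizer, trainer = run_training(test_config, processed_dataset)
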
    print("Training pipeline setup complete!")