"""
Model training script for financial LLM fine-tuning
"""

import json
import math
from datetime import datetime

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    default_data_collator,
    BitsAndBytesConfig,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training


def setup_model_and_tokenizer(config):
    """Set up the model and tokenizer, optionally with bitsandbytes quantization."""

    # Enable TF32 matmuls where supported (Ampere+ GPUs) for faster training
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("✅ TF32 enabled for faster matmul")
    except Exception:
        pass

    # Report GPU memory before loading the model
    torch.cuda.empty_cache()
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    allocated_memory = torch.cuda.memory_allocated() / 1e9
    free_memory = total_memory - allocated_memory

    print("📊 A100 Memory Status:")
    print(f"   Total: {total_memory:.1f} GB")
    print(f"   Free: {free_memory:.1f} GB")

    if free_memory < 15:
        print("⚠️ Warning: Low GPU memory, consider clearing cache")

    # Default to 8-bit quantization unless the config requests otherwise
    quantization = config.get("quantization")
    if quantization is None:
        quantization = "8bit"
    print(f"⚙️ Quantization mode: {quantization}")

    bnb_config = None
    if quantization == "4bit":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8bit":
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    print(f"Loading tokenizer: {config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print(f"Loading model: {config['model_name']}")
    model_kwargs = dict(
        device_map={"": 0},
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    # Choose an attention implementation: prefer flash-attn when requested or
    # available, otherwise fall back to PyTorch SDPA
    attn_pref = config.get("attn_impl")
    chosen_attn = None
    if attn_pref == "flash":
        try:
            import flash_attn  # noqa: F401
            chosen_attn = "flash_attention_2"
        except Exception:
            print("⚠️ flash-attn not available; falling back to SDPA")
            chosen_attn = "sdpa"
    elif attn_pref == "sdpa":
        chosen_attn = "sdpa"
    elif attn_pref == "eager":
        chosen_attn = "eager"
    else:
        # No explicit preference: auto-detect flash-attn
        try:
            import flash_attn  # noqa: F401
            chosen_attn = "flash_attention_2"
        except Exception:
            chosen_attn = "sdpa"

    model_kwargs["attn_implementation"] = chosen_attn
    print(f"✅ Attention implementation: {chosen_attn}")
    if bnb_config is not None:
        model_kwargs["quantization_config"] = bnb_config

    model = AutoModelForCausalLM.from_pretrained(
        config['model_name'],
        **model_kwargs,
    )

    # The KV cache is not needed during training and conflicts with gradient checkpointing
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    if getattr(model.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

    try:
        if config.get('gradient_checkpointing', True):
            model.gradient_checkpointing_enable()
            print("✅ Model gradient checkpointing enabled")
    except Exception:
        pass

    # Report GPU memory after the model is loaded
    allocated_after = torch.cuda.memory_allocated() / 1e9
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    usage_percent = (allocated_after / total_memory) * 100

    print("Model loaded successfully!")
    print(f"Model parameters: {model.num_parameters():,}")
    print(f"📊 GPU Memory after loading: {allocated_after:.1f}/{total_memory:.1f} GB ({usage_percent:.1f}%)")

    if usage_percent > 85:
        print("⚠️ Warning: High GPU memory usage! Consider reducing batch size.")
    else:
        print("✅ GPU memory usage looks good for training!")

    return model, tokenizer


def setup_lora(model, config):
    """Attach LoRA adapters for parameter-efficient fine-tuning."""

    # Pick LoRA target modules based on the base architecture
    if "DialoGPT" in config['model_name']:
        target_modules = ["c_attn", "c_proj"]
    elif "llama" in config['model_name'].lower():
        # Llama-style models: adapt all attention and MLP projections
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ]
    else:
        # Conservative default for other architectures
        target_modules = ["q_proj", "v_proj"]

    # If the base model was loaded in 8-bit/4-bit, prepare it for k-bit training
    # (casts layer norms, enables input gradients) before wrapping it with LoRA.
    if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
        model = prepare_model_for_kbit_training(model)

    lora_r = int(config.get('lora_r', 16))
    lora_alpha = int(config.get('lora_alpha', 32))
    lora_dropout = float(config.get('lora_dropout', 0.1))

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("LoRA configuration applied successfully!")
    print(f"Target modules: {target_modules}")
    print(f"LoRA params: r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
    return model


def tokenize_dataset(dataset, tokenizer, config):
    """Tokenize the dataset for causal language modeling."""

    def tokenize_function(examples):
        """Tokenize a batch of texts to fixed-length sequences."""
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=config['max_length'],
            return_tensors=None,
            add_special_tokens=True,
        )

        # Labels are a copy of input_ids; padding positions are set to -100 so
        # they are ignored by the cross-entropy loss.
        tokenized["labels"] = [
            [(token if mask == 1 else -100) for token, mask in zip(ids, attn)]
            for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]

        return tokenized

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Tokenizing",
    )

    print("Tokenization complete!")

    # Sanity-check one tokenized example
    sample = tokenized_dataset["train"][0]
    print(f"✅ Sample tokenized input_ids length: {len(sample['input_ids'])}")
    print(f"✅ Sample tokenized labels length: {len(sample['labels'])}")
    print(f"✅ Max length setting: {config['max_length']}")

    return tokenized_dataset


def setup_training(model, tokenizer, tokenized_dataset, config):
    """Set up training arguments and the Trainer."""

    # Examples are already padded to a fixed length, so the default collator suffices
    data_collator = default_data_collator

    import transformers
    transformers_version = transformers.__version__
    print(f"🔧 Transformers version: {transformers_version}")

    # Newer transformers versions renamed `evaluation_strategy` to `eval_strategy`
    use_eval_strategy = hasattr(TrainingArguments, '__dataclass_fields__') and \
        'eval_strategy' in TrainingArguments.__dataclass_fields__
    eval_param_name = "eval_strategy" if use_eval_strategy else "evaluation_strategy"

    training_args_dict = {
        "output_dir": config['output_dir'],
        "per_device_train_batch_size": config['train_batch_size'],
        "per_device_eval_batch_size": config['eval_batch_size'],
        "gradient_accumulation_steps": config['gradient_accumulation_steps'],
        "num_train_epochs": config['num_epochs'],
        "learning_rate": config['learning_rate'],
        "logging_steps": config.get('logging_steps', 25),
        eval_param_name: "steps",
        "eval_steps": config.get('eval_steps', 50),
        "save_steps": config.get('save_steps', config.get('eval_steps', 100)),
        "save_total_limit": 2,
        "remove_unused_columns": False,
        "push_to_hub": False,
        "report_to": "none",  # disable external loggers (None would enable all installed integrations)
        "load_best_model_at_end": True,
        "group_by_length": True,
        "warmup_ratio": config.get('warmup_ratio', 0.03),
        "weight_decay": config.get('weight_decay', 0.01),
        "max_grad_norm": config.get('max_grad_norm', 1.0),
        "lr_scheduler_type": "cosine",
        "dataloader_num_workers": config.get('dataloader_num_workers', 2),
        "dataloader_pin_memory": True,
        "skip_memory_metrics": True,
        "log_level": "warning",
        "include_inputs_for_metrics": False,
        "prediction_loss_only": True,
        "gradient_checkpointing": config.get('gradient_checkpointing', True),
    }

    # load_best_model_at_end requires checkpoints to land on evaluation steps,
    # so align save_steps with eval_steps unless explicitly disabled.
    if config.get('align_save_with_eval', True):
        training_args_dict["save_steps"] = training_args_dict.get("eval_steps", training_args_dict.get("save_steps", 100))

    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if use_bf16:
        training_args_dict["bf16"] = True
        training_args_dict["fp16"] = False
        print("✅ Using bf16 precision")
    else:
        training_args_dict["fp16"] = True
        print("✅ Using fp16 precision")

    print(f"✅ Using {eval_param_name} parameter for evaluation")
    training_args = TrainingArguments(**training_args_dict)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )

    print("Trainer initialized!")
    print(f"Training samples: {len(tokenized_dataset['train'])}")
    print(f"Validation samples: {len(tokenized_dataset['validation'])}")

    # Spot-check that tokenized examples have consistent lengths
    print("🔍 Validating data shapes...")
    train_sample = tokenized_dataset["train"][0]
    val_sample = tokenized_dataset["validation"][0]

    print(f"✅ Train sample - input_ids: {len(train_sample['input_ids'])}, labels: {len(train_sample['labels'])}")
    print(f"✅ Val sample - input_ids: {len(val_sample['input_ids'])}, labels: {len(val_sample['labels'])}")

    for i in range(min(3, len(tokenized_dataset['train']))):
        sample = tokenized_dataset['train'][i]
        if len(sample['input_ids']) != config['max_length']:
            print(f"⚠️ Warning: Sample {i} has inconsistent length: {len(sample['input_ids'])} != {config['max_length']}")
        if len(sample['input_ids']) != len(sample['labels']):
            print(f"⚠️ Warning: Sample {i} input_ids and labels length mismatch: {len(sample['input_ids'])} != {len(sample['labels'])}")

    print("✅ Data validation complete!")

    return trainer


def save_model_and_config(model, tokenizer, trainer, config):
    """Save the trained model, tokenizer, and run configuration, then evaluate."""

    print("Saving model...")

    trainer.save_model(config['save_dir'])
    tokenizer.save_pretrained(config['save_dir'])

    # Persist the full run configuration alongside the model
    config_data = {
        "base_model": config['model_name'],
        "dataset": config['dataset_name'],
        "dataset_config": config['dataset_config'],
        "training_config": config,
        "lora_config": {
            "r": config['lora_r'],
            "alpha": config['lora_alpha'],
            "dropout": config['lora_dropout']
        },
        "training_date": datetime.now().isoformat()
    }

    with open(f"{config['save_dir']}/training_config.json", "w") as f:
        json.dump(config_data, f, indent=2, default=str)

    print(f"Model saved to {config['save_dir']}")

    # Final evaluation on the validation split
    print("Evaluating model on validation set...")
    test_results = trainer.evaluate()

    with open(f"{config['save_dir']}/test_results.json", "w") as f:
        json.dump(test_results, f, indent=2)

    print(f"Evaluation complete! Results saved to {config['save_dir']}/test_results.json")

    return test_results


def run_training(config, processed_dataset):
    """Run the complete training pipeline."""

    print("🚀 Starting financial LLM fine-tuning...")
    print(f"Base model: {config['model_name']}")
    print(f"Dataset: {config['dataset_name']}")
    print(f"Training samples: {len(processed_dataset['train'])}")

    # 1. Load the base model and tokenizer
    model, tokenizer = setup_model_and_tokenizer(config)

    # 2. Attach LoRA adapters
    model = setup_lora(model, config)

    # 3. Tokenize the processed dataset
    tokenized_dataset = tokenize_dataset(processed_dataset, tokenizer, config)

    # 4. Build the Trainer
    trainer = setup_training(model, tokenizer, tokenized_dataset, config)

    # 5. Train
    print("Starting training...")
    print(f"Training will run for {config['num_epochs']} epochs")
    print(f"Effective batch size: {config['train_batch_size'] * config['gradient_accumulation_steps']}")

    trainer.train()

    print("Training completed!")

    # 6. Save artifacts and run the final evaluation
    test_results = save_model_and_config(model, tokenizer, trainer, config)

    print("🎉 Fine-tuning complete! 🎉")
    print(f"✅ Model saved to: {config['save_dir']}")
    eval_loss = test_results.get('eval_loss')
    if eval_loss is not None:
        print(f"✅ Validation loss: {eval_loss:.4f} (perplexity: {math.exp(eval_loss):.2f})")
    else:
        print("✅ Validation loss: N/A")

    return model, tokenizer, trainer


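# Illustrative helper, not part of the original pipeline: a minimal sketch of how the
# adapter saved by run_training() could be reloaded for inference. It assumes the LoRA
# weights and tokenizer were written to config['save_dir'] by trainer.save_model() and
# tokenizer.save_pretrained(), and uses peft's PeftModel.from_pretrained on top of the base model.
def load_finetuned_for_inference(config):
    """Reload the base model with the trained LoRA adapter (illustrative sketch)."""
    from peft import PeftModel

    tokenizer = AutoTokenizer.from_pretrained(config['save_dir'])
    base_model = AutoModelForCausalLM.from_pretrained(
        config['model_name'],
        torch_dtype=torch.bfloat16,
        device_map={"": 0},
    )
    # Attach the trained adapter weights on top of the base model
    model = PeftModel.from_pretrained(base_model, config['save_dir'])
    model.eval()
    return model, tokenizer
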
|
if __name__ == "__main__":
    # Minimal configuration for a smoke test of the pipeline
    test_config = {
        "model_name": "microsoft/DialoGPT-medium",
        "dataset_name": "Josephgflowers/Finance-Instruct-500k",
        "dataset_config": "default",
        "max_length": 512,
        "train_batch_size": 2,
        "eval_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "learning_rate": 2e-4,
        "num_epochs": 1,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "output_dir": "./test-financial-lora",
        "save_dir": "./test-financial-final",
        "quantization": "8bit",
        "save_steps": 100,
        "eval_steps": 50,
        "logging_steps": 25,
        "gradient_checkpointing": True,
        "dataloader_num_workers": 2,
    }

    print("Testing training pipeline...")

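    # Illustrative invocation, commented out so the smoke test stays lightweight.
    # This is a sketch that assumes a preprocessing step (defined elsewhere, not in
    # this file) turns the raw dataset into a DatasetDict with "train"/"validation"
    # splits and a single "text" column; `preprocess` below is a hypothetical name.
    #
    # from datasets import load_dataset
    # raw_dataset = load_dataset(test_config["dataset_name"])
    # processed_dataset = preprocess(raw_dataset)  # hypothetical helper
    # model, tokenizer, trainer = run_training(test_config, processed_dataset)
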
    print("Training pipeline setup complete!")