"""
BuildwellAI Model V2 - Fine-Tuning Script

Optimized for RunPod 2x RTX A5000 (48GB VRAM) with anti-overfitting measures.

Key Features:
- QLoRA 4-bit quantization for memory efficiency
- Validation loss monitoring with early stopping
- Learning rate warmup and cosine decay
- Weight decay regularization
- Gradient clipping
- Dropout in LoRA layers
- Proper train/val split

Usage:
    python3 finetune.py [--config config.json]
"""

import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import torch


DEFAULT_CONFIG = {
    # Model
    "base_model": "Qwen/Qwen3-14B",
    "max_seq_length": 2048,

    # LoRA
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "lora_target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],

    # Training
    "batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-5,
    "num_epochs": 2,
    "warmup_ratio": 0.1,
    "weight_decay": 0.05,
    "max_grad_norm": 0.5,

    # Early stopping
    "early_stopping_patience": 3,
    "early_stopping_threshold": 0.01,

    # Evaluation
    "eval_steps": 200,
    "eval_strategy": "steps",

    # Logging and checkpoints
    "logging_steps": 50,
    "save_steps": 200,
    "save_total_limit": 3,

    # Data and output paths (relative to this script)
    "train_data": "../datasets/train.jsonl",
    "val_data": "../datasets/validation.jsonl",
    "output_dir": "../output/buildwellai-qwen3-14b-v2",

    # Hugging Face Hub
    "push_to_hub": False,
    "hub_model_id": "buildwellai/qwen3-14b-v2",
}
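
# load_config() merges a user-supplied JSON file over these defaults, so a
# config.json only needs the keys being overridden. A hypothetical example:
#
#     {
#         "learning_rate": 2e-5,
#         "num_epochs": 3,
#         "output_dir": "../output/my-experiment"
#     }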


def setup_environment():
    """Set up environment variables for training."""
    # Let the CUDA allocator grow segments (reduces fragmentation) and
    # silence the tokenizers fork-parallelism warning.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"


def check_gpu():
    """Check GPU availability and memory."""
    print("=" * 60)
    print("GPU Configuration")
    print("=" * 60)

    if not torch.cuda.is_available():
        print("ERROR: CUDA not available!")
        sys.exit(1)

    num_gpus = torch.cuda.device_count()
    total_memory = 0

    for i in range(num_gpus):
        props = torch.cuda.get_device_properties(i)
        memory_gb = props.total_memory / (1024**3)
        total_memory += memory_gb
        print(f"GPU {i}: {props.name} ({memory_gb:.1f} GB)")

    print(f"Total GPUs: {num_gpus}")
    print(f"Total VRAM: {total_memory:.1f} GB")
    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA: {torch.version.cuda}")

    return num_gpus


def load_config(config_path: Optional[str] = None) -> dict:
    """Load configuration from file or use defaults."""
    config = DEFAULT_CONFIG.copy()

    if config_path and os.path.exists(config_path):
        with open(config_path) as f:
            user_config = json.load(f)
        config.update(user_config)
        print(f"Loaded config from: {config_path}")

    return config


def format_chat_example(example: dict, tokenizer) -> str:
    """Format a training example using the tokenizer's chat template."""
    messages = example.get("messages", [])

    # Normalize messages: some examples store null content, which chat
    # templates cannot handle, so replace it with an empty string.
    formatted_messages = []
    for msg in messages:
        content = msg.get("content", "")
        if content is None:
            content = ""
        formatted_messages.append({"role": msg["role"], "content": content})

    text = tokenizer.apply_chat_template(
        formatted_messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    # Make sure every example ends with an EOS token so the model learns
    # where a response stops.
    if not text.endswith(tokenizer.eos_token):
        text += tokenizer.eos_token

    return text
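
# Expected training-data schema, implied by the accessors above (one JSON
# object per line in the JSONL files):
#
#     {"messages": [{"role": "user", "content": "..."},
#                   {"role": "assistant", "content": "..."}]}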


def train_with_unsloth(config: dict):
    """Train using Unsloth for a 2-5x speedup."""
    print("\n" + "=" * 60)
    print("Training with Unsloth (Optimized)")
    print("=" * 60)

    from unsloth import FastLanguageModel, is_bfloat16_supported
    from unsloth import UnslothTrainer, UnslothTrainingArguments
    from datasets import load_dataset
    from transformers import EarlyStoppingCallback

    # Resolve data and output paths relative to this script.
    script_dir = Path(__file__).parent
    train_path = script_dir / config["train_data"]
    val_path = script_dir / config["val_data"]
    output_dir = script_dir / config["output_dir"]

    output_dir.mkdir(parents=True, exist_ok=True)

    # Load the base model in 4-bit (QLoRA).
    print(f"\nLoading model: {config['base_model']}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config["base_model"],
        max_seq_length=config["max_seq_length"],
        dtype=torch.bfloat16 if is_bfloat16_supported() else torch.float16,
        load_in_4bit=True,
    )

    # Attach LoRA adapters; dropout acts as regularization.
    print("Applying LoRA with dropout...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=config["lora_target_modules"],
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

    # Load datasets.
    print(f"\nLoading training data: {train_path}")
    train_dataset = load_dataset("json", data_files=str(train_path), split="train")
    print(f"Training examples: {len(train_dataset):,}")

    val_dataset = None
    if val_path.exists():
        print(f"Loading validation data: {val_path}")
        val_dataset = load_dataset("json", data_files=str(val_path), split="train")
        print(f"Validation examples: {len(val_dataset):,}")

    # Render each example's messages into a single chat-formatted string.
    print("\nFormatting datasets...")

    def format_fn(examples):
        texts = []
        for i in range(len(examples["messages"])):
            example = {"messages": examples["messages"][i]}
            texts.append(format_chat_example(example, tokenizer))
        return {"text": texts}

    train_dataset = train_dataset.map(
        format_fn,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Formatting train",
    )

    if val_dataset:
        val_dataset = val_dataset.map(
            format_fn,
            batched=True,
            remove_columns=val_dataset.column_names,
            desc="Formatting validation",
        )

    # Effective batch size = per-device batch * accumulation steps * GPU count.
    effective_batch = (
        config["batch_size"]
        * config["gradient_accumulation_steps"]
        * torch.cuda.device_count()
    )
    print(f"\nEffective batch size: {effective_batch}")

    training_args = UnslothTrainingArguments(
        output_dir=str(output_dir),

        # Epochs and batch size
        num_train_epochs=config["num_epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],

        # Optimizer schedule: linear warmup, then cosine decay
        learning_rate=config["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_ratio=config["warmup_ratio"],

        # Regularization
        weight_decay=config["weight_decay"],
        max_grad_norm=config["max_grad_norm"],

        # Evaluation and best-model selection
        eval_strategy=config["eval_strategy"] if val_dataset else "no",
        eval_steps=config["eval_steps"] if val_dataset else None,
        load_best_model_at_end=val_dataset is not None,
        metric_for_best_model="eval_loss" if val_dataset else None,
        greater_is_better=False if val_dataset else None,

        # Logging and checkpoints
        logging_steps=config["logging_steps"],
        save_steps=config["save_steps"],
        save_total_limit=config["save_total_limit"],

        # Precision and optimizer
        optim="adamw_8bit",
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        seed=42,
        report_to="tensorboard",
        logging_dir=str(output_dir / "logs"),
    )

    # Stop early once validation loss stops improving.
    callbacks = []
    if val_dataset:
        callbacks.append(EarlyStoppingCallback(
            early_stopping_patience=config["early_stopping_patience"],
            early_stopping_threshold=config["early_stopping_threshold"],
        ))

    trainer = UnslothTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        args=training_args,
        max_seq_length=config["max_seq_length"],
        dataset_text_field="text",
        callbacks=callbacks,
    )

    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)
    print(f"Model: {config['base_model']}")
    print(f"Training examples: {len(train_dataset):,}")
    print(f"Validation examples: {len(val_dataset) if val_dataset else 0:,}")
    print(f"Epochs: {config['num_epochs']}")
    print(f"Effective batch size: {effective_batch}")
    print(f"Learning rate: {config['learning_rate']}")
    print(f"Weight decay: {config['weight_decay']} (regularization)")
    print(f"LoRA dropout: {config['lora_dropout']} (regularization)")
    print(f"Early stopping patience: {config['early_stopping_patience']}")
    print("=" * 60 + "\n")

    train_result = trainer.train()

    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    # Save the LoRA adapter (small; the base model is reloaded separately).
    adapter_dir = output_dir / "adapter"
    model.save_pretrained(str(adapter_dir))
    tokenizer.save_pretrained(str(adapter_dir))
    print(f"Adapter saved: {adapter_dir}")

    # Also try to save a merged 16-bit model for standalone deployment.
    merged_dir = output_dir / "merged"
    try:
        model.save_pretrained_merged(
            str(merged_dir),
            tokenizer,
            save_method="merged_16bit",
        )
        print(f"Merged model saved: {merged_dir}")
    except Exception as e:
        print(f"Warning: Could not save merged model: {e}")
        merged_dir = None

    # Persist a summary of the run alongside the checkpoints.
    stats = {
        "train_loss": train_result.training_loss,
        "train_runtime": train_result.metrics.get("train_runtime"),
        "train_samples_per_second": train_result.metrics.get("train_samples_per_second"),
        "config": config,
        "completed_at": datetime.now().isoformat(),
    }

    with open(output_dir / "training_stats.json", "w") as f:
        json.dump(stats, f, indent=2)

    return str(adapter_dir), str(merged_dir) if merged_dir else None
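
# A quick way to sanity-check the saved adapter afterwards (a sketch, assuming
# peft/transformers are installed and the base model fits in memory):
#
#     from peft import PeftModel
#     from transformers import AutoModelForCausalLM
#
#     base = AutoModelForCausalLM.from_pretrained(DEFAULT_CONFIG["base_model"])
#     model = PeftModel.from_pretrained(base, "<adapter_dir>")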


def train_with_huggingface(config: dict):
    """Train using the standard HuggingFace stack (fallback when Unsloth is unavailable)."""
    print("\n" + "=" * 60)
    print("Training with HuggingFace (Standard)")
    print("=" * 60)

    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TrainingArguments,
        Trainer,
        DataCollatorForLanguageModeling,
        BitsAndBytesConfig,
        EarlyStoppingCallback,
    )
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from datasets import load_dataset

    # Resolve data and output paths relative to this script.
    script_dir = Path(__file__).parent
    train_path = script_dir / config["train_data"]
    val_path = script_dir / config["val_data"]
    output_dir = script_dir / config["output_dir"]

    output_dir.mkdir(parents=True, exist_ok=True)

    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        config["base_model"],
        trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # QLoRA: 4-bit NF4 quantization with double quantization.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    print(f"Loading model: {config['base_model']}")
    model = AutoModelForCausalLM.from_pretrained(
        config["base_model"],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    # Prepare the quantized model for training and enable gradient checkpointing.
    model = prepare_model_for_kbit_training(model)
    model.gradient_checkpointing_enable()

    print("Applying LoRA...")
    lora_config = LoraConfig(
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=config["lora_target_modules"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

    print("\nLoading data...")
    train_dataset = load_dataset("json", data_files=str(train_path), split="train")

    val_dataset = None
    if val_path.exists():
        val_dataset = load_dataset("json", data_files=str(val_path), split="train")

    # Render messages with the chat template, then tokenize with truncation.
    def tokenize_fn(examples):
        texts = []
        for i in range(len(examples["messages"])):
            example = {"messages": examples["messages"][i]}
            texts.append(format_chat_example(example, tokenizer))

        return tokenizer(
            texts,
            truncation=True,
            max_length=config["max_seq_length"],
            padding=False,
        )

    train_dataset = train_dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Tokenizing train",
    )

    if val_dataset:
        val_dataset = val_dataset.map(
            tokenize_fn,
            batched=True,
            remove_columns=val_dataset.column_names,
            desc="Tokenizing validation",
        )

    # Causal LM collator (mlm=False): labels are derived from the input ids.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    training_args = TrainingArguments(
        output_dir=str(output_dir),

        # Epochs and batch size
        num_train_epochs=config["num_epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],

        # Optimizer schedule: linear warmup, then cosine decay
        learning_rate=config["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_ratio=config["warmup_ratio"],

        # Regularization
        weight_decay=config["weight_decay"],
        max_grad_norm=config["max_grad_norm"],

        # Evaluation and best-model selection
        eval_strategy=config["eval_strategy"] if val_dataset else "no",
        eval_steps=config["eval_steps"] if val_dataset else None,
        load_best_model_at_end=val_dataset is not None,
        metric_for_best_model="eval_loss" if val_dataset else None,
        greater_is_better=False if val_dataset else None,

        # Logging and checkpoints
        logging_steps=config["logging_steps"],
        save_steps=config["save_steps"],
        save_total_limit=config["save_total_limit"],

        # Precision, optimizer, and throughput
        bf16=True,
        optim="adamw_8bit",
        gradient_checkpointing=True,
        group_by_length=True,
        report_to="tensorboard",
        logging_dir=str(output_dir / "logs"),
        dataloader_pin_memory=False,
    )

    # Stop early once validation loss stops improving.
    callbacks = []
    if val_dataset:
        callbacks.append(EarlyStoppingCallback(
            early_stopping_patience=config["early_stopping_patience"],
            early_stopping_threshold=config["early_stopping_threshold"],
        ))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        callbacks=callbacks,
    )

    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)

    train_result = trainer.train()

    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    # Save the LoRA adapter; this fallback path does not produce a merged model.
    adapter_dir = output_dir / "adapter"
    model.save_pretrained(str(adapter_dir))
    tokenizer.save_pretrained(str(adapter_dir))
    print(f"Adapter saved: {adapter_dir}")

    return str(adapter_dir), None


def main():
    parser = argparse.ArgumentParser(description="BuildwellAI Model V2 Fine-Tuning")
    parser.add_argument("--config", type=str, help="Path to config JSON file")
    args = parser.parse_args()

    print("=" * 60)
    print("BuildwellAI Model V2 - Fine-Tuning")
    print("=" * 60)
    print(f"Started: {datetime.now().isoformat()}")

    setup_environment()
    num_gpus = check_gpu()

    config = load_config(args.config)

    print("\n" + "=" * 60)
    print("Configuration")
    print("=" * 60)
    for key, value in config.items():
        if not key.startswith("lora_target"):
            print(f"  {key}: {value}")

    # Fail fast if the training data is missing.
    script_dir = Path(__file__).parent
    train_path = script_dir / config["train_data"]

    if not train_path.exists():
        print(f"\nERROR: Training data not found: {train_path}")
        print("Run prepare_dataset.py first!")
        sys.exit(1)

    # Prefer Unsloth when installed; fall back to plain HuggingFace otherwise.
    try:
        from unsloth import FastLanguageModel  # noqa: F401 (availability probe)
        print("\nUnsloth available - using optimized training")
        adapter_dir, merged_dir = train_with_unsloth(config)
    except ImportError:
        print("\nUnsloth not available - using HuggingFace")
        adapter_dir, merged_dir = train_with_huggingface(config)

    print("\n" + "=" * 60)
    print("TRAINING COMPLETE!")
    print("=" * 60)
    print("\nModel saved to:")
    print(f"  Adapter: {adapter_dir}")
    if merged_dir:
        print(f"  Merged: {merged_dir}")

    print("\nNext steps:")
    print(f"  1. Test: python3 streaming_api.py --model {merged_dir or adapter_dir}")
    print("  2. Deploy to production")

    print(f"\nCompleted: {datetime.now().isoformat()}")


if __name__ == "__main__":
    main()