import os
import gc
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "2nji/makebelieve-480"

# Fine-tuned model name
new_model = "makebelieve"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
# device_map = {"": 0}
device_map = "auto"

# Load dataset (you can process it here)
# SFTTrainer reads the "text" column, as set by dataset_text_field below
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False      # KV cache is not needed during training
model.config.pretraining_tp = 1     # Use the standard (non tensor-parallel) forward pass

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"    # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model (LoRA adapter weights only)
trainer.model.save_pretrained(new_model)

# %load_ext tensorboard
# %tensorboard --logdir results/runs

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model
prompt = "What did Taylor Swift do?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"[INST] {prompt} [/INST]")   # Llama-2 chat instruction format
print(result[0]["generated_text"])

# Empty VRAM
del model
del pipe
del trainer
gc.collect()
gc.collect()

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save model and tokenizer
# Do not forget to add your token here
model.push_to_hub(new_model, use_temp_dir=False, token="...")
tokenizer.push_to_hub(new_model, use_temp_dir=False, token="...")

print("end of makebelieve.py")