import os
import torch

#This is the script used to finetune the scikit-llm model.
#It also contains all the hyperparameters used for training and should be fully reproducible.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    LlamaTokenizerFast
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer

# We use a previously finetuned version of Mistral, Mistral-Hermes.
# It already includes many instruction-based features (including the ChatML syntax) that make it easier to finetune.
model_name = "mistral-hermes-2.5"

torch.cuda.empty_cache()

# The name of the new model.
new_model_name = "mistral-skikit-reference"

# The output directory where the model predictions and checkpoints will be written
output_dir = "./mistral-skikit-reference"

# Tensorboard logs
tb_log_dir = "./mistral-skikit-reference/logs"

# The number of steps. Since we chose a lower learning rate, we went for a long training (8 epochs). Could be lower.
max_steps = 1200

# The important hyperparameters!
per_device_train_batch_size = 4  # Number of samples per device per step. Optimal given our GPU VRAM.
learning_rate = 2e-5  # The most important hyperparameter. We take a lower value as mistral-hermes is already finetuned and we want to keep its capacities.
max_seq_length = 4096  # Context window length. Here we are constrained by Hermes, but Mistral goes up to 8192 (32k in the new version).
save_steps = 1000  # Save a checkpoint every N steps.
lr_scheduler_type = "linear"  # Learning rate scheduler. Better to decrease the learning rate for a long training. I prefer linear over cosine as it is more predictable: easier to restart training if needed.

# Other parameters. I don't usually tweak those.
local_rank = -1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 4
max_grad_norm = 0.3
weight_decay = 0.001
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# Group sequences into batches of the same length (saves memory and speeds up training considerably)
group_by_length = True

# Activate 4-bit precision base model loading.
# We go back to 16-bit for inference.
# Currently this speeds up training significantly with nearly no quality impact.
use_4bit = True

# Activate nested quantization for 4-bit base models
use_nested_quant = False

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Number of training epochs (not used in practice; we rely on max_steps)
num_train_epochs = 1

# Enable fp16 training
fp16 = True

# Enable bf16 training
bf16 = False

# Use packing when creating the dataset
packing = False

# Enable gradient checkpointing
gradient_checkpointing = True

# Optimizer to use, original is paged_adamw_32bit
optim = "paged_adamw_32bit"

# Fraction of steps to do a warmup for
warmup_ratio = 0.03

# Log every X update steps
logging_steps = 1

# Load the entire model on GPU 0
device_map = {"": 0}

# Visualize training
report_to = "tensorboard"

#2. LoRA configuration and tokenizer loading
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    inference_mode=False,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# This is the fix for fp16 training
tokenizer.pad_token = tokenizer.eos_token
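# Optional sanity check (an illustrative addition, not part of the original run):
# Hermes-style models rely on the ChatML markers <|im_start|> / <|im_end|>.
# If either of them maps to the unknown-token id, the conversations formatted below
# would not be segmented the way the base model expects.
#print(tokenizer.convert_tokens_to_ids(["<|im_start|>", "<|im_end|>"]))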
#3. Preparing the dataset.
#This is the part most specific to the scikit model.
#We take an entire conversation, as both the input and the output are part of the same string of text.
from datasets import load_dataset

def format_alpaca(sample):
    prompt = f"{sample['conversation']}"
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_alpaca(sample)}{tokenizer.eos_token}"
    return sample

# Loading the dataset.
data_files = {"train": "skikit_administration.json"}
dataset = load_dataset("json", data_files=data_files, split="train")

# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)

# Dataset parsing.
dataset = dataset_shuffled.map(template_dataset, remove_columns=list(dataset.features))

print(dataset[40])

#4. Model import
# Load the model with the QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config
)
model.config.use_cache = False
model.config.pretraining_tp = 1

#5. The actual fine-tuning
#We pass all the hyperparameters, and we are ready to go.
torch.cuda.empty_cache()

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing
)

# Training:
trainer.train()

# Optionally, if we want to continue training (for instance if there was an issue):
#trainer.train(resume_from_checkpoint=True)

#6. Export the weights
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained(new_model_name)

torch.cuda.empty_cache()

from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(new_model_name, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

output_merged_dir = os.path.join(new_model_name, new_model_name)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# We also save the tokenizer
tokenizer.save_pretrained(output_merged_dir)
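# Optional: quick inference check on the merged weights.
# (Not part of the original training run; a minimal sketch using the standard
# transformers text-generation pipeline, already imported above. The prompt is a
# hypothetical ChatML example, to be replaced with a real query.)
#generator = pipeline("text-generation", model=output_merged_dir, tokenizer=output_merged_dir, device_map="auto")
#prompt = "<|im_start|>user\nHello, what were you finetuned on?<|im_end|>\n<|im_start|>assistant\n"
#print(generator(prompt, max_new_tokens=200)[0]["generated_text"])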