import os
import torch
#This is the script used to finetune the scikit-llm model.
#It also contains all the hyperparameters used for training and should be fully reproducible.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    LlamaTokenizerFast
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
# We use a previously fine-tuned version of Mistral, Mistral-Hermes.
#It already includes many instruction-tuning features (including the ChatML syntax) that make it easier to finetune.
model_name = "mistral-hermes-2.5"
torch.cuda.empty_cache()
# The name of the fine-tuned model.
new_model_name = "mistral-skikit-reference"
# The output directory where the model predictions and checkpoints will be written
output_dir = "./mistral-skikit-reference"
# Tensorboard logs
tb_log_dir = "./mistral-skikit-reference/logs"
# The number of training steps. Since we chose a lower learning rate, we opted for a long training run (8 epochs). Could be lower.
max_steps = 1200
# The important parameters!
per_device_train_batch_size = 4 #Number of samples per device per step. Optimal given our GPU VRAM.
learning_rate = 2e-5 #The most important hyperparameter. We take a lower value as mistral-hermes is already finetuned and we want to preserve its capabilities.
max_seq_length = 4096 #Context window length. Here we are constrained by Hermes, but Mistral goes up to 8192 (32k in the new version).
save_steps = 1000 # Save a checkpoint every 1000 steps.
lr_scheduler_type = "linear" #Learning rate scheduler. Better to decrease the learning rate for long training. I prefer linear over cosine as it is more predictable: easier to restart training if needed.
#Other parameters. I don't usually tweak those.
local_rank = -1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 4
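# For reference: the effective batch size is per_device_train_batch_size * gradient_accumulation_steps,
# i.e. 4 * 4 = 16 sequences per optimizer update (on a single GPU).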
max_grad_norm = 0.3
weight_decay = 0.001
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
# Group sequences into batches with the same length (saves memory and speeds up training considerably)
group_by_length = True
# Activate 4-bit precision base model loading
#We go back to 16-bit for inference.
#Currently this speeds up training significantly with nearly no quality impact.
use_4bit = True
# Activate nested quantization for 4-bit base models
use_nested_quant = False
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Number of training epochs
#(not used in practice, since max_steps takes precedence)
num_train_epochs = 1
# Enable fp16 training
fp16 = True
# Enable bf16 training
bf16 = False
# Use packing when creating the dataset
packing = False
# Enable gradient checkpointing
gradient_checkpointing = True
# Optimizer to use, original is paged_adamw_32bit
optim = "paged_adamw_32bit"
# Fraction of steps to do a warmup for
warmup_ratio = 0.03
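# For reference: with max_steps = 1200, warmup_ratio = 0.03 corresponds to roughly 1200 * 0.03 = 36 warmup steps.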
# Log every X updates steps
logging_steps = 1
# Load the entire model on GPU 0
device_map = {"": 0}
# Visualize training
report_to = "tensorboard"
#2. LoRA configuration and tokenizer
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    inference_mode=False,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
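# For reference: LoRA scales its update by lora_alpha / lora_r = 16 / 64 = 0.25,
# and only the attention projections q_proj and v_proj receive adapters here.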
tokenizer = AutoTokenizer.from_pretrained(model_name)
# This is the fix for fp16 training
tokenizer.pad_token = tokenizer.eos_token
#3. Preparing the dataset.
#This is the part most specific to the scikit model.
#We take an entire conversation, as both the input and the output are part of the same string of text.
def format_alpaca(sample):
    prompt = f"{sample['conversation']}"
    return prompt
# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_alpaca(sample)}{tokenizer.eos_token}"
    return sample
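# Illustration (assumed data layout): each JSON record is expected to carry a "conversation" field
# holding the full ChatML-formatted exchange; template_dataset turns it into a single "text" string
# terminated by the tokenizer's EOS token, e.g.
#   {"conversation": "<|im_start|>user\n...<|im_end|>"}  ->  {"text": "<|im_start|>user\n...<|im_end|></s>"}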
# Loading the dataset data files.
data_files = {"train": "skikit_administration.json"}
dataset = load_dataset("json", data_files=data_files, split="train")
# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)
#Dataset parsing.
dataset = dataset_shuffled.map(template_dataset, remove_columns=list(dataset_shuffled.features))
print(dataset[40])
#4. Model import
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
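# For reference: with this config the base weights are stored in 4-bit NF4 while matrix multiplications
# run in float16 (the compute dtype); nested/double quantization stays disabled as set above.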
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config
)
model.config.use_cache = False
model.config.pretraining_tp = 1
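# Optional sanity check (commented-out sketch, not part of the original run): SFTTrainer applies
# peft_config itself, but wrapping the model manually lets you inspect the number of trainable parameters.
# peft_model = get_peft_model(model, peft_config)
# peft_model.print_trainable_parameters()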
#5. Fine-tuning (the actual training)
#We pass all the hyperparameters, and we are ready to go.
torch.cuda.empty_cache()
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to=report_to
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing
)
#Training:
trainer.train()
#Optionally, if we want to continue training (for instance if there was an issue):
#trainer.train(resume_from_checkpoint=True)
#6. Export the weights
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model # Take care of distributed/parallel training
model_to_save.save_pretrained(new_model_name)
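# Note: save_pretrained on the PEFT-wrapped model stores only the LoRA adapter weights;
# the full merged model is produced below with merge_and_unload().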
torch.cuda.empty_cache()
from peft import AutoPeftModelForCausalLM
model = AutoPeftModelForCausalLM.from_pretrained(new_model_name, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()
output_merged_dir = os.path.join(new_model_name, new_model_name)
model.save_pretrained(output_merged_dir, safe_serialization=True)
#We also save the tokenizer
tokenizer.save_pretrained(output_merged_dir)
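#7. Quick inference check (optional, commented-out sketch; the ChatML prompt below is only a placeholder).
#The merged weights in output_merged_dir load like any regular transformers model.
# generator = pipeline("text-generation", model=output_merged_dir, tokenizer=output_merged_dir, device_map="auto")
# prompt = "<|im_start|>user\nWrite a short scikit-learn example.<|im_end|>\n<|im_start|>assistant\n"
# print(generator(prompt, max_new_tokens=200)[0]["generated_text"])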