from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma

import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training


# Embedding function used to open the persisted Chroma collection.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_db")

print("There are", db._collection.count(), "docs in the collection")

# Pull every stored chunk out of the collection and keep only the raw text.
docs = db._collection.peek(db._collection.count())
dataset = docs['documents']
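
# Quick sanity check (an optional addition): assumes the collection stores
# plain-text chunks under the "documents" key, as used above.
if dataset:
    print("Sample chunk:", dataset[0][:200])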

if torch.cuda.is_available():
    print("CUDA is available")

base_model_id = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    # phi-2's tokenizer ships without a padding token; reuse EOS so padded
    # batches can be built.
    tokenizer.pad_token = tokenizer.eos_token
    print("pad_token was missing and has been set to eos_token")

# 4-bit NF4 quantization so the 2.7B-parameter phi-2 fits comfortably in GPU memory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# attn_implementation="flash_attention_2" requires the flash-attn package and a
# recent GPU; drop the argument to fall back to the default attention.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
    torch_dtype="auto",
)
print(model)
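
# Optional check (an addition): get_memory_footprint() reports the bytes used by
# parameters and buffers, a rough view of the 4-bit model's size.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")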

# Enable gradient checkpointing and prepare the quantized model for k-bit (QLoRA) training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# LoRA adapters on phi-2's attention projections and MLP layers.
peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc2", "fc1"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
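
# Note (added): with r == lora_alpha == 64 the LoRA scaling factor alpha/r is 1.0,
# and the target_modules above correspond to the attention and MLP projection
# names in the Hugging Face phi-2 implementation.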

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=5,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    warmup_steps=10,
    num_train_epochs=20,
    learning_rate=5e-5,
    weight_decay=0.01,
    optim="paged_adamw_8bit",
    bf16=True,
    logging_dir='./logs',
    logging_strategy="epoch",
    logging_steps=10,
    save_strategy="epoch",
    save_steps=10,
    save_total_limit=2,
    evaluation_strategy="epoch",
    eval_steps=10,
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
)
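
# Note (added): per_device_train_batch_size=2 with gradient_accumulation_steps=5
# gives an effective batch of 10 sequences per device per optimizer step. Since
# logging, saving and evaluation all run per "epoch", the *_steps values above
# are not used.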

def formatting_func(doc):
    return doc
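
# Note (added): each record pulled from Chroma is already a plain string, so it
# is returned unchanged. With packing=True, trl concatenates the strings returned
# by formatting_func and slices them into fixed max_seq_length blocks, so a chunk
# does not need to fill the 1024-token context on its own.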

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    peft_config=peft_config,
    args=training_args,
    max_seq_length=1024,
    packing=True,
    formatting_func=formatting_func,
)
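
# Optional check (an addition): when peft_config is passed, SFTTrainer wraps the
# model in a PeftModel, so the adapter's trainable-parameter count can be printed.
# Also note that eval_dataset reuses the training chunks, so the eval loss is not
# measured on held-out data.
if hasattr(trainer.model, "print_trainable_parameters"):
    trainer.model.print_trainable_parameters()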

# Disable the KV cache during training; it is incompatible with gradient checkpointing.
model.config.use_cache = False

start_time = time.time()
trainer.train()
end_time = time.time()

training_time = end_time - start_time

trainer.save_model("./results")
print(f"Training completed in {training_time:.2f} seconds.")