from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
import time
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

# Create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load the persisted Chroma collection and pull every stored document
db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_db")
print("There are", db._collection.count(), "docs in the collection")
docs = db._collection.peek(db._collection.count())
dataset = docs["documents"]  # list of raw text documents used as the training corpus

if torch.cuda.is_available():
    print("Cuda is available")

base_model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("pad_token was missing and has been set to eos_token")

# Configuration to load the model in 4-bit quantization (NF4)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype="float16",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
    torch_dtype="auto",
)
print(model)

# Gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Freeze base model layers and cast layernorms in fp32
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# LoRA configuration: adapters on the attention and MLP projections of phi-2
peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc2", "fc1"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir="./results",              # Output directory for checkpoints and predictions
    overwrite_output_dir=True,           # Overwrite the content of the output directory
    per_device_train_batch_size=2,       # Batch size for training
    per_device_eval_batch_size=2,        # Batch size for evaluation
    gradient_accumulation_steps=5,       # Number of steps before optimizing
    gradient_checkpointing=True,         # Enable gradient checkpointing
    gradient_checkpointing_kwargs={"use_reentrant": False},
    warmup_steps=10,                     # Number of warmup steps
    # max_steps=1000,                    # Total number of training steps
    num_train_epochs=20,                 # Number of training epochs
    learning_rate=5e-5,                  # Learning rate
    weight_decay=0.01,                   # Weight decay
    optim="paged_adamw_8bit",            # Keep the optimizer state and quantize it
    bf16=True,                           # Use mixed-precision training
    # For logging and saving
    logging_dir="./logs",
    logging_strategy="epoch",
    logging_steps=10,
    save_strategy="epoch",
    save_steps=10,
    save_total_limit=2,                  # Limit the total number of checkpoints
    evaluation_strategy="epoch",
    eval_steps=10,
    load_best_model_at_end=True,         # Load the best model at the end of training
    lr_scheduler_type="linear",
)

def formatting_func(doc):
    # Each Chroma document is already a plain text string, so it is used as-is
    return doc

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    peft_config=peft_config,
    args=training_args,
    tokenizer=tokenizer,      # pass the tokenizer with the pad_token set above
    max_seq_length=1024,
    packing=True,
    formatting_func=formatting_func,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
start_time = time.time()  # Record the start time
trainer.train()
end_time = time.time()  # Record the end time
training_time = end_time - start_time  # Calculate total training time

trainer.save_model("./results")
print(f"Training completed in {training_time} seconds.")
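
# --------------------------------------------------------------------------
# Hedged sketch (not part of the original script): one way the saved LoRA
# adapter could be reloaded for inference, as hinted by the use_cache comment
# above. It assumes the "./results" directory written by trainer.save_model()
# and the base_model_id / bnb_config / tokenizer defined earlier; the prompt
# is a hypothetical placeholder.
# --------------------------------------------------------------------------
from peft import PeftModel

# Reload the 4-bit base model and attach the trained LoRA adapter
inference_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype="auto",
)
inference_model = PeftModel.from_pretrained(inference_model, "./results")
inference_model.config.use_cache = True  # re-enable the KV cache for generation

prompt = "Explain what this fine-tuned model was trained on."  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = inference_model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))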