import torch
import os
import locale
import math
import mlflow
import pandas as pd
from trl import SFTTrainer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
from src.logging import logger
from src.models.entity import (ModelTrainerConfig,
                               LoraParameters,
                               BitsAndBytesParameters,
                               TrainingArgumentsParameters)


class ModelTrainer:
    """Fine-tunes a 4-bit quantized Llama model with LoRA adapters using TRL's SFTTrainer."""

    def __init__(self,
                 model_trainer_config: ModelTrainerConfig,
                 lora_parameters: LoraParameters,
                 bits_and_bytes_parameters: BitsAndBytesParameters,
                 training_arguments: TrainingArgumentsParameters):
        self.model_trainer_config = model_trainer_config
        self.lora_parameters = lora_parameters
        self.bits_and_bytes_parameters = bits_and_bytes_parameters
        self.training_arguments = training_arguments

    def __load_data(self):
        # The CSVs are expected to provide a "text" column, which SFTTrainer
        # consumes via dataset_text_field="text" in train() below.
        train_dataset = pd.read_csv(os.path.join(self.model_trainer_config.data_path, "train_dataset.csv"))
        eval_dataset = pd.read_csv(os.path.join(self.model_trainer_config.data_path, "eval_dataset.csv"))
        self.train_dataset = Dataset.from_pandas(train_dataset)
        self.eval_dataset = Dataset.from_pandas(eval_dataset)
        logger.info("Data loaded")

    def __initialize_tokenizer(self, model_name: str):
        # Load the tokenizer for whichever checkpoint the caller requests
        # (the base model or a model pulled from the Hugging Face Hub).
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        logger.info("Tokenizer initialized")

    def __initialize_lora(self):
        self.lora_config = LoraConfig(
            r=self.lora_parameters.r,
            target_modules=self.lora_parameters.target_modules,
            lora_alpha=self.lora_parameters.lora_alpha,
            lora_dropout=self.lora_parameters.lora_dropout,
            bias=self.lora_parameters.bias,
            task_type=self.lora_parameters.task_type
        )
        logger.info("LoRA initialized")

    def __initialize_bits_and_bytes(self):
        self.nf4_config = BitsAndBytesConfig(
            load_in_4bit=self.bits_and_bytes_parameters.load_in_4bit,
            bnb_4bit_quant_type=self.bits_and_bytes_parameters.bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=self.bits_and_bytes_parameters.bnb_4bit_use_double_quant,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        logger.info("Bits and bytes initialized")

    def __initialize_training_arguments(self):
        self.training_args = TrainingArguments(
            output_dir=self.training_arguments.output_dir,
            evaluation_strategy=self.training_arguments.evaluation_strategy,
            save_strategy=self.training_arguments.save_strategy,
            num_train_epochs=self.training_arguments.num_train_epochs,
            per_device_train_batch_size=self.training_arguments.per_device_train_batch_size,
            gradient_accumulation_steps=self.training_arguments.gradient_accumulation_steps,
            optim=self.training_arguments.optim,
            learning_rate=self.training_arguments.learning_rate,
            fp16=self.training_arguments.fp16,
            max_grad_norm=self.training_arguments.max_grad_norm,
            warmup_ratio=self.training_arguments.warmup_ratio,
            group_by_length=self.training_arguments.group_by_length,
            lr_scheduler_type=self.training_arguments.lr_scheduler_type
        )
        logger.info("Training arguments initialized")

    def __create_model(self):
        self.model = LlamaForCausalLM.from_pretrained(
            self.model_trainer_config.base_model,
            device_map='auto',
            quantization_config=self.nf4_config,
        )
        # Note: a [PAD] token was added to the tokenizer above; if it is not part
        # of the base vocabulary, the embeddings may need resizing via
        # self.model.resize_token_embeddings(len(self.tokenizer)).
        self.model = get_peft_model(self.model, self.lora_config)
        # self.model.print_trainable_parameters()
        logger.info("Model created")

    def __evaluate(self, trainer):
        evaluation_results = trainer.evaluate()
        # Perplexity is exp(cross-entropy loss) on the evaluation split.
        logger.info(f"Perplexity: {math.exp(evaluation_results['eval_loss']):.2f}")

    def __save_model(self):
        save_path = os.path.join(self.model_trainer_config.root_dir,
                                 f"{self.model_trainer_config.base_model}-math-model")
        self.model.save_pretrained(save_path)
        logger.info("Model saved")

    def __save_tokenizer(self):
        self.tokenizer.save_pretrained(os.path.join(self.model_trainer_config.root_dir, "tokenizer"))
        logger.info("Tokenizer saved")

    def train(self):
        if self.model_trainer_config.upload_from_hf:
            # Pull a model straight from the Hugging Face Hub and persist it locally,
            # skipping local fine-tuning.
            logger.info("Downloading model from the Hugging Face Hub")
            self.__initialize_tokenizer(self.model_trainer_config.hf_model_name)
            self.__initialize_bits_and_bytes()
            self.model = AutoModel.from_pretrained(self.model_trainer_config.hf_model_name,
                                                   device_map='auto',
                                                   quantization_config=self.nf4_config)
            self.__save_model()
            self.__save_tokenizer()
            return None

        if torch.cuda.is_available():
            try:
                locale.getpreferredencoding = lambda: "UTF-8"
                self.__load_data()
                self.__initialize_tokenizer(self.model_trainer_config.base_model)
                self.__initialize_lora()
                self.__initialize_bits_and_bytes()
                self.__initialize_training_arguments()
                self.__create_model()
                trainer = SFTTrainer(self.model,
                                     train_dataset=self.train_dataset,
                                     eval_dataset=self.eval_dataset,
                                     dataset_text_field="text",
                                     max_seq_length=256,
                                     args=self.training_args,
                                     )
                logger.info("Trainer created")
                # Upcast layer norms to float32 for numerical stability
                for name, module in trainer.model.named_modules():
                    if "norm" in name:
                        module = module.to(torch.float32)
                logger.info("Layer norms upcasted to float32")
                logger.info(">>>>>>> Training started <<<<<<<<")
                with mlflow.start_run(run_name=self.model_trainer_config.training_name):
                    trainer.train()
                logger.info(">>>>>>> Training completed <<<<<<<<")
                self.__evaluate(trainer)
                self.__save_model()
                self.__save_tokenizer()
            except Exception as e:
                logger.exception(e)
                raise e
        else:
            raise Exception("No GPU found")