from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import os
from uuid import uuid4
import pandas as pd
import subprocess
import numpy as np
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

### Define functions
def max_token_len(dataset):
    """Return the longest tokenized length of the 'text' column (uses the global tokenizer)."""
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length

### Set up models and datasets, training parameters
# model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Mistral's tokenizer ships without a pad token; reuse EOS so padding/collation works
    tokenizer.pad_token = tokenizer.eos_token
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)

# dataset = load_dataset("imdb", split="train")
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Write dataset files into data directory
data_directory = './fine_tune_data/'

# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))

# Write the validation data to a CSV file
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))

max_token_length = max(max_token_length_train, max_token_length_validation)
# max_token_length = max_token_length_train
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")
block_size = 2 * max_token_length
print('Block size: ' + str(block_size))

# Define project parameters
username = 'ai-aerospace'
project_name = './llms/' + 'ams_data_train-100_' + str(uuid4())
repo_name = 'ams-data-train-100-' + str(uuid4())

model_params = {
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,
    "logging_steps": -1,  # -1 may need to be a positive step count on some transformers versions
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}

# Export the parameters as environment variables (all values as strings)
for key, value in model_params.items():
    os.environ[key] = str(value)
print(model_params)

args_custom = transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    fp16=True,
    logging_steps=model_params['logging_steps'],
    save_total_limit=model_params['save_total_limit'],
    evaluation_strategy=model_params['evaluation_strategy'],
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler']
)

### Args from medium article
args_medium = transformers.TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=12276,
    learning_rate=2e-4,
    fp16=True,
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    remove_unused_columns=False,
    report_to='wandb'  # enable logging to W&B
)

### Load model and peft config, calculate trainable parameters
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True  # requires bitsandbytes; matches the "int4" quantization setting above
)

peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout'],
    task_type="CAUSAL_LM"
)

lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

### Train model
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    # Token-level metrics for a causal LM: logits arrive as (batch, seq_len, vocab)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Align next-token predictions with their targets and drop the -100 padding positions
    predictions, labels = predictions[:, :-1], labels[:, 1:]
    mask = labels != -100
    predictions, labels = predictions[mask], labels[mask]
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    return results

# Tokenize the 'text' column so the Trainer receives input_ids rather than the raw CSV file names;
# the collator pads each batch and copies input_ids into labels for causal-LM training.
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=block_size)

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset['train'].column_names)
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details
trainer = transformers.Trainer(
    model=lora_model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    args=args_custom
)
trainer.train()
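
### Save and (optionally) publish the adapter
# The lines below are a sketch of a likely next step, not part of the original run:
# model_params['repo_id'] is defined above but never used, so this shows one way to persist
# the LoRA adapter locally and push it (plus the tokenizer) to that Hugging Face repo.
# Pushing assumes you are authenticated (e.g. via `huggingface-cli login`); otherwise keep
# only the local save.
trainer.save_model(model_params['project_name'])      # writes adapter weights + config locally
tokenizer.save_pretrained(model_params['project_name'])

# lora_model.push_to_hub(model_params['repo_id'])     # uncomment to publish the adapter
# tokenizer.push_to_hub(model_params['repo_id'])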