|
import os
import subprocess
from uuid import uuid4

import numpy as np  # used by compute_metrics below
import pandas as pd

import evaluate
import transformers
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer
|
|
|
|
|
def max_token_len(dataset):
    """Return the length (in tokens) of the longest 'text' entry, using the global tokenizer."""
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length
|
|
|
|
|
|
|
model_name = 'mistralai/Mistral-7B-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)
|
|
|
|
|
# Load the dataset of interest.
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Write the splits to CSV and record the longest tokenized example in each.
data_directory = './fine_tune_data/'
os.makedirs(data_directory, exist_ok=True)

train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))

validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))

max_token_length = max(max_token_length_train, max_token_length_validation)

if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds the model's limit.")
block_size = 2 * max_token_length
print('Block size: ' + str(block_size))
|
|
|
|
|
# Run identifiers and hyperparameters.
username = 'ai-aerospace'
project_name = './llms/' + 'ams_data_train-100_' + str(uuid4())
repo_name = 'ams-data-train-100-' + str(uuid4())

model_params = {
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,  # longest example seen in the data
    "logging_steps": -1,  # -1 is used as an "auto" sentinel; see the fallback in TrainingArguments below
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}
|
# Export the hyperparameters as environment variables (presumably for downstream
# tooling such as an autotrain-style CLI; nothing below reads them directly).
for key, value in model_params.items():
    os.environ[key] = str(value)

print(model_params)
|
|
|
args_custom = transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    fp16=True,
    # model_params uses -1 as an "auto" sentinel; give TrainingArguments a concrete interval.
    logging_steps=model_params['logging_steps'] if model_params['logging_steps'] > 0 else 500,
    save_total_limit=model_params['save_total_limit'],
    evaluation_strategy=model_params['evaluation_strategy'],
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler']
)
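# Note: metric_for_best_model only takes effect when checkpoints are compared, e.g. with
# load_best_model_at_end=True (as in args_medium below); the "f1" key matches the value
# returned by compute_metrics further down.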
|
|
|
|
|
# Alternative, larger-scale preset (kept for reference; the Trainer below uses args_custom).
args_medium = transformers.TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=12276,
    learning_rate=2e-4,
    fp16=True,
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    remove_unused_columns=False,
    report_to='wandb'
)
|
|
|
|
|
|
|
# Load the base model in 4-bit precision (requires the bitsandbytes package and,
# in practice, a CUDA GPU).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True
)
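# Hedged alternative (assumption, not part of the original script): newer transformers
# releases prefer an explicit quantization config over the bare load_in_4bit flag, e.g.:
#
#   import torch
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name, quantization_config=quant_config, device_map="auto")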
|
peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout']
)
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()
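# Note (assumption): LoRA setups for causal LMs often also pass task_type="CAUSAL_LM"
# and explicit target_modules (e.g. ["q_proj", "v_proj"]) to LoraConfig; the call above
# relies on peft inferring suitable modules for this architecture.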
|
|
|
|
|
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
|
|
|
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Token-level predictions and labels arrive as 2-D arrays; flatten them and drop
    # positions the loss ignores (-100) before handing them to the metrics.
    predictions = predictions.flatten()
    labels = labels.flatten()
    mask = labels != -100
    predictions, labels = predictions[mask], labels[mask]
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    return results
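# --- Hedged sketch (assumption, not part of the original script): transformers.Trainer
# expects tokenized features rather than the raw 'text' column. The names below
# (tokenize_function, tokenized_train, tokenized_validation, collator) are illustrative;
# dynamic padding via the collator relies on the EOS-as-pad fallback set after the
# tokenizer was loaded above. These splits could be handed to the Trainer below in
# place of the raw ones.
def tokenize_function(examples):
    # Truncate to the longest observed example; padding is left to the collator.
    return tokenizer(examples['text'], truncation=True, max_length=model_params['model_max_length'])

tokenized_train = dataset['train'].map(
    tokenize_function, batched=True, remove_columns=dataset['train'].column_names)
tokenized_validation = dataset['validation'].map(
    tokenize_function, batched=True, remove_columns=dataset['validation'].column_names)

# mlm=False makes the collator copy input_ids into labels (padding masked to -100).
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)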
|
|
|
|
|
trainer = transformers.Trainer(
    model=lora_model,
    # Pass the dataset splits themselves (the model_params entries are just file stems).
    # transformers.Trainer expects tokenized features; the hedged sketch above shows one
    # way to produce them (tokenized_train / tokenized_validation plus collator).
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
    args=args_custom
)
trainer.train()
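# Hedged follow-up (assumption, not in the original script): persisting and publishing
# the adapter once training finishes. Pushing requires a valid Hugging Face token.
#
#   trainer.save_model(model_params['project_name'])
#   lora_model.push_to_hub(model_params['repo_id'])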