from datasets import load_dataset
from peft import LoraConfig, get_peft_model

import os
from uuid import uuid4

import numpy as np
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

import evaluate

### Define functions
def max_token_len(dataset):
    """Return the longest tokenized sequence length (in tokens) in a dataset split."""
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length

### Set up models and datasets, training parameters
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)

# dataset = load_dataset("imdb", split="train")
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)


# Write dataset files into data directory
data_directory = './fine_tune_data/'

# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))

# Write the validation data to a CSV file
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))
      
max_token_length = max(max_token_length_train, max_token_length_validation)
# max_token_length = max_token_length_train
if max_token_length > model_max_length:
    raise ValueError(
        f"Maximum token length {max_token_length} exceeds the model limit of {model_max_length}."
    )
block_size = 2 * max_token_length  # leave headroom above the longest sequence
print('Block size: ' + str(block_size))

# Define project parameters
username = 'ai-aerospace'
run_id = str(uuid4())  # a single id so the local project and Hub repo names match
project_name = './llms/' + 'ams_data_train-100_' + run_id
repo_name = 'ams-data-train-100-' + run_id

model_params={
  "project_name": project_name,
  "model_name": model_name,
  "repo_id": username+'/'+repo_name,
  "train_data": train_data,
  "validation_data": validation_data,
  "data_directory": data_directory,
  "block_size": block_size,
  "model_max_length": max_token_length,
  "logging_steps": -1,
  "evaluation_strategy": "epoch",
  "save_total_limit": 1,
  "save_strategy": "epoch",
  "mixed_precision": "fp16",
  "lr": 0.00003,
  "epochs": 3,
  "batch_size": 2,
  "warmup_ratio": 0.1,
  "gradient_accumulation": 1,
  "optimizer": "adamw_torch",
  "scheduler": "linear",
  "weight_decay": 0,
  "max_grad_norm": 1,
  "seed": 42,
  "quantization": "int4",
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_dropout": 0.05
}
# Expose the run parameters as environment variables for any downstream tooling
for key, value in model_params.items():
  os.environ[key] = str(value)

print(model_params)

args_custom = transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    fp16=True,
    logging_steps=model_params['logging_steps'],
    save_strategy=model_params['save_strategy'],  # match eval strategy so best-model tracking works
    save_total_limit=model_params['save_total_limit'],
    evaluation_strategy=model_params['evaluation_strategy'],
    load_best_model_at_end=True,  # required for metric_for_best_model to take effect
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler']
)

### Reference args from a Medium article (kept for comparison; not used below)
args_medium=transformers.TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=12276,
    learning_rate=2e-4,
    fp16=True,
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    remove_unused_columns=False,
    report_to='wandb'  # enable logging to W&B
)
###

### Load model and peft config, calculate trainable parameters
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True
)
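# 4-bit loading requires the bitsandbytes package. A commonly recommended extra
# step (an assumption, taken from peft's standard k-bit workflow): prepare the
# quantized model for training, which casts norms to fp32 and enables input
# gradients, before attaching LoRA adapters.
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)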
peft_config = LoraConfig(
    task_type='CAUSAL_LM',  # tells peft to wrap the model as a causal LM
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout']
)
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()
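# print_trainable_parameters reports counts in the form:
# trainable params: ... || all params: ... || trainable%: ...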

### Train model
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    # Token-level metrics: argmax over the vocabulary axis gives the predicted
    # token id at each position; flatten so the metrics see 1-D label lists.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1).flatten()
    labels = labels.flatten()
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))

    return results
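
# transformers.Trainer expects tokenized features rather than raw text, so the
# splits are tokenized first. A minimal sketch, assuming the 'text' column
# holds the full training string; labels copy input_ids, the standard
# causal-LM setup.
def tokenize_function(examples):
    tokens = tokenizer(
        examples['text'],
        truncation=True,
        max_length=min(block_size, model_max_length),  # never pad past the model limit
        padding='max_length'
    )
    tokens['labels'] = tokens['input_ids'].copy()  # the model predicts its own input
    return tokens

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)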

# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details
trainer = transformers.Trainer(
    model=lora_model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    args=args_custom
)
trainer.train()
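
### Save and (optionally) publish the adapter
# A minimal sketch: save_pretrained/push_to_hub are standard peft-model
# methods; pushing assumes a prior `huggingface-cli login`.
lora_model.save_pretrained(model_params['project_name'])
tokenizer.save_pretrained(model_params['project_name'])
# lora_model.push_to_hub(model_params['repo_id'])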