import os
import sys

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    TrainingArguments,
)
from trl import SFTTrainer

NAME_OF_MODEL = "./merged_tinyllama_logger"
DATASET_PATH = "/app/data/log_dataset.jsonl"
OUTPUT_DIR = "/app/model_output/incremental_1_logs"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4-bit NF4 quantization so the base model fits in GPU memory during QLoRA fine-tuning.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA adapter configuration. target_modules is not set, so PEFT uses its built-in
# defaults for the detected model architecture.
lora_config = LoraConfig(
    r=32,
    lora_alpha=124,
    lora_dropout=0.15,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training hyperparameters. Effective batch size per optimizer step: 4 * 16 = 64.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=4,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    optim="paged_adamw_8bit",
    fp16=True,
    bf16=False,
    group_by_length=True,
    logging_steps=10,
    save_steps=25,
    eval_strategy="steps",
    eval_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=["tensorboard"],
)

# Load the JSONL dataset and hold out 10% of it for evaluation.
try:
    dataset = load_dataset("json", data_files=DATASET_PATH)
    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
except Exception as e:
    print(f"Error loading dataset from {DATASET_PATH}: {e}")
    sys.exit(1)

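# The trainer below reads the "text" field of each record (dataset_text_field="text").
# A minimal sketch of the assumed JSONL layout -- the exact prompt format inside
# log_dataset.jsonl is an assumption, not confirmed by this script:
#
#   {"text": "### Instruction: parse this log line...\n### Response: ..."}
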
print("Loading model with Quantization")
|
|
|
|
try:
|
|
model=AutoModelForCausalLM.from_pretrained(
|
|
NAME_OF_MODEL,
|
|
quantization_config = bnb_config,
|
|
device_map="auto",
|
|
trust_remote_code = True,
|
|
torch_dtype = torch.float16
|
|
)
|
|
model.config.pretraining_p=1
|
|
print("Model loaded successfully")
|
|
except Exception as e:
|
|
print("ERROR LOADING MODEL: {e}")
|
|
exit(1)
|
|
|
|
try:
    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)
    # The base tokenizer has no dedicated pad token, so reuse EOS and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
except Exception as e:
    print(f"ERROR LOADING TOKENIZER: {e}")
    sys.exit(1)

# SFTTrainer applies the LoRA adapter on top of the quantized model (QLoRA).
# Note: this uses the older TRL API; newer TRL releases move dataset_text_field,
# max_seq_length and packing into SFTConfig.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
)

print("training started")
|
|
trainer.train()
|
|
print("fine tuning complete")
|
|
|
|
trainer.save_model(OUTPUT_DIR)
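
# A minimal follow-up sketch (not part of this training run): how the saved adapter
# could be merged back into the base model for deployment. The merge step and the
# output path are assumptions based on the setup above, not a confirmed workflow.
#
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(NAME_OF_MODEL, torch_dtype=torch.float16)
#   merged = PeftModel.from_pretrained(base, OUTPUT_DIR).merge_and_unload()
#   merged.save_pretrained("/app/model_output/merged_incremental_1")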