# Incremental QLoRA fine-tuning of the merged TinyLlama logger model on a JSONL log dataset.
import os
import sys

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
                          EarlyStoppingCallback, TrainingArguments)
from trl import SFTTrainer

# Base checkpoint (merged TinyLlama logger model), training data, and output paths.
NAME_OF_MODEL = "./merged_tinyllama_logger"
DATASET_PATH = "/app/data/log_dataset.jsonl"
OUTPUT_DIR = "/app/model_output/incremental_1_logs"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Quantization configuration: load the base model in 4-bit NF4 with double quantization
# and fp16 compute (QLoRA-style setup).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# LoRA adapter configuration, applied by SFTTrainer via peft_config.
lora_config = LoraConfig(
    r=32,
    lora_alpha=124,
    bias="none",
    lora_dropout=0.15,
    task_type="CAUSAL_LM"
)

# Training arguments: effective batch size of 64 (4 x 16 accumulation), cosine schedule,
# fp16 training, and step-based evaluation/checkpointing so early stopping can track eval_loss.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    weight_decay=0.001,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    num_train_epochs=4,
    logging_steps=10,
    save_steps=25,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to=["tensorboard"],
    eval_strategy="steps",
    eval_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Load the JSONL dataset and hold out 10% of it for evaluation.
try:
    dataset = load_dataset("json", data_files=DATASET_PATH)
    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
except Exception as e:
    print(f"Error loading dataset from {DATASET_PATH}: {e}")
    sys.exit(1)

print("Loading model with Quantization")

try:
    model=AutoModelForCausalLM.from_pretrained(
        NAME_OF_MODEL,
        quantization_config = bnb_config,
        device_map="auto",
        trust_remote_code = True,
        torch_dtype = torch.float16
    )
    model.config.pretraining_p=1
    print("Model loaded successfully")
except Exception as e:
    print("ERROR LOADING MODEL: {e}")
    exit(1)

try:
    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)
    # Use EOS as the pad token and right-padding, as expected for causal LM fine-tuning.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
except Exception as e:
    print(f"ERROR LOADING TOKENIZER: {e}")
    sys.exit(1)

# Supervised fine-tuning: SFTTrainer applies the LoRA adapter on top of the quantized model
# and stops early if eval_loss does not improve for 7 consecutive evaluations.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)]
)

print("training started")
trainer.train()
print("fine tuning complete")

trainer.save_model(OUTPUT_DIR)
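
# Optional smoke test (a sketch, not part of the original training flow): reload the saved
# LoRA adapter with AutoPeftModelForCausalLM and generate from a placeholder prompt.
# The prompt string below is an illustrative assumption; substitute a real example
# formatted like the records in log_dataset.jsonl.
from peft import AutoPeftModelForCausalLM

test_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    device_map="auto",
    torch_dtype=torch.float16
)
test_inputs = tokenizer("Example log line: connection timed out", return_tensors="pt").to(test_model.device)
test_outputs = test_model.generate(**test_inputs, max_new_tokens=64)
print(tokenizer.decode(test_outputs[0], skip_special_tokens=True))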