In [1]:
import sys
# import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig


 from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Keyword arguments for `TrainingArguments`; kept as a plain dict so the run
# settings can be read and tweaked in one place.
training_config = dict(
    bf16=False,
    do_eval=False,
    learning_rate=5.0e-06,
    log_level="info",
    logging_steps=50,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    max_steps=-1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    # Per-device batch sizes: lower these to reduce memory usage.
    per_device_eval_batch_size=4,
    per_device_train_batch_size=8,
    remove_unused_columns=True,
    save_steps=500,
    save_total_limit=1,
    seed=0,
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
)

# Keyword arguments for `LoraConfig` (the LoRA adapter settings).
peft_config = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=None,
)

# Materialise the typed config objects that the trainer cell consumes.
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

In [3]:
################
# Model Loading
################
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

# Resolve the target device *before* loading. Flash Attention 2 needs a CUDA
# device, so the attention backend and dtype must follow the device choice;
# otherwise the CPU fallback below could never work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = device.type == "cuda"

model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    # Flash Attention 2 on GPU; plain "eager" attention when no GPU is present.
    attn_implementation="flash_attention_2" if use_cuda else "eager",
    # Half precision on GPU. NOTE(review): float32 is chosen for the CPU path
    # on the assumption that half-precision CPU kernels are incomplete; that
    # path is untested here, so confirm before relying on it.
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    device_map=None,
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

# The model is loaded on CPU (device_map=None) and moved afterwards; the bare
# expression also displays the module tree as the cell output.
model.to(device)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [05:34<00:00, 167.47s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Phi3ForCausalLM(
 (model): Phi3Model(
 (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
 (embed_dropout): Dropout(p=0.0, inplace=False)
 (layers): ModuleList(
 (0-31): 32 x Phi3DecoderLayer(
 (self_attn): Phi3FlashAttention2(
 (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
 (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
 (rotary_emb): Phi3RotaryEmbedding()
 )
 (mlp): Phi3MLP(
 (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
 (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
 (activation_fn): SiLU()
 )
 (input_layernorm): Phi3RMSNorm()
 (resid_attn_dropout): Dropout(p=0.0, inplace=False)
 (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
 (post_attention_layernorm): Phi3RMSNorm()
 )
 )
 (norm): Phi3RMSNorm()
 )
 (lm_head): Linear(in_features=3072, out_features=32064, bias=False)
)

In [4]:
##################
# Data Processing
##################
def apply_chat_template(example, tokenizer):
    """Render one conversation into a single string stored under "text".

    Args:
        example: Mapping with a "messages" list of dicts that carry at least
            a "role" key (a "content" key is added for the inserted entry).
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=..., add_generation_prompt=...)``.

    Returns:
        The same ``example`` object, with the rendered conversation added
        under the "text" key. An empty system message is inserted in place at
        the front of ``example["messages"]`` when the conversation does not
        already start with one.
    """
    messages = example["messages"]
    # Add an empty system message if there is none. The emptiness check comes
    # first so that a row with no messages does not raise IndexError.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
train_dataset, test_dataset = raw_dataset["train_sft"], raw_dataset["test_sft"]
# Column names of the train split; they are removed from both processed splits.
column_names = list(train_dataset.features)


def _render_split(split, desc):
    """Run `apply_chat_template` over every row of `split`.

    The columns listed in `column_names` are removed from the result, and
    `desc` is passed through to `map` unchanged.
    """
    return split.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=column_names,
        desc=desc,
    )


processed_train_dataset = _render_split(
    train_dataset, "Applying chat template to train_sft"
)
processed_test_dataset = _render_split(
    test_dataset, "Applying chat template to test_sft"
)

In [5]:
###########
# Training
###########
# Collect the trainer arguments up front so the constructor call stays short.
sft_kwargs = dict(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True,
)
trainer = SFTTrainer(**sft_kwargs)

# Run training, then log and save the resulting metrics and the trainer state.
train_result = trainer.train()
metrics = train_result.metrics
for record in (trainer.log_metrics, trainer.save_metrics):
    record("train", metrics)
trainer.save_state()

***** Running training *****
 Num examples = 140,320
 Num Epochs = 1
 Instantaneous batch size per device = 8
 Total train batch size (w. parallel, distributed & accumulation) = 8
 Gradient Accumulation steps = 1
 Total optimization steps = 17,540
 Number of trainable parameters = 25,165,824
 0%| | 0/17540 [00:00
 15 train_result = trainer.train()
 16 metrics = train_result.metrics
 17 trainer.log_metrics("train", metrics)
File e:\Users\frink\Documents\GitHub\LLM Things\Phi-3-training-Low-Ram\venv\lib\site-packages\trl\trainer\sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
 358 if self.neftune_noise_alpha is not None

In [None]:
#############
# Evaluation
#############
# Switch the tokenizer to left padding before evaluating.
tokenizer.padding_side = 'left'

# Evaluate, then attach the evaluation-set size to the reported metrics.
metrics = {**trainer.evaluate(), "eval_samples": len(processed_test_dataset)}
for record in (trainer.log_metrics, trainer.save_metrics):
    record("eval", metrics)

In [None]:
############
# Save model
############
# Write the model to the output directory configured in the training arguments.
trainer.save_model(output_dir=train_conf.output_dir)