# train_csv_dataset_phi-2-super.py
# QLoRA fine-tuning script for abacaj/phi-2-super on a prompt/completion CSV dataset.
import time
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training
# Load the training data. The original script does not specify a data source; the CSV path below
# is a placeholder: point data_files at your own file with "prompt" and "completion" columns.
dataset = load_dataset("csv", data_files="train.csv", split="train")
if torch.cuda.is_available():
    print("CUDA is available")
base_model_id = "abacaj/phi-2-super"
output_dir = "./results_phi-2-super"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print("pad_token was missing and has been set to eos_token")
# Configuration to load the model with 4-bit (NF4) quantization
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
bnb_4bit_quant_type='nf4',
#bnb_4bit_compute_dtype='float16',
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=False)
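# NF4 stores the frozen base weights as 4-bit normal-float values and dequantizes them to bfloat16
# on the fly during the forward pass; double quantization (quantizing the quantization constants
# themselves) is disabled here, which costs a little extra memory compared to the full QLoRA recipe.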
model = AutoModelForCausalLM.from_pretrained(base_model_id, attn_implementation="flash_attention_2", quantization_config=bnb_config, torch_dtype="auto")
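# NOTE: attn_implementation="flash_attention_2" requires the flash-attn package and an
# Ampere-or-newer GPU; remove the argument to fall back to the default attention implementation.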
print(model)
# Gradient checkpointing to save memory
model.gradient_checkpointing_enable()
# Freeze the base model layers and cast layer norms to fp32
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
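# prepare_model_for_kbit_training freezes the quantized base weights, upcasts layer norms (and other
# non-quantized parameters) to float32 for numerical stability, and enables input gradients so the
# LoRA adapters added below can backpropagate through the frozen layers.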
peft_config = LoraConfig(
r=64,
lora_alpha=64,
target_modules= ["q_proj","k_proj","v_proj","dense","fc2","fc1"],
bias="none",
lora_dropout=0.05,
task_type="CAUSAL_LM",
)
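# Optional sanity check (a sketch, not part of the original script): SFTTrainer applies peft_config
# itself, but you can preview how few parameters LoRA actually trains with:
#   from peft import get_peft_model
#   get_peft_model(model, peft_config).print_trainable_parameters()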
training_args = TrainingArguments(
output_dir=output_dir, # Output directory for checkpoints and predictions
overwrite_output_dir=True, # Overwrite the content of the output directory
per_device_train_batch_size=2, # Batch size for training
per_device_eval_batch_size=2, # Batch size for evaluation
gradient_accumulation_steps=5, # number of steps before optimizing
gradient_checkpointing=True, # Enable gradient checkpointing
gradient_checkpointing_kwargs={"use_reentrant": False},
warmup_steps=10, # Number of warmup steps
#max_steps=1000, # Total number of training steps
num_train_epochs=100, # Number of training epochs
learning_rate=5e-5, # Learning rate
weight_decay=0.01, # Weight decay
    optim="paged_adamw_8bit",        # Paged 8-bit AdamW keeps the optimizer state quantized to save memory
    bf16=True,                       # Use bfloat16 mixed-precision training
    # Logging, checkpointing and evaluation
    logging_dir='./logs',
    logging_strategy="epoch",
    logging_steps=10,                # Ignored while logging_strategy="epoch"
    save_strategy="epoch",
    save_steps=10,                   # Ignored while save_strategy="epoch"
    save_total_limit=2,              # Limit the total number of checkpoints kept on disk
    evaluation_strategy="epoch",
    eval_steps=10,                   # Ignored while evaluation_strategy="epoch"
load_best_model_at_end=True, # Load the best model at the end of training
lr_scheduler_type="linear",
)
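# With per_device_train_batch_size=2 and gradient_accumulation_steps=5, each optimizer step sees an
# effective batch of 2 * 5 = 10 packed sequences per GPU.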
def formatting_func(data):
return f"[INST] {data['prompt']} [/INST]{data['completion']}{tokenizer.eos_token} "
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,             # Pass the tokenizer explicitly so the pad_token fix above is used
    train_dataset=dataset,
    eval_dataset=dataset,            # NOTE: evaluates on the training set; use a held-out split if available
    peft_config=peft_config,
    args=training_args,
    max_seq_length=1024,
    packing=True,
    formatting_func=formatting_func,
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
start_time = time.time() # Record the start time
trainer.train()
end_time = time.time() # Record the end time
training_time = end_time - start_time  # Total wall-clock training time
trainer.save_model(output_dir)  # Save the trained LoRA adapter to output_dir
print(f"Training completed in {training_time:.2f} seconds.")