import signal
import sys

import torch
from datasets import load_dataset
from peft import LoraConfig
from schedulefree import AdamWScheduleFree
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer


# Signal handler function
def signal_handler(sig, frame):
    print('You pressed Ctrl+C! Exiting...')
    sys.exit(0)


# Register signal handler
signal.signal(signal.SIGINT, signal_handler)

# Base checkpoint to fine-tune. Placeholder: set this to the model you are training;
# the "0.w1"/"0.w2"/"0.w3" LoRA targets below suggest an MoE-style architecture.
model_id = "your-org/your-base-model"

dataset = load_dataset("Crystalcareai/Orca-Reka")['train']


def chatml_format(example):
    """Format the dataset for training, accounting for empty columns."""
    return {
        "instruction": example['instruction'] if 'instruction' in example else " \n",
        "input": example['input'] if 'input' in example else " \n",
        "system": example['system'] if 'system' in example else " \n",
        "output": example['output'] if 'output' in example else " \n",
    }


# Format dataset
dataset = dataset.map(chatml_format, remove_columns=dataset.column_names)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right'  # to prevent warnings

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    target_modules=[
        "0.w1", "0.w2", "0.w3",
        "q_proj", "v_proj", "k_proj", "o_proj"
    ],
    task_type="CAUSAL_LM",
    use_dora=False,  # set to True to enable the DoRA method
)

args = TrainingArguments(
    output_dir="./out",              # directory to save checkpoints (and repository id)
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    gradient_checkpointing=True,     # use gradient checkpointing to save memory
    optim="adamw_hf",                # ignored: a custom optimizer is passed to SFTTrainer below
    logging_steps=2,
    save_strategy="steps",
    save_steps=300,
    bf16=True,                       # use bfloat16 precision
    tf32=True,                       # use tf32 precision
    ### peft specific arguments ###
    learning_rate=2e-4,
    max_grad_norm=0.3,
    warmup_ratio=0.00,
    lr_scheduler_type="constant",    # no LR schedule; the schedule-free optimizer handles it
    report_to="wandb",
    push_to_hub=False,               # push model to hub
)

max_seq_length = 2048  # max sequence length for model and packing of the dataset

# Create the schedule-free optimizer (AdamWScheduleFree takes `betas`, not `beta`)
optimizer = AdamWScheduleFree(model.parameters(), lr=args.learning_rate, betas=(0.9, 0.999))

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    ### peft specific arguments ###
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=False,
    optimizers=(optimizer, None),  # pass the schedule-free optimizer; no LR scheduler
)

# Schedule-free optimizers must be switched to train mode before any optimizer step
optimizer.train()

# Start training; checkpoints are written to output_dir (push_to_hub is disabled above)
trainer.train()

# Switch the optimizer to eval mode so the evaluation weights are the ones saved
optimizer.eval()

# Save model
trainer.save_model()
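
# A minimal launch sketch, assuming the script is saved as train_sft.py (hypothetical
# filename) and run on a bfloat16-capable GPU; flash-attn is only needed because
# attn_implementation="flash_attention_2" is requested above:
#
#   pip install torch transformers datasets trl peft schedulefree wandb
#   pip install flash-attn --no-build-isolation
#   python train_sft.py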