import signal
import sys

import torch
from datasets import load_dataset
from peft import LoraConfig
from schedulefree import AdamWScheduleFree
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer


# Signal handler function
def signal_handler(sig, frame):
    print('You pressed Ctrl+C! Exiting...')
    sys.exit(0)


# Register signal handler
signal.signal(signal.SIGINT, signal_handler)

# Base checkpoint to fine-tune. Placeholder: set this to the model you are training;
# the "0.w1"/"0.w2"/"0.w3" LoRA targets below suggest an MoE-style architecture.
model_id = "your-org/your-base-model"

dataset = load_dataset("Crystalcareai/Orca-Reka")['train']


def chatml_format(example):
    """Format the dataset for training, accounting for empty columns."""
    return {
        "instruction": example['instruction'] if 'instruction' in example else " \n",
        "input": example['input'] if 'input' in example else " \n",
        "system": example['system'] if 'system' in example else " \n",
        "output": example['output'] if 'output' in example else " \n",
    }


# Format dataset
dataset = dataset.map(chatml_format, remove_columns=dataset.column_names)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right'  # to prevent warnings

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    target_modules=[
        "0.w1", "0.w2", "0.w3",
        "q_proj", "v_proj", "k_proj", "o_proj"
    ],
    task_type="CAUSAL_LM",
    use_dora=False,  # set to True to enable the DoRA method
)

args = TrainingArguments(
    output_dir="./out",              # directory to save checkpoints (and repository id)
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    gradient_checkpointing=True,     # use gradient checkpointing to save memory
    optim="adamw_hf",                # ignored: a custom optimizer is passed to SFTTrainer below
    logging_steps=2,
    save_strategy="steps",
    save_steps=300,
    bf16=True,                       # use bfloat16 precision
    tf32=True,                       # use tf32 precision
    ### peft specific arguments ###
    learning_rate=2e-4,
    max_grad_norm=0.3,
    warmup_ratio=0.00,
    lr_scheduler_type="constant",    # no LR schedule; the schedule-free optimizer handles it
    report_to="wandb",
    push_to_hub=False,               # push model to hub
)

max_seq_length = 2048  # max sequence length for model and packing of the dataset

# Create the schedule-free optimizer (AdamWScheduleFree takes `betas`, not `beta`)
optimizer = AdamWScheduleFree(model.parameters(), lr=args.learning_rate, betas=(0.9, 0.999))

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    ### peft specific arguments ###
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=False,
    optimizers=(optimizer, None),  # pass the schedule-free optimizer; no LR scheduler
)

# Schedule-free optimizers must be switched to train mode before any optimizer step
optimizer.train()

# Start training; checkpoints are written to output_dir (push_to_hub is disabled above)
trainer.train()

# Switch the optimizer to eval mode so the evaluation weights are the ones saved
optimizer.eval()

# Save model
trainer.save_model()
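
# A minimal launch sketch, assuming the script is saved as train_sft.py (hypothetical
# filename) and run on a bfloat16-capable GPU; flash-attn is only needed because
# attn_implementation="flash_attention_2" is requested above:
#
#   pip install torch transformers datasets trl peft schedulefree wandb
#   pip install flash-attn --no-build-isolation
#   python train_sft.py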