# To load the dataset
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import torch

# Load the training dataset
data_name = "mlabonne/guanaco-llama2-1k"
training_data = load_dataset(data_name, split='train')

# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token   # Llama 2 has no pad token; reuse EOS
llama_tokenizer.padding_side = 'right'

# Quantization config: load the base model in 4-bit NF4 with float16 compute
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map='auto'
)
base_model.config.use_cache = False      # disable the KV cache during training
base_model.config.pretraining_tp = 1     # tensor-parallel degree used at pretraining; 1 = standard linear layers

'''
Double quantization is a technique where the quantization constants themselves
are quantized a second time, further reducing the memory footprint of the
quantized model at the cost of a little extra computation.
'''

'''
LoRA-specific parameters

Dropout rate (lora_dropout): the probability that each LoRA activation is set
to zero during training, used to prevent overfitting.

Rank (r): the rank of the low-rank matrices used to approximate the weight
updates. It determines how the update matrices are decomposed into simpler,
smaller matrices, which reduces computational requirements and memory
consumption. Lower ranks make training cheaper but may sacrifice quality.
The original LoRA paper suggests starting with a rank of 8; the QLoRA paper
uses a rank of 64 in its experiments.

lora_alpha: the scaling factor applied to the low-rank update. It balances the
original weights against the low-rank approximation: higher values make the
adapter more influential during fine-tuning, affecting both quality and
training dynamics.
'''

# LoRA config
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias='none',
    task_type='CAUSAL_LM'
)

# Training arguments
train_params = TrainingArguments(
    output_dir="./",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params
)

# Call the train function
fine_tuning.train()

# Save the fine-tuned adapter locally
fine_tuning.save_model("llama_7b_james")

# Upload to the Hugging Face Hub
model_name = "llama7b__finetune_sample"
HUGGING_FACE_USER_NAME = "james92"
fine_tuning.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}")
print("Model is saved to the Hugging Face Hub")
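
# --- Optional sanity check: generate a sample with the trained adapter ---
# This is an illustrative sketch, not part of the original run. It assumes
# fine_tuning.model (the PEFT-wrapped model held by the trainer) and the
# Llama-2 chat prompt format used by guanaco-llama2-1k; adjust the prompt
# to your own data if it differs.
fine_tuning.model.config.use_cache = True   # re-enable the KV cache for faster generation
sample_prompt = "[INST] What is low-rank adaptation? [/INST]"
sample_inputs = llama_tokenizer(sample_prompt, return_tensors="pt").to(fine_tuning.model.device)
with torch.no_grad():
    sample_output = fine_tuning.model.generate(**sample_inputs, max_new_tokens=128)
print(llama_tokenizer.decode(sample_output[0], skip_special_tokens=True))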