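# Environment setup -- run these in a shell (or prefix them with "!" in a
# notebook cell); they are not Python statements:
#   pip install --upgrade pip
#   pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
#
# Optional commands for checking the installed CUDA / GPU tooling:
#   nvcc
#   python -m xformers.info
#   python -m bitsandbytes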
from unsloth import FastLanguageModel

import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False


from transformers import AutoTokenizer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="DipeshChaudhary/ShareGPTChatBot-Counselchat1",  # Your fine-tuned model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


from unsloth.chat_templates import get_chat_template


tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)


import re
# Put the model into Unsloth's inference mode (native 2x faster inference).
FastLanguageModel.for_inference(model)

# Single-turn conversation in ShareGPT format, used as a quick smoke test.
messages = [{"from": "human", "value": "hlo"}]

# Render the conversation with the chat template and move the ids to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,  # required so the model starts a new reply
    tokenize=True,
).to("cuda")


from transformers import TextStreamer

# Streams tokens to stdout as they are generated.
text_streamer = TextStreamer(tokenizer)


x = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)


# Function to generate response
def generate_response(conversation_history, max_new_tokens=10000):
    """Generate the model's reply to a ShareGPT-style conversation.

    Args:
        conversation_history: List of message dicts with ``"from"`` and
            ``"value"`` keys, rendered through ``tokenizer.apply_chat_template``.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to the value this function has always used.

    Returns:
        The text of the newly generated tokens only, with special tokens
        removed and surrounding whitespace stripped.
    """
    prompt_ids = tokenizer.apply_chat_template(
        conversation_history,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    # Fall back to the EOS token for padding when the tokenizer has no pad token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        prompt_ids,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        # The prompt is a single sequence built without padding, so every
        # position is attended to. Deriving the mask from pad_token_id would
        # hide real end-of-turn tokens whenever the pad id equals the EOS id.
        attention_mask=torch.ones_like(prompt_ids),
    )

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them. Splitting the decoded text on the word "assistant" would
    # truncate any reply that happens to contain that word.
    prompt_length = prompt_ids.shape[-1]
    new_token_ids = output[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

# Example usage: interactive chat loop. Type "exit" (or send EOF / Ctrl-C) to quit.
conversation_history = []
while True:
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted; leave the loop cleanly
        # instead of ending with a traceback.
        print()
        print("Exiting...")
        break

    if user_input.strip().lower() == "exit":
        print("Exiting...")
        break

    # Append user message to history
    conversation_history.append({"from": "human", "value": user_input})

    # Generate response from the full conversation so far
    response = generate_response(conversation_history)

    # Store the reply under the "gpt" role: the chat template's mapping only
    # knows "human" and "gpt", so any other role name would not be rendered
    # as an assistant turn on the next prompt.
    conversation_history.append({"from": "gpt", "value": response})

    # Print bot's response
    print("Bot:", response)