import torch
# Replace torch.jit.script with an identity function: anything that calls or
# decorates with torch.jit.script after this line gets the plain Python
# callable back unchanged instead of a scripted one. The patch only affects
# uses that run after this assignment, so it sits ahead of the other imports.
# NOTE(review): presumably a workaround for a TorchScript failure in one of
# the libraries imported below — confirm which one still needs it.
torch.jit.script = lambda f: f
import spaces
import gradio as gr
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,AwqConfig
import torch
import os
import bitnet

from huggingface_hub import login
from bitnet import replace_linears_in_hf

# Hugging Face access token, read from the "key" environment variable.
# NOTE(review): when the variable is unset, None is passed to login() —
# confirm that is acceptable for the target deployment.
key = os.environ.get("key")
login(key)

# Create the download directory in-process rather than shelling out to
# `mkdir`: this is portable and does not emit an error when the directory
# already exists (e.g. on a restart).
os.makedirs("c4ai-command-r-v01-exl2", exist_ok=True)
# Fetch revision "6_5" of the exl2 repository into that directory.
# NOTE(review): nothing visible in this file reads from this directory —
# confirm whether the download is still needed.
os.system("huggingface-cli download bartowski/c4ai-command-r-v01-exl2 --revision 6_5 --local-dir c4ai-command-r-v01-exl2 --local-dir-use-symlinks False")
# os.system("pip install flash-attn --no-build-isolation")
# Checkpoint used for both the tokenizer and the model weights.
model_id = "CohereForAI/c4ai-command-r-v01"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Options that are currently disabled: load_in_8bit=True,
# attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16,
# and a post-load replace_linears_in_hf(model) pass.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,
    device_map="auto",
)

# Inference only: switch the module to evaluation mode.
model.eval()
@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    """Generate a single-turn chat reply to ``user_input``.

    Args:
        user_input: Text of the user's message.
        max_new_tokens: Upper bound on the number of generated tokens.
            Coerced to ``int`` because a UI slider may deliver a float.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded continuation only (prompt excluded), with special tokens
        removed and leading whitespace stripped.
    """
    os.system("nvidia-smi")  # debug output: GPU state before tokenization
    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    input_ids = input_ids.to(model.device)
    os.system("nvidia-smi")  # debug output: GPU state before generation
    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temperature,
    )

    # The generated sequence starts with the prompt tokens. Drop them by
    # length instead of string-matching the decoded text against
    # ``user_input``: that match fails (and leaks the prompt into the reply)
    # whenever the decoded prompt is not character-identical to the raw input,
    # and it could also eat a reply that legitimately begins with the input.
    prompt_length = input_ids.shape[-1]
    reply_tokens = gen_tokens[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).lstrip()


# Canned prompts with the generation settings to apply alongside each one.
examples = [
    {
        "message": "What is the weather like today?",
        "max_new_tokens": 250,
        "temperature": 0.5,
    },
    {
        "message": "Tell me a joke.",
        "max_new_tokens": 650,
        "temperature": 0.7,
    },
    {
        "message": "Explain the concept of machine learning.",
        "max_new_tokens": 980,
        "temperature": 0.4,
    },
]
# Dropdown labels, numbered from 1 and positionally aligned with `examples`.
example_choices = [f"Example {number}" for number, _ in enumerate(examples, start=1)]


def load_example(choice):
    """Return ``(message, max_new_tokens, temperature)`` for a dropdown label.

    Raises:
        ValueError: If ``choice`` is not one of ``example_choices``.
    """
    selected = examples[example_choices.index(choice)]
    fields = ("message", "max_new_tokens", "temperature")
    return tuple(selected[field] for field in fields)


# Page layout, top to bottom: sampling controls, prompt and response boxes,
# then the example loader.
with gr.Blocks() as demo:
    with gr.Row():
        max_new_tokens_slider = gr.Slider(
            label="Max New Tokens",
            minimum=100,
            maximum=4000,
            value=980,
        )
        temperature_slider = gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.3,
        )
    message_box = gr.Textbox(label="Your Message", lines=2)
    generate_button = gr.Button("Try🫡Command-R")
    output_box = gr.Textbox(label="🫡Command-R")

    # These three components are the inputs to generation and also the
    # targets that a loaded example fills in.
    generation_controls = [message_box, max_new_tokens_slider, temperature_slider]

    generate_button.click(
        fn=generate_response,
        inputs=generation_controls,
        outputs=output_box,
    )

    example_dropdown = gr.Dropdown(choices=example_choices, label="🫡Load Example")
    example_button = gr.Button("🫡Load")
    example_button.click(
        fn=load_example,
        inputs=example_dropdown,
        outputs=generation_controls,
    )

demo.launch()