# https://www.gradio.app/guides/using-hugging-face-integrations

import gradio as gr
import logging
import html
from   pprint import pprint
import time
import torch
from   threading import Thread
from   transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

# Model: Hugging Face Hub repository id of the checkpoint served by this demo.
model_name = "augmxnt/shisa-7b-v1"

# UI Settings: static text shown on the chat page.
title = "Shisa 7B"
# Adjacent string literals are joined at compile time into one string.
description = (
    "Test out <a href='https://huggingface.co/augmxnt/shisa-7b-v1'>Shisa 7B</a> "
    "in either English or Japanese. "
    "If you aren't getting the right language outputs, you can try changing "
    "the system prompt to the appropriate language."
    "\n\n"
    "Note: we are running this model quantized at `load_in_4bit` "
    "to fit in 16GB of VRAM."
)
placeholder = "Type Here / ここに入力してください"
# Each example is wrapped in its own single-element list.
examples = [
    [sample_prompt]
    for sample_prompt in (
        "What are the best slices of pizza in New York City?",
        "東京でおすすめのラーメン屋ってどこ？",
        'How do I program a simple "hello world" in Python?',
        "Pythonでシンプルな「ハローワールド」をプログラムするにはどうすればいいですか？",
    )
]

# LLM Settings
# Initial system prompt; `default_prompt` is the fallback chat() uses when the
# user clears the system-prompt box.
default_prompt = system_prompt = 'You are a helpful, bilingual assistant. Reply in same language as the user.'

# Load the tokenizer and the 4-bit quantized model once at import time; chat()
# reads both as module-level globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # Quantization options go through BitsAndBytesConfig; passing
    # `load_in_4bit` straight to from_pretrained() is the deprecated spelling.
    # For 8-bit weights use BitsAndBytesConfig(load_in_8bit=True) instead.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    # NOTE(review): newer transformers releases spell this
    # attn_implementation="flash_attention_2" - confirm against the pinned version.
    use_flash_attention_2=True,
)

def _build_messages(message, history, system_prompt):
    """Assemble the system prompt, prior turns and new message as role/content dicts."""
    messages = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history or []:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})
    return messages


def chat(message, history, system_prompt):
    """Generate the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Earlier turns as ``[user_text, assistant_text]`` pairs; may be
            empty or ``None``.
        system_prompt: System prompt for this request. A falsy value falls
            back to the module-level ``default_prompt``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off and special tokens are skipped). Generation is capped at
        200 new tokens.
    """
    if not system_prompt:
        system_prompt = default_prompt

    # Debug trace of the request on stdout.
    print('---')
    print('Prompt:', system_prompt)
    pprint(history)
    print(message)

    # The conversation is rebuilt from scratch on every call; that is simpler
    # than keeping incremental state between requests.
    chat_history = _build_messages(message, history, system_prompt)

    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")

    # For multi-GPU setups, send the prompt to the device holding the model's
    # first parameter.
    first_param_device = next(model.parameters()).device
    input_ids = input_ids.to(first_param_device)

    output_ids = model.generate(
        inputs=input_ids,
        # pad_token_id equals eos_token_id below, so generate() cannot infer
        # the mask from the ids. The single prompt has no padding, so every
        # position is a real token and an all-ones mask is correct.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.15,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; keep only the completion.
    new_tokens = output_ids[0, input_ids.size(1):]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Widgets for the chat page: the conversation pane, the message box, and an
# editable system prompt supplied to chat() as an additional input.
conversation_pane = gr.Chatbot(height=400)
message_box = gr.Textbox(placeholder=placeholder, container=False, scale=7)
system_prompt_box = gr.Textbox(
    system_prompt,
    label="System Prompt (Change the language of the prompt for better replies)",
)

# Chat UI wired to chat(); example prompts are shown but not pre-computed.
chat_interface = gr.ChatInterface(
    chat,
    chatbot=conversation_pane,
    textbox=message_box,
    title=title,
    description=description,
    theme="soft",
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    additional_inputs=[system_prompt_box],
)

# The ChatInterface is rendered inside an outer Blocks page instead of being
# launched directly, because Gradio barfs on autoreload otherwise. Pattern from
# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219
footer_note = "You can try asking this question in Japanese or English. We limit output to 200 tokens."

with gr.Blocks() as demo:
    chat_interface.render()
    gr.Markdown(footer_note)

# Enable the request queue, then start the web server.
demo.queue()
demo.launch()