# https://www.gradio.app/guides/using-hugging-face-integrations
import gradio as gr
import html
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model
model_name = "augmxnt/shisa-7b-v1"

# UI Settings
title = "Shisa 7B"
description = "Test out Shisa 7B in either English or Japanese."
placeholder = "Type Here / ここに入力してください"
examples = [
    "What's the best ramen in Tokyo?",
    "あなたは熱狂的なポケモンファンです。",  # "You are an enthusiastic Pokémon fan."
    "東京でおすすめのラーメン屋ってどこ?",  # "Where's a good ramen shop in Tokyo?"
]

# LLM Settings
system_prompt = "あなたは役に立つアシスタントです。"  # "You are a helpful assistant."

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_8bit=True,  # newer transformers prefer quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    # load_in_4bit=True
)


def chat(message, history):
    # Rebuild the conversation from Gradio's per-session history rather than a
    # module-level list, so concurrent users don't share state and the model
    # sees its own previous replies.
    chat_history = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        chat_history.append({"role": "user", "content": user_msg})
        chat_history.append({"role": "assistant", "content": assistant_msg})
    chat_history.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
    # For multi-GPU (device_map="auto"), send the inputs to whichever device
    # holds the first parameter of the model.
    first_param_device = next(model.parameters()).device
    input_ids = input_ids.to(first_param_device)

    # Create a fresh streamer per request; reusing a shared one can leak
    # tokens between requests or break after a timeout.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.15,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Run generation on a background thread and stream tokens back to the UI:
    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token  # or html.escape(new_token) to sanitize
        yield partial_message


chat_interface = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
    title=title,
    description=description,
    theme="soft",
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219
# We wrap the interface in a Blocks context because Gradio's autoreload chokes
# on a bare ChatInterface otherwise.
with gr.Blocks() as demo:
    chat_interface.render()
    gr.Markdown("You can try asking questions in Japanese or English. Output is limited to 200 tokens.")

demo.queue().launch()
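
# To run locally: `python app.py`, or `gradio app.py` to use Gradio's reload
# mode during development (the reason for the gr.Blocks wrapper above).
# 8-bit loading assumes a CUDA GPU with the `bitsandbytes` package installed;
# remove load_in_8bit=True to fall back to plain bfloat16.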