# shisa/app.py
# https://www.gradio.app/guides/using-hugging-face-integrations
import gradio as gr
import logging
import html
from pprint import pprint
import time
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
# Model
model_name = "augmxnt/shisa-7b-v1"
# UI Settings
title = "Shisa 7B"
description = "Test out <a href='https://huggingface.co/augmxnt/shisa-7b-v1'>Shisa 7B</a> in either English or Japanese. If you aren't getting the right language outputs, you can try changing the system prompt to the appropriate language.\n\nNote: we are running this model quantized at `load_in_4bit` to fit in 16GB of VRAM."
placeholder = "Type Here / ここに入力してください"
examples = [
["What are the best slices of pizza in New York City?"],
["東京でおすすめのラーメン屋ってどこ?"],
['How do I program a simple "hello world" in Python?'],
["Pythonでシンプルな「ハローワールド」をプログラムするにはどうすればいいですか?"],
]
# LLM Settings
# Initial
system_prompt = 'You are a helpful, bilingual assistant. Reply in the same language as the user.'
default_prompt = system_prompt
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # load_in_8bit=True,
    load_in_4bit=True,
    use_flash_attention_2=True,
)
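# BitsAndBytesConfig is imported above but not used directly; on newer transformers releases
# the same 4-bit load can be written out explicitly. This is a commented-out sketch for
# reference only, not what this Space actually runs:
# quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     quantization_config=quant_config,
#     attn_implementation="flash_attention_2",
# )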
def chat(message, history, system_prompt):
    if not system_prompt:
        system_prompt = default_prompt
    print('---')
    print('Prompt:', system_prompt)
    pprint(history)
    print(message)
    # Rebuild the full conversation on every call; it's simpler than tracking incremental state.
    chat_history = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        chat_history.append({"role": "user", "content": user_msg})
        chat_history.append({"role": "assistant", "content": assistant_msg})
    chat_history.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
    # With device_map="auto" (multi-GPU), send the inputs to the device of the model's first parameter.
    first_param_device = next(model.parameters()).device
    input_ids = input_ids.to(first_param_device)
    generate_kwargs = dict(
        inputs=input_ids,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.15,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    output_ids = model.generate(**generate_kwargs)
    # Drop the prompt tokens and decode only the newly generated ones.
    new_tokens = output_ids[0, input_ids.size(1):]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response
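# The Thread and TextIteratorStreamer imports above suggest a streaming variant. The function
# below is an illustrative sketch of how they could be used to yield partial responses; the
# name chat_streaming is hypothetical and it is not wired into the ChatInterface.
def chat_streaming(message, history, system_prompt):
    if not system_prompt:
        system_prompt = default_prompt
    chat_history = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        chat_history.append({"role": "user", "content": user_msg})
        chat_history.append({"role": "assistant", "content": assistant_msg})
    chat_history.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
    input_ids = input_ids.to(next(model.parameters()).device)
    # skip_prompt drops the echoed input; extra kwargs are forwarded to tokenizer.decode().
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.15,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Run generation in a background thread so we can yield text as it is produced.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial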
chat_interface = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
    title=title,
    description=description,
    theme="soft",
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    additional_inputs=[
        gr.Textbox(system_prompt, label="System Prompt (Change the language of the prompt for better replies)"),
    ],
)
# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219
# We wrap the interface in a gr.Blocks() context because Gradio's autoreload breaks otherwise.
with gr.Blocks() as demo:
    chat_interface.render()
    gr.Markdown("You can try asking questions in either Japanese or English. We limit output to 200 tokens.")

demo.queue().launch()