from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import gradio as gr

# Run everything in bfloat16 to halve memory use relative to float32
torch.set_default_dtype(torch.bfloat16)
# checkpoint = "vsrinivas/falconlite2"  # alternative checkpoint
checkpoint = "tiiuae/falcon-7b-instruct"

# device_map="auto" spreads the weights across available devices (requires
# accelerate); offload_folder is where layers that do not fit are spilled to disk.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="auto", offload_folder="offload", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
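
# Quick sanity check of the raw pipeline (illustrative prompt; uncomment to run):
# print(pipeline("Hello, how are you?", max_new_tokens=20)[0]["generated_text"])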
def format_chat_prompt(message, chat_history, instruction):
    """Flatten the system instruction, prior turns, and the new message into
    the plain-text dialogue format Falcon-instruct expects."""
    prompt = f"System:{instruction}"
    for turn in chat_history:
        user_message, bot_message = turn
        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
    prompt = f"{prompt}\nUser: {message}\nAssistant:"
    return prompt
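
# Illustrative example of the prompt this produces (values made up):
#   format_chat_prompt("How far is the Moon?", [["Hi", "Hello!"]], "Be concise.")
# returns:
#   System:Be concise.
#   User: Hi
#   Assistant: Hello!
#   User: How far is the Moon?
#   Assistant: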
def generate_seqs(prompt, max_new_tokens=None, stop_sequence=None, temperature=None):
    # max_length is intentionally not set: it conflicts with max_new_tokens
    # and transformers will not honor both.
    output = pipeline(prompt,
                      truncation=True,
                      max_new_tokens=max_new_tokens,
                      stop_sequence=stop_sequence,
                      temperature=temperature,
                      do_sample=True,
                      top_k=10,
                      num_return_sequences=1,
                      eos_token_id=tokenizer.eos_token_id)
    return output[0]['generated_text']
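
# Standalone usage sketch (illustrative values):
#   text = generate_seqs("User: Hello\nAssistant:",
#                        max_new_tokens=50, stop_sequence="\nUser:", temperature=0.7)
# Note: the transformers text-generation pipeline accepts stop_sequence as a
# single string, not a list.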
def respond(message, chat_history, instruction, temperature=0.7):
    prompt = format_chat_prompt(message, chat_history, instruction)
    chat_history = chat_history + [[message, ""]]
    # stop_sequence keeps the model from inventing the user's next turn
    # (end-of-text is already covered by eos_token_id). Falcon-7b-instruct
    # has a 2048-token context, so max_new_tokens stays well below that.
    stream = generate_seqs(prompt=prompt,
                           max_new_tokens=1024,
                           stop_sequence="\nUser:",
                           temperature=temperature).split('Assistant: ')[-1]
    acc_text = ""
    # Simulated streaming: generation has already finished, so replay the
    # answer one character at a time, yielding the updated history each step.
    for idx, text_token in enumerate(stream):
        if idx == 0 and text_token.startswith(" "):
            text_token = text_token[1:]  # drop the leading space of the reply
        acc_text += text_token
        last_turn = list(chat_history.pop(-1))
        last_turn[-1] += acc_text
        chat_history = chat_history + [last_turn]
        yield "", chat_history
        acc_text = ""
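
# Gradio treats a generator handler as a stream: each yielded ("", history)
# pair clears the textbox and repaints the chatbot, which produces the
# typing effect. This relies on demo.queue() being enabled below.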
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # General purpose chatbot - test & demo app by Srinivas V.
    ## This app runs on a free hosted platform with limited compute and memory, so it is slow and may stop giving appropriate answers after a few dialogue turns. Type in your prompt, click Submit, and wait for the response before entering your next prompt.
    """)
    chatbot = gr.Chatbot(height=1000)  # tall chat window
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot, system, temperature], value="Clear console")
    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])

gr.close_all()
demo.queue().launch()