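# Gradio chat demo for tiiuae/falcon-7b-instruct, served through the transformers
# text-generation pipeline (intended to run as a Hugging Face Space).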
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import gradio as gr
desired_dtype = torch.bfloat16
torch.set_default_dtype(desired_dtype)
# checkpoint = "vsrinivas/falconlite2"
checkpoint = "tiiuae/falcon-7b-instruct"
model = AutoModelForCausalLM.from_pretrained(
    # checkpoint, device_map="auto", offload_folder="offload", trust_remote_code=True, torch_dtype="auto")
    checkpoint, device_map="auto", offload_folder="offload", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
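# Wrap the already-loaded model and tokenizer in a text-generation pipeline;
# all generation below goes through this single pipeline object.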
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
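# Flatten the running chat history into a plain System / User / Assistant transcript,
# ending with "Assistant:" so the model continues with its reply.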
def format_chat_prompt(message, chat_history, instruction):
    prompt = f"System:{instruction}"
    for turn in chat_history:
        user_message, bot_message = turn
        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
    prompt = f"{prompt}\nUser: {message}\nAssistant:"
    return prompt
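# Single (non-streaming) generation call; the returned text still includes the prompt
# that was passed in, since the pipeline returns the full text by default.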
def generate_seqs(prompt, max_new_tokens=None, stop_sequence=None, temperature=None):
    # max_new_tokens limits only the generated continuation; max_length would also count
    # the prompt and conflicts with it, so it is left unset.
    output = pipeline(prompt,
                      truncation=True,
                      max_new_tokens=max_new_tokens,
                      stop_sequence=stop_sequence,
                      temperature=temperature,
                      do_sample=True,
                      top_k=10,
                      num_return_sequences=1,
                      eos_token_id=tokenizer.eos_token_id)
    return output[0]['generated_text']
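# Gradio callback used by both the Submit button and the textbox. It is a generator:
# the reply is yielded back one character at a time so the Chatbot updates incrementally.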
def respond(message, chat_history, instruction, temperature=0.7):
    prompt = format_chat_prompt(message, chat_history, instruction)
    chat_history = chat_history + [[message, ""]]
    # Falcon-7B's context window is 2048 tokens, so keep max_new_tokens well below that.
    # The pipeline's stop_sequence takes a single string; it is set to "\nUser:" so the
    # model does not go on to generate the user's next turn.
    stream = generate_seqs(prompt=prompt,
                           max_new_tokens=1024,
                           stop_sequence="\nUser:",
                           temperature=temperature).split('Assistant: ')[-1]
    acc_text = ""
    # "Stream" the tokens: yield the reply back character by character.
    for idx, response in enumerate(stream):
        # text_token = response.token.text
        text_token = response
        # if response.details:
        #     return
        if idx == 0 and text_token.startswith(" "):
            text_token = text_token[1:]
        acc_text += text_token
        last_turn = list(chat_history.pop(-1))
        last_turn[-1] += acc_text
        chat_history = chat_history + [last_turn]
        yield "", chat_history
        acc_text = ""
with gr.Blocks() as demo:
    gr.Markdown("""
# General purpose chatbot - test & demo app by Srinivas.V.
## As this runs on a free hosted platform (compute and memory limitations), you will find it slow, and the app may not provide appropriate answers after a few dialogue turns. Type in your prompt, click Submit, and wait for the response before typing your next prompt.
""")
    chatbot = gr.Chatbot(height=500)  # just to fit the notebook
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot, system, temperature], value="Clear console")
    # Both clicking the button and pressing Enter in the textbox call the streaming respond() generator.
    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])

gr.close_all()
demo.queue().launch()