import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from threading import Thread
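# Streaming chat demo for tiiuae/falcon-rw-1b: model.generate() runs on a
# background thread while TextIteratorStreamer feeds decoded tokens back to a
# Gradio ChatInterface generator, so replies render token by token.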


title = "🦅Falcon 🗨️ChatBot"
description = "Falcon-RW-1B is a 1B-parameter causal decoder-only model built by TII and trained on 350B tokens of RefinedWeb."
examples = [["How are you?"]]


# Pick a device up front so the model and its inputs always live together.
# Note: torch_dtype is a model argument, not a tokenizer argument, so it is
# only passed to the model; float16 halves memory on GPU, while torch.float32
# is usually the safer choice on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-rw-1b",
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device)


class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Token ids this demo treats as end-of-turn markers; generation halts
        # as soon as the most recent token is one of them.
        stop_ids = [29, 0]
        return input_ids[0][-1].item() in stop_ids

def predict(message, history):
    # Append the new user message with an empty bot slot, then flatten the
    # whole conversation into the "<human>:/<bot>:" format this demo prompts with.
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    messages = "".join(
        "\n<human>:" + user + "\n<bot>:" + bot
        for user, bot in history_transformer_format
    )

    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,  # contributes input_ids and attention_mask
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=1.0,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
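    # Run generation on a background thread; the streamer then yields decoded
    # tokens on this thread as they are produced.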
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        # Skip '<' so the beginning of the next "<human>:" turn marker does
        # not leak into the displayed reply.
        if new_token != '<':
            partial_message += new_token
            yield partial_message
            

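# Note: retry_btn/undo_btn/clear_btn follow the Gradio 4.x ChatInterface API.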
gr.ChatInterface(
    predict,
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me"),
).queue().launch()