File size: 3,893 Bytes
1094afa
 
 
c47b001
c500491
2c98ceb
e181201
2c98ceb
e181201
 
18b76ff
7346e4c
e181201
 
 
 
b33ad9c
dac7390
1094afa
 
5d03b4e
1094afa
e181201
d6b04bd
1094afa
 
 
89fb97d
 
1094afa
 
 
 
 
 
89fb97d
1094afa
 
5a8970f
1094afa
 
89fb97d
1094afa
 
 
 
 
 
89fb97d
 
 
 
00ff2b2
89fb97d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1094afa
09f21bc
 
 
4424032
09f21bc
 
61a8a4b
1094afa
89fb97d
 
 
1094afa
ea53642
1094afa
ea53642
5c6382b
89fb97d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import gradio as gr

# Use bfloat16 everywhere so the 7B model needs roughly half the memory of fp32.
desired_dtype = torch.bfloat16
torch.set_default_dtype(torch.bfloat16)

# Model checkpoint to serve; the commented line is an alternative that was tried.
# checkpoint = "vsrinivas/falconlite2"
checkpoint = "tiiuae/falcon-7b-instruct"

# device_map="auto" lets accelerate place layers on available devices;
# weights that don't fit are presumably offloaded to the "offload" folder
# on disk (TODO confirm offload actually triggers on this host).
model = AutoModelForCausalLM.from_pretrained(
    # checkpoint, device_map="auto", offload_folder="offload", trust_remote_code=True, torch_dtype="auto")
    checkpoint, device_map="auto", offload_folder="offload", trust_remote_code=True)

# Tokenizer matching the checkpoint; trust_remote_code mirrors the model load.
# tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Shared text-generation pipeline reused by generate_seqs() below;
# bfloat16 dtype matches the default dtype set above.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

def format_chat_prompt(message, chat_history, instruction):
    """Build a single Falcon-style chat prompt from the conversation state.

    The prompt starts with ``System:<instruction>``, replays every prior
    (user, assistant) turn, and ends with the new user message followed by
    a bare ``Assistant:`` marker for the model to complete.
    """
    parts = [f"System:{instruction}"]
    for user_message, bot_message in chat_history:
        parts.append(f"User: {user_message}\nAssistant: {bot_message}")
    parts.append(f"User: {message}\nAssistant:")
    return "\n".join(parts)

def generate_seqs(prompt, max_new_tokens=None, stop_sequence=None, temperature=None):
    """Generate one continuation of *prompt* with the module-level pipeline.

    Args:
        prompt: Fully formatted chat prompt (see format_chat_prompt).
        max_new_tokens: Generation budget; when None, fall back to the
            legacy hard cap of max_length=200 total tokens.
        stop_sequence: Optional stop marker(s) forwarded to the pipeline.
        temperature: Optional sampling temperature.

    Returns:
        The generated text of the first (and only) returned sequence,
        which includes the prompt itself — callers strip it.
    """
    gen_kwargs = {
        "truncation": True,
        "do_sample": True,
        "top_k": 10,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # BUGFIX: the original passed max_length=200 AND max_new_tokens together,
    # which are conflicting length controls (transformers warns and the
    # interaction capped the intended 8192-token budget). Use exactly one.
    if max_new_tokens is not None:
        gen_kwargs["max_new_tokens"] = max_new_tokens
    else:
        gen_kwargs["max_length"] = 200
    # Only forward options the caller actually set; explicit None values
    # can override sensible generation-config defaults.
    if stop_sequence is not None:
        gen_kwargs["stop_sequence"] = stop_sequence
    if temperature is not None:
        gen_kwargs["temperature"] = temperature
    output = pipeline(prompt, **gen_kwargs)
    return output[0]['generated_text']

def respond(message, chat_history, instruction, temperature=0.7):
    """Gradio handler: answer *message* and simulate token streaming.

    Builds the full chat prompt, generates a completion, keeps only the
    text after the final 'Assistant: ' marker, then yields an updated
    ("", chat_history) pair once per character so the Chatbot widget
    appears to stream the reply.
    """
    prompt = format_chat_prompt(message, chat_history, instruction)
    history = chat_history + [[message, ""]]
    # stop_sequence prevents the model from inventing the next user turn.
    completion = generate_seqs(prompt=prompt,
                               max_new_tokens=8192,
                               stop_sequence=["\nUser:", "<|endoftext|>"],
                               temperature=temperature).split('Assistant: ')[-1]
    for pos, piece in enumerate(completion):
        # Swallow a single leading space on the very first character
        # (the model usually starts its reply with one).
        if pos == 0 and piece.startswith(" "):
            piece = piece[1:]
        tail = list(history.pop(-1))
        tail[-1] += piece
        history = history + [tail]
        yield "", history

with gr.Blocks() as demo:
    # Header / usage notes shown above the chat widget.
    # BUGFIX: corrected user-facing typo "resonse" -> "response".
    gr.Markdown(
    """
    # General purpose chatbot - test & demo app by Srinivas.V..
    ## As this is a free hosted platform (Computing and Memory limitations), you will find it slow and the app may not provide appropriate answers after a few dialogues. Type in your prompt, click/ submit and wait for the response before typing in your next prompt.
    """)

    chatbot = gr.Chatbot(height=500)  # fixed height so the chat pane doesn't grow unbounded
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options",open=False):
        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot, system, temperature], value="Clear console")

    # Button click and Enter in the textbox run the same handler; respond()
    # is a generator, so outputs stream incrementally to the Chatbot.
    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])

# Close any previously running interfaces (useful on notebook re-runs),
# then launch with queuing enabled so generator yields reach the client.
gr.close_all()
demo.queue().launch()