File size: 8,642 Bytes
f3f39f6
 
 
8f180e3
 
 
d9a4882
cbcd367
d9a4882
 
 
 
f3f39f6
 
 
8f180e3
e646450
d9a4882
 
 
cbcd367
 
 
 
8f180e3
 
f3f39f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bf8936
f3f39f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import os 
from typing import Iterator
import gradio as gr 
from text_generation import Client  # Assuming you have a text_generation module
from transformers import AutoModel

# Hugging Face API token, read from the `chatbot` environment variable.
# An unset or empty value is rejected here so the app fails at import time
# instead of on the first inference request.
HF_TOKEN = os.environ.get('chatbot', False)

if not HF_TOKEN:
    raise ValueError("Hugging Face API token is not set. Set the 'chatbot' environment variable.")

# Markers that stop streaming when they show up inside a generated token.
EOS_STRING = '</s>'
EOT_STRING = '<EOT>'

# Access token used when loading the private model. It is read from the same
# `chatbot` environment variable as HF_TOKEN.
access_token = os.environ.get('chatbot', False)
if not access_token:
    raise ValueError("Hugging Face model access token is not set. Set the 'chatbot' environment variable.")

# Set protected namespaces to resolve the warning
# NOTE(review): this reads a class-level `config` attribute from AutoModel and
# then item-assigns into it. `protected_namespaces` looks like a pydantic
# model-config option rather than a transformers one, so these two lines may
# not do what the comment above intends (and may fail at import) — confirm
# against the installed transformers version.
model_config = AutoModel.config
model_config['protected_namespaces'] = ()

# Load the "private/model" checkpoint, authenticating with `access_token`.
# NOTE(review): `model` is not referenced anywhere else in the visible file;
# text generation goes through the hosted inference API in `run` — confirm
# whether this load is still needed.
model = AutoModel.from_pretrained("private/model", token=access_token)

def get_prompt(message, chat_history, system_prompt):
    """Build a Llama-2-chat style prompt string from the conversation.

    Args:
        message: The new user message to be answered.
        chat_history: Iterable of ``(user_input, response)`` pairs for the
            turns that have already been completed.
        system_prompt: Text placed inside the ``<<SYS>>`` block.

    Returns:
        A single string: the system block, then every past turn formatted as
        ``"{user} [/INST] {response} </s><s>[INST] "``, then the new message
        followed by ``" [/INST]"``.
    """
    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']

    # The first user turn directly follows the system block and is kept
    # verbatim; every later user turn (and the new message, if there is any
    # history) is stripped of surrounding whitespace.
    do_strip = False
    for user_input, response in chat_history:
        user_input = user_input.strip() if do_strip else user_input
        do_strip = True
        # The instruction must be closed with "[/INST]" before the response.
        texts.append(f"{user_input} [/INST] {response.strip()} </s><s>[INST] ")
    message = message.strip() if do_strip else message
    texts.append(f"{message} [/INST]")
    return ''.join(texts)

def run(model_id, message, chat_history, system_prompt, max_new_tokens=1024, temperature=0.3, top_p=0.9, top_k=50):
    """Stream a completion for ``message`` from the hosted inference endpoint.

    The prompt is built with ``get_prompt`` and sent to the inference URL for
    ``model_id`` using sampling with the given ``max_new_tokens``,
    ``temperature``, ``top_p`` and ``top_k``.

    Yields:
        The text accumulated so far, once per streamed token.

    Streaming ends as soon as a token's text contains ``EOS_STRING`` or
    ``EOT_STRING``; that token is not appended to the output.
    """
    endpoint = "https://api-inference.huggingface.co/models/" + model_id
    client = Client(endpoint, headers={'Authorization': f"Bearer {HF_TOKEN}"})
    full_prompt = get_prompt(message, chat_history, system_prompt)

    token_stream = client.generate_stream(
        full_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
    )

    stop_markers = (EOS_STRING, EOT_STRING)
    generated = ''
    for chunk in token_stream:
        piece = chunk.token.text
        # A stop marker ends the stream without emitting the marker itself.
        if any(marker in piece for marker in stop_markers):
            return generated
        generated += piece
        yield generated
    return generated

# System prompt used as the value of the (non-interactive) "System prompt"
# textbox and passed to `generate` by `process_example`.
DEFAULT_SYSTEM_PROMPT = """
    You are Jarvis. You are an AI assistant, you are moderately-polite and give only true information.
    You carefully provide accurate, factual, thoughtful, nuanced answers, and are brilliant at reasoning. 
    If you think there might not be a correct answer, you say so. Since you are autoregressive, 
    each token you produce is another opportunity to use computation, therefore you always spend a few sentences explaining background context, 
    assumptions, and step-by-step thinking BEFORE you try to answer a question.
"""
# Upper bound of the "Max New Tokens" slider; `generate` rejects larger values.
MAX_MAX_NEW_TOKENS = 10240
# Initial value of the "Max New Tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 4096
# Threshold used by `check_input_token_length` before generation starts.
MAX_INPUT_TOKEN_LENGTH = 4000

# Text rendered through gr.Markdown at the top of the page.
DESCRIPTION = "# <h1>He's just Jarvis. ;)</h1>"

def clear_and_save_textbox(message):
    """Return an empty string (new textbox content) and the submitted message."""
    cleared_text = ''
    return cleared_text, message

def display_input(message, history=None):
    """Append the new user message to the chat history with an empty reply.

    Args:
        message: The user message to show in the chat.
        history: Existing list of ``(user, response)`` entries. When omitted
            or ``None`` a fresh list is created, so calls never share state.

    Returns:
        The history list with ``(message, '')`` appended. A list passed in by
        the caller is extended in place and returned.
    """
    # A `[]` default would be created once and keep growing across calls.
    if history is None:
        history = []
    history.append((message, ''))
    return history

def delete_prev_fn(history=None):
    """Remove the most recent exchange from the chat history.

    Args:
        history: List of ``(user, response)`` entries. When omitted or
            ``None`` a fresh empty list is used.

    Returns:
        A ``(history, message)`` tuple: the history without its last entry
        and the user message of the removed entry. ``message`` is ``''``
        when the history was empty or the removed message was falsy.
    """
    # A `[]` default would be handed back to callers and shared between calls.
    if history is None:
        history = []
    try:
        message, _ = history.pop()
    except IndexError:
        # Nothing to undo.
        message = ''
    return history, message or ''

def generate(model_id, message, history_with_input, system_prompt, max_new_tokens, temperature, top_p, top_k):
    """Stream chat-history updates while the model answers ``message``.

    Args:
        model_id: Model identifier forwarded to ``run``.
        message: The user message being answered.
        history_with_input: Chat history whose last entry is the pending
            ``(message, '')`` placeholder; that entry is dropped and rebuilt
            with the streamed response.
        system_prompt: System prompt forwarded to ``run``.
        max_new_tokens, temperature, top_p, top_k: Sampling settings
            forwarded to ``run``.

    Yields:
        ``history + [(message, partial_response)]`` for every partial
        response produced by ``run``. If ``run`` produces nothing, a single
        ``history + [(message, '')]`` is yielded instead.

    Raises:
        ValueError: If ``max_new_tokens`` exceeds ``MAX_MAX_NEW_TOKENS``.
    """
    if max_new_tokens > MAX_MAX_NEW_TOKENS:
        raise ValueError(
            f"max_new_tokens ({max_new_tokens}) exceeds the allowed maximum ({MAX_MAX_NEW_TOKENS})."
        )

    # Drop the placeholder entry for the current message.
    history = history_with_input[:-1]
    generator = run(model_id, message, history, system_prompt, max_new_tokens, temperature, top_p, top_k)

    produced_any = False
    for response in generator:
        produced_any = True
        yield history + [(message, response)]
    if not produced_any:
        # Keep the message visible even when the model returned no tokens.
        yield history + [(message, '')]

def process_example(model_id, message):
    """Run ``generate`` to completion for an example and return its final state.

    Uses an empty history, ``DEFAULT_SYSTEM_PROMPT``, 1024 new tokens,
    temperature 1, top-p 0.95 and top-k 50.

    Returns:
        A ``('', history)`` tuple where ``history`` is the last value yielded
        by ``generate`` (an empty list if nothing was yielded).
    """
    # Pre-bind the result so an empty generator cannot leave it undefined.
    final_history = []
    for final_history in generate(model_id, message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50):
        pass
    return '', final_history

def check_input_token_length(message, chat_history, system_prompt):
    """Reject the request when the accumulated input is considered too long.

    The length is computed as the number of characters in ``message`` plus
    the number of entries in ``chat_history``; ``system_prompt`` is accepted
    but not included in the measure.
    NOTE(review): this is not a real token count — confirm whether a
    tokenizer-based measure is wanted.

    Raises:
        gr.Error: If the computed length exceeds ``MAX_INPUT_TOKEN_LENGTH``.
    """
    # Treat a missing message or history as empty instead of crashing in len().
    input_token_length = len(message or '') + len(chat_history or [])
    if input_token_length > MAX_INPUT_TOKEN_LENGTH:
        raise gr.Error(f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.")

# Build the Gradio UI: chat window, input row, action buttons, advanced
# sampling options, and the event chains that connect them to the helpers
# defined above.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        chatbot = gr.Chatbot(label='Jarvis')
        with gr.Row():
            # Message input, model selector and submit button share one row.
            textbox = gr.Textbox(container=False, show_label=False, placeholder='Hey, Jarvis', scale=7)
            model_id = gr.Dropdown(label='LLM',
                                   choices=[
                                       'mistralai/Mistral-7B-Instruct-v0.1', 
                                       'HuggingFaceH4/zephyr-7b-beta', 
                                       'meta-llama/Llama-2-7b-chat-hf'
                                   ],
                                  value='mistralai/Mistral-7B-Instruct-v0.1', scale=3)
            submit_button = gr.Button('Submit', variant='primary', scale=1, min_width=0)

        with gr.Row():
            retry_button = gr.Button('Retry', variant='secondary')
            undo_button = gr.Button('Undo', variant='secondary')
            clear_button = gr.Button('Clear', variant='secondary')

        # Holds the last submitted message after the textbox has been cleared.
        saved_input = gr.State()

        with gr.Accordion(label='Advanced Options', open=False):
            # The system prompt is displayed with interactive=False.
            system_prompt = gr.Textbox(label='System prompt', value=DEFAULT_SYSTEM_PROMPT, lines=5, interactive=False)
            max_new_tokens = gr.Slider(label='Max New Tokens', minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
            temperature = gr.Slider(label='Temperatur', minimum=0.1, maximum=4.0, step=0.1, value=0.1)
            top_p = gr.Slider(label='Top-P (nucleus sampling)', minimum=0.05, maximum=1.0, step=0.05, value=0.9)
            top_k = gr.Slider(label='Top-K', minimum=1, maximum=1000, step=1, value=10)

    # Enter in the textbox: clear the box and stash the message, show the
    # message in the chat, run the length check, then (chained with
    # `.success`) stream the model response into the chat.
    textbox.submit(
        fn=clear_and_save_textbox,
        inputs=textbox,
        outputs=[textbox, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=display_input,
        inputs=[saved_input, chatbot],
        outputs=chatbot,
        api_name=False,
        queue=False,
    ).then(
        fn=check_input_token_length,
        inputs=[saved_input, chatbot, system_prompt],
        api_name=False,
        queue=False,
    ).success(
        fn=generate,
        inputs=[
            model_id,
            saved_input,
            chatbot,
            system_prompt,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=chatbot,
        api_name=False,
    )

    # Submit button: same chain as textbox.submit above.
    # NOTE(review): `button_event_preprocess` is assigned but not used
    # anywhere else in the visible file.
    button_event_preprocess = submit_button.click(
        fn=clear_and_save_textbox,
        inputs=textbox,
        outputs=[textbox, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=display_input,
        inputs=[saved_input, chatbot],
        outputs=chatbot,
        api_name=False,
        queue=False,
    ).then(
        fn=check_input_token_length,
        inputs=[saved_input, chatbot, system_prompt],
        api_name=False,
        queue=False,
    ).success(
        fn=generate,
        inputs=[
            model_id,
            saved_input,
            chatbot,
            system_prompt,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=chatbot,
        api_name=False,
    )

    # Retry: remove the last exchange, re-add its message to the chat and
    # generate again. This chain does not run check_input_token_length.
    retry_button.click(
        fn=delete_prev_fn,
        inputs=chatbot,
        outputs=[chatbot, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=display_input,
        inputs=[saved_input, chatbot],
        outputs=chatbot,
        api_name=False,
        queue=False,
    ).then(
        fn=generate,
        inputs=[
            model_id,
            saved_input,
            chatbot,
            system_prompt,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=chatbot,
        api_name=False,
    )

    # Undo: remove the last exchange and put its message back in the textbox.
    undo_button.click(
        fn=delete_prev_fn,
        inputs=chatbot,
        outputs=[chatbot, saved_input],
        api_name=False,
        queue=False,
    ).then(
        fn=lambda x: x,
        inputs=[saved_input],
        outputs=textbox,
        api_name=False,
        queue=False,
    )

    # Clear: reset the chat to an empty list and the saved message to ''.
    clear_button.click(
        fn=lambda: ([], ''),
        outputs=[chatbot, saved_input],
        queue=False,
        api_name=False,
    )

# Enable the request queue (max_size=32) and start the app with show_api=False.
demo.queue(max_size=32).launch(show_api=False)