Spaces:

Neon-tech
/

Test

Paused

Neon-tech committed on Apr 8

Commit

fd8889f

verified ·

1 Parent(s): 94cc835

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -25,9 +25,20 @@ def chat(message, history):
     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    outputs = model.generate(**inputs, max_new_tokens=512)
-    output = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
-    return output
 with gr.Blocks() as demo:
     stats = gr.Textbox(label="System Stats", value=get_stats, every=5)

     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    from transformers import TextIteratorStreamer
+    from threading import Thread
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(**inputs, max_new_tokens=512, streamer=streamer)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    output = ""
+    for token in streamer:
+        output += token
+        yield output
 with gr.Blocks() as demo:
     stats = gr.Textbox(label="System Stats", value=get_stats, every=5)