Tobias Bergmann committed
Commit 668ee0d · 1 Parent(s): 3a8892f
Files changed (1):
  1. app.py +13 -19

app.py CHANGED
@@ -29,34 +29,28 @@ pipe = Llama(
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     if not message:
         return "", history
+
     prompt = message
-
-    # Initialize reply
-    reply = ""
-
     history.append([message, ""])
 
-    # Use stream=True for streaming
+    # Initialize reply for this round
+    reply = ""
+
+    # This will produce a generator of output chunks
     stream = pipe(
         prompt,
-        max_tokens=max_new_tokens,
+        max_tokens=max_new_tokens,
         stop=["</s>"],
         stream=True
     )
-
+
     for output in stream:
-        # This loop will receive partial output (one token at a time)
         new_text = output['choices'][0]['text']
-
-        # Append to the current reply
-        reply += new_text
-
-        # Update the history
-        history[-1][1] = reply
-
-        # Yield for incremental display on chat
-        yield "", history
-
+        reply += new_text
+        history[-1][1] = reply  # Update the current reply in history
+        yield "", history
+    return "", history  # Always return at the end to terminate the generator
+
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
     chatbot = gr.Chatbot()
@@ -69,5 +63,5 @@ with gr.Blocks() as demo:
         label="Max New Tokens",
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
-
+
 demo.queue().launch()
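
For reference, below is a minimal, self-contained sketch of app.py as it plausibly stands after this commit. It is an illustration, not the repository file: the model path, DESCRIPTION, DEFAULT_MAX_NEW_TOKENS, the textbox placeholder, and the slider bounds are assumptions, since the diff does not show the top of the file. What it demonstrates is the streaming pattern the commit lands on: with stream=True, llama-cpp-python returns a generator of completion chunks, and because predict is itself a generator, Gradio (with demo.queue()) re-renders the chatbot on every yield, producing token-by-token output.

from typing import List

import gradio as gr
from llama_cpp import Llama

# Placeholder values -- the real ones are defined earlier in app.py,
# outside the part of the file this diff shows.
DESCRIPTION = "Llama.cpp streaming chat demo"
DEFAULT_MAX_NEW_TOKENS = 256

# Hypothetical model file; the actual Llama(...) arguments are not in the diff.
pipe = Llama(model_path="model.gguf")

def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
    if not message:
        return "", history

    prompt = message
    history.append([message, ""])

    # Accumulate the reply for this round.
    reply = ""

    # stream=True makes llama-cpp-python return a generator of chunks.
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["</s>"],
        stream=True,
    )

    for output in stream:
        new_text = output["choices"][0]["text"]
        reply += new_text
        history[-1][1] = reply  # update the bot half of the last turn
        yield "", history       # each yield redraws the chatbot incrementally
    return "", history

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    textbox = gr.Textbox(placeholder="Type a message...")  # assumed widget config
    max_new_tokens_slider = gr.Slider(
        minimum=1,
        maximum=1024,
        value=DEFAULT_MAX_NEW_TOKENS,
        label="Max New Tokens",
    )
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])

demo.queue().launch()

Note that the final return "", history after the loop is legal inside a generator (Python attaches the value to StopIteration), but it is the yields, not the return value, that drive the incremental UI updates.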