Tobias Bergmann committed
Commit · 0b2f919
Parent(s): fa8a778
tps
app.py CHANGED
@@ -2,6 +2,7 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from typing import Tuple, List
+import time
 
 DESCRIPTION = f"""
 # Chat with Arco 500M as GGUF on CPU
@@ -38,18 +39,29 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
 
     # This will produce a generator of output chunks
     stream = pipe(
-        prompt,
+        prompt,
         max_tokens=max_new_tokens,
         stop=["</s>"],
         stream=True
     )
 
+    start_time = time.time()
+    tokens_generated = 0
+
     # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
         reply += new_text
+        tokens_generated += len(new_text.split())  # Simple token counting by splitting on whitespace
         history[-1][1] = reply  # Update the current reply in history
-        yield reply, history
+
+        elapsed_time = time.time() - start_time
+        if elapsed_time > 0:
+            tokens_per_second = tokens_generated / elapsed_time
+        else:
+            tokens_per_second = 0
+
+        yield f"{reply} \n\n *Tokens/second: {tokens_per_second:.2f}*", history
 
 
 with gr.Blocks() as demo:
@@ -65,4 +77,4 @@ with gr.Blocks() as demo:
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-demo.queue().launch()
+demo.queue().launch()
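
A note on the counting method: new_text.split() counts whitespace-separated words, not model tokens, so the reported rate will typically understate the real token throughput. A minimal alternative sketch, assuming each chunk yielded by llama-cpp-python's streaming API corresponds to roughly one generated token (the usual case, though multi-byte characters can span chunks):

    import time

    start_time = time.time()
    tokens_generated = 0
    reply = ""
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        tokens_generated += 1  # assumption: one stream chunk is roughly one generated token
    elapsed_time = time.time() - start_time
    tokens_per_second = tokens_generated / elapsed_time if elapsed_time > 0 else 0.0

For an exact figure, the model's own tokenizer can be applied once after streaming finishes, e.g. len(pipe.tokenize(reply.encode("utf-8"), add_bos=False)), assuming pipe is the Llama instance; that trades one extra tokenization pass for an accurate count.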