Tobias Bergmann committed
Commit · 0b2f919
Parent(s): fa8a778
tps
app.py CHANGED
@@ -2,6 +2,7 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from typing import Tuple, List
+import time
 
 DESCRIPTION = f"""
 # Chat with Arco 500M as GGUF on CPU
@@ -38,18 +39,29 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
 
     # This will produce a generator of output chunks
     stream = pipe(
-        prompt,
+        prompt,
         max_tokens=max_new_tokens,
         stop=["</s>"],
         stream=True
     )
 
+    start_time = time.time()
+    tokens_generated = 0
+
     # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
         reply += new_text
+        tokens_generated += len(new_text.split())  # Simple token counting by splitting on whitespace
         history[-1][1] = reply  # Update the current reply in history
-        yield reply, history
+
+        elapsed_time = time.time() - start_time
+        if elapsed_time > 0:
+            tokens_per_second = tokens_generated / elapsed_time
+        else:
+            tokens_per_second = 0
+
+        yield f"{reply} \n\n *Tokens/second: {tokens_per_second:.2f}*", history
 
 
 with gr.Blocks() as demo:
@@ -65,4 +77,4 @@ with gr.Blocks() as demo:
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-demo.queue().launch()
+demo.queue().launch()
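
A note on the counting method: new_text.split() counts whitespace-separated words, not model tokens, so the reported rate will typically understate the real token throughput. A minimal alternative sketch, assuming each chunk yielded by llama-cpp-python's streaming API corresponds to roughly one generated token (the usual case, though multi-byte characters can span chunks):

    import time

    start_time = time.time()
    tokens_generated = 0
    reply = ""
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        tokens_generated += 1  # assumption: one stream chunk is roughly one generated token
    elapsed_time = time.time() - start_time
    tokens_per_second = tokens_generated / elapsed_time if elapsed_time > 0 else 0.0

For an exact figure, the model's own tokenizer can be applied once after streaming finishes, e.g. len(pipe.tokenize(reply.encode("utf-8"), add_bos=False)), assuming pipe is the Llama instance; that trades one extra tokenization pass for an accurate count.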