Tobias Bergmann committed on
Commit 0b2f919 · 1 parent: fa8a778
Files changed (1):
  1. app.py +15 -3
app.py CHANGED

@@ -2,6 +2,7 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from typing import Tuple, List
+import time
 
 DESCRIPTION = f"""
 # Chat with Arco 500M as GGUF on CPU
@@ -38,18 +39,29 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
 
     # This will produce a generator of output chunks
     stream = pipe(
-        prompt,
+        prompt,
         max_tokens=max_new_tokens,
         stop=["</s>"],
         stream=True
     )
 
+    start_time = time.time()
+    tokens_generated = 0
+
     # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
         reply += new_text
+        tokens_generated += len(new_text.split())  # Simple token counting by splitting on whitespace
         history[-1][1] = reply  # Update the current reply in history
-        yield "", history
+
+        elapsed_time = time.time() - start_time
+        if elapsed_time > 0:
+            tokens_per_second = tokens_generated / elapsed_time
+        else:
+            tokens_per_second = 0
+
+        yield f"{reply} \n\n *Tokens/second: {tokens_per_second:.2f}*", history
 
 
 with gr.Blocks() as demo:
@@ -65,4 +77,4 @@ with gr.Blocks() as demo:
 )
 textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-demo.queue().launch()
+demo.queue().launch()
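
The measurement this commit adds can be read in isolation. Below is a minimal sketch of the same technique, not part of app.py: it wraps any stream of text chunks and recomputes a running tokens-per-second figure after each chunk, using the commit's whitespace-split approximation. The name `stream_with_throughput` and the fake chunk list are illustrative.

```python
import time
from typing import Iterable, Iterator, Tuple

def stream_with_throughput(chunks: Iterable[str]) -> Iterator[Tuple[str, float]]:
    """Yield (accumulated_reply, tokens_per_second) after each streamed chunk.

    Tokens are approximated by splitting each chunk on whitespace, as the
    commit does; throughput is recomputed on every chunk.
    """
    start_time = time.time()
    tokens_generated = 0
    reply = ""
    for new_text in chunks:
        reply += new_text
        tokens_generated += len(new_text.split())  # whitespace approximation
        elapsed_time = time.time() - start_time
        tokens_per_second = tokens_generated / elapsed_time if elapsed_time > 0 else 0.0
        yield reply, tokens_per_second

# Stand-in for the llama_cpp stream: each element mimics one output chunk.
for reply, tps in stream_with_throughput(["Hello", " world", ", how", " are", " you?"]):
    print(f"{reply!r}  *Tokens/second: {tps:.2f}*")
```

One design note: llama-cpp-python's streaming output usually delivers roughly one token per chunk, so counting chunks (`tokens_generated += 1`) would track real token throughput more closely than whitespace splitting, which undercounts sub-word tokens.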
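For readers unfamiliar with the surrounding app: `predict` is a generator, and Gradio streams each yielded value into the UI through its queue. The sketch below shows that wiring in a self-contained form, with a dummy word-by-word echo standing in for the llama_cpp pipeline; the component names mirror app.py, but the echo logic and placeholder text are illustrative assumptions.

```python
import time
from typing import Iterator, List, Tuple

import gradio as gr

def predict(message: str, history: List[List[str]]) -> Iterator[Tuple[str, list]]:
    # Dummy "model": echo the message back word by word, like a token stream.
    history = history + [[message, ""]]
    reply = ""
    for word in message.split():
        reply += word + " "
        history[-1][1] = reply  # update the current reply in history
        time.sleep(0.05)        # simulate generation latency
        yield "", history       # clear the textbox, refresh the chatbot

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    textbox = gr.Textbox(placeholder="Type a message...")
    textbox.submit(predict, [textbox, chatbot], [textbox, chatbot])

# queue() enables generator (streaming) handlers, as in the commit.
demo.queue().launch()
```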