xu song committed on
Commit 8607d84 · 1 Parent(s): d48f1cd
Files changed (3):
  1. app.py +33 -9
  2. app_util.py +37 -17
  3. models/cpp_qwen2.py +53 -7
app.py CHANGED
@@ -54,7 +54,7 @@ with gr.Blocks() as demo:
                           avatar_images=("assets/man.png", "assets/bot.png"))
 
     with gradio.Tab("Self Chat"):
-        generated_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
+        input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
         generate_btn = gr.Button("🤔️ Self-Chat", variant="primary")
         with gr.Row():
             retry_btn = gr.Button("🔄 Retry", variant="secondary", size="sm", )
@@ -68,7 +68,7 @@ with gr.Blocks() as demo:
 
     with gradio.Tab("Response Generator"):
         with gr.Row():
-            generated_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
+            input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
             generate_btn_2 = gr.Button("Send", variant="primary")
         with gr.Row():
             retry_btn_2 = gr.Button("🔄 Regenerate", variant="secondary", size="sm", )
@@ -78,7 +78,7 @@ with gr.Blocks() as demo:
 
     with gradio.Tab("User Simulator"):
        with gr.Row():
-            generated_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
+            input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
             generate_btn_3 = gr.Button("Send", variant="primary")
         with gr.Row():
             retry_btn_3 = gr.Button("🔄 Regenerate", variant="secondary", size="sm", )
@@ -116,17 +116,41 @@ with gr.Blocks() as demo:
             label="Top-k",
         )
 
-    ########
     history = gr.State([{"role": "system", "content": system_list[0]}])  # only the system message carries useful info; the rest duplicates the chatbot content
     system.change(reset_state, inputs=[system], outputs=[chatbot, history])
-    clear_btn.click(reset_state, inputs=[system], outputs=[chatbot, history])
 
-    generate_btn.click(generate, [chatbot, history], outputs=[generated_text_1, chatbot, history],
+    ######## tab1
+    generate_btn.click(generate, [chatbot, history], outputs=[chatbot, history],
                        show_progress="full")
-    retry_btn.click(undo_generate, [chatbot, history], outputs=[generated_text_1, chatbot, history]) \
-        .then(generate, [chatbot, history], outputs=[generated_text_1, chatbot, history],
+    retry_btn.click(undo_generate, [chatbot, history], outputs=[chatbot, history]) \
+        .then(generate, [chatbot, history], outputs=[chatbot, history],
+              show_progress="full")
+    undo_btn.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
+    clear_btn.click(reset_state, inputs=[system], outputs=[chatbot, history])
+
+    ######## tab2
+    generate_btn_2.click(append_user, [input_text_2, chatbot, history], outputs=[chatbot, history]) \
+        .then(generate_assistant_message, [chatbot, history], outputs=[chatbot, history],
+              show_progress="full")
+    retry_btn_2.click(undo_generate, [chatbot, history], outputs=[chatbot, history]) \
+        .then(generate, [chatbot, history], outputs=[chatbot, history],
              show_progress="full")
-    undo_btn.click(undo_generate, [chatbot, history], outputs=[generated_text_1, chatbot, history])
+    undo_btn_2.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
+    clear_btn_2.click(reset_state, inputs=[system], outputs=[chatbot, history]) \
+        .then(reset_user_input, outputs=[input_text_2])
+
+    ######## tab3
+    generate_btn_3.click(append_assistant, [input_text_3, chatbot, history], outputs=[chatbot, history]) \
+        .then(generate_assistant_message, [chatbot, history], outputs=[chatbot, history],
+              show_progress="full")
+    retry_btn_3.click(undo_generate, [chatbot, history], outputs=[chatbot, history]) \
+        .then(generate, [chatbot, history], outputs=[chatbot, history],
+              show_progress="full")
+    undo_btn_3.click(undo_generate, [chatbot, history], outputs=[chatbot, history])
+    clear_btn_3.click(reset_state, inputs=[system], outputs=[chatbot, history]) \
+        .then(reset_user_input, outputs=[input_text_3])
+
+
 
     slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
     slider_temperature.change(set_temperature, inputs=[slider_temperature])
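The tab2/tab3 wiring above relies on Gradio's event chaining: the `.click()` handler records the typed turn, and the chained `.then()` runs generation afterwards. Below is a minimal, self-contained sketch of that pattern, not part of this commit: the component names and the echo "model" are placeholders, and a recent Gradio 4.x API is assumed.

```python
import gradio as gr


def append_user(user_input, chat_history):
    # Record the user turn immediately so it appears before generation starts.
    return "", chat_history + [(user_input, None)]


def generate_reply(chat_history):
    # Placeholder "model": echo the last user message.
    user_input = chat_history[-1][0]
    chat_history[-1] = (user_input, f"echo: {user_input}")
    return chat_history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    input_box = gr.Textbox(show_label=False, placeholder="Please type your input")
    send_btn = gr.Button("Send", variant="primary")
    # Same chaining idea as the commit: append the user turn, then generate the reply.
    send_btn.click(append_user, [input_box, chatbot], [input_box, chatbot]) \
        .then(generate_reply, [chatbot], [chatbot], show_progress="full")

if __name__ == "__main__":
    demo.launch()
```

The same chaining is what lets the retry buttons first undo the last turn and then regenerate it in a second step.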
app_util.py CHANGED
@@ -19,51 +19,51 @@ from models.cpp_qwen2 import bot
 # gr.Chatbot.postprocess = postprocess
 
 
-def generate_query(chatbot, history):
+def generate_user_message(chatbot, history):
     if history and history[-1]["role"] == "user":
         gr.Warning('You should generate assistant-response.')
         yield None, chatbot, history
     else:
         chatbot.append(None)
         streamer = bot.generate(history, stream=True)
-        for query, query_tokens in streamer:
-            chatbot[-1] = (query, None)
-            yield query, chatbot, history
-        query_tokens = bot.strip_stoptokens(query_tokens)
-        history.append({"role": "user", "content": query, "tokens": query_tokens})
-        yield query, chatbot, history
+        for user_content, user_tokens in streamer:
+            chatbot[-1] = (user_content, None)
+            yield user_content, chatbot, history
+        user_tokens = bot.strip_stoptokens(user_tokens)
+        history.append({"role": "user", "content": user_content, "tokens": user_tokens})
+        yield chatbot, history
 
 
-def generate_response(chatbot, history):
+def generate_assistant_message(chatbot, history):
     """
     auto-mode: query is None
     manual-mode: query is the user input
     """
     logger.info(f"generating {json.dumps(history, ensure_ascii=False)}")
-    query = history[-1]["content"]
+    user_content = history[-1]["content"]
     if history[-1]["role"] != "user":
         gr.Warning('You should generate or type user-input first.')
         yield None, chatbot, history
     else:
         streamer = bot.generate(history, stream=True)
-        for response, response_tokens in streamer:
-            chatbot[-1] = (query, response)
-            yield response, chatbot, history
+        for assistant_content, assistant_tokens in streamer:
+            chatbot[-1] = (user_content, assistant_content)
+            yield assistant_content, chatbot, history
 
-        response_tokens = bot.strip_stoptokens(response_tokens)
-        history.append({"role": "assistant", "content": response, "tokens": response_tokens})
+        assistant_tokens = bot.strip_stoptokens(assistant_tokens)
+        history.append({"role": "assistant", "content": assistant_content, "tokens": assistant_tokens})
         print(f"chatbot is {chatbot}")
         print(f"history is {history}")
-        yield response, chatbot, history
+        yield chatbot, history
 
 
 def generate(chatbot, history):
     logger.info(f"chatbot: {chatbot}; history: {history}")
     streamer = None
     if history[-1]["role"] in ["assistant", "system"]:
-        streamer = generate_query(chatbot, history)
+        streamer = generate_user_message(chatbot, history)
     elif history[-1]["role"] == "user":
-        streamer = generate_response(chatbot, history)
+        streamer = generate_assistant_message(chatbot, history)
     else:
         gr.Warning("bug")
 
@@ -71,6 +71,26 @@ def generate(chatbot, history):
         yield out
 
 
+def append_user(input_content, chatbot, history):
+    if history[-1]["role"] == "user":
+        gr.Warning('You should generate assistant-response.')
+        return chatbot, history
+
+    chatbot.append((input_content, None))
+    history.append({"role": "user", "content": input_content})
+    return chatbot, history
+
+
+def append_assistant(input_content, chatbot, history):
+    if history[-1]["role"] != "user":
+        gr.Warning('You should generate or type user-input first.')
+        return chatbot, history
+    chatbot[-1] = (chatbot[-1][0], input_content)
+    history.append({"role": "assistant", "content": input_content})
+    return chatbot, history
+
+
+
 def undo_generate(chatbot, history):
     if history[-1]["role"] == "user":
         history = history[:-1]
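The renamed generate_user_message / generate_assistant_message functions follow Gradio's streaming-generator pattern: each yield hands back a fresh (chatbot, history) snapshot so the chat window updates as tokens arrive, and the final yield commits the finished turn to history. The sketch below illustrates only that pattern; it is not code from this commit, and fake_token_stream is a hypothetical stand-in for bot.generate(history, stream=True).

```python
from typing import Dict, Iterator, List, Tuple


def fake_token_stream(prompt: str) -> Iterator[str]:
    # Stand-in for bot.generate(history, stream=True): yields the text produced so far.
    partial = ""
    for piece in ["Hello", ",", " world", "!"]:
        partial += piece
        yield partial


def stream_assistant_message(chatbot: List[Tuple], history: List[Dict]):
    user_content = history[-1]["content"]
    chatbot.append((user_content, None))
    partial_text = ""
    for partial_text in fake_token_stream(user_content):
        chatbot[-1] = (user_content, partial_text)
        yield chatbot, history   # intermediate snapshot -> live UI update
    history.append({"role": "assistant", "content": partial_text})
    yield chatbot, history       # final snapshot, turn committed to history


if __name__ == "__main__":
    chat, hist = [], [{"role": "user", "content": "hi"}]
    for chat_state, _ in stream_assistant_message(chat, hist):
        print(chat_state[-1])
```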
models/cpp_qwen2.py CHANGED
@@ -1,25 +1,71 @@
 """
-https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
-https://github.com/awinml/llama-cpp-python-bindings
 
-python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
 
-python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
+## convert to gguf
 
+python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
 
+## predict
 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
-
 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
-
 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
 
 
+## timing
+
+
+**Chongqing GPU server, cache empty**
+llama_print_timings: load time = 1711.48 ms
+llama_print_timings: sample time = 214.87 ms / 122 runs ( 1.76 ms per token, 567.78 tokens per second)
+llama_print_timings: prompt eval time = 892.14 ms / 5 tokens ( 178.43 ms per token, 5.60 tokens per second)
+llama_print_timings: eval time = 4277.26 ms / 121 runs ( 35.35 ms per token, 28.29 tokens per second)
+llama_print_timings: total time = 8351.28 ms / 126 tokens
+
+llama_print_timings: load time = 1711.48 ms
+llama_print_timings: sample time = 45.11 ms / 25 runs ( 1.80 ms per token, 554.24 tokens per second)
+llama_print_timings: prompt eval time = 1059.46 ms / 5 tokens ( 211.89 ms per token, 4.72 tokens per second)
+llama_print_timings: eval time = 843.71 ms / 24 runs ( 35.15 ms per token, 28.45 tokens per second)
+llama_print_timings: total time = 2501.50 ms / 29 tokens
+
+
+llama_print_timings: load time = 1711.48 ms
+llama_print_timings: sample time = 227.75 ms / 125 runs ( 1.82 ms per token, 548.85 tokens per second)
+llama_print_timings: prompt eval time = 2056.86 ms / 5 tokens ( 411.37 ms per token, 2.43 tokens per second)
+llama_print_timings: eval time = 4657.86 ms / 124 runs ( 37.56 ms per token, 26.62 tokens per second)
+llama_print_timings: total time = 9532.50 ms / 129 tokens
+
+llama_print_timings: load time = 1711.48 ms
+llama_print_timings: sample time = 73.89 ms / 41 runs ( 1.80 ms per token, 554.84 tokens per second)
+llama_print_timings: prompt eval time = 2621.25 ms / 5 tokens ( 524.25 ms per token, 1.91 tokens per second)  # 0.5 s/token
+llama_print_timings: eval time = 1430.91 ms / 40 runs ( 35.77 ms per token, 27.95 tokens per second)
+llama_print_timings: total time = 4848.09 ms / 45 tokens
+
+**hf-space, cache empty** -----------
+llama_print_timings: load time = 28230.06 ms
+llama_print_timings: sample time = 147.58 ms / 8 runs ( 18.45 ms per token, 54.21 tokens per second)
+llama_print_timings: prompt eval time = 28864.82 ms / 5 tokens ( 5772.96 ms per token, 0.17 tokens per second)  # 5.7 s/token
+llama_print_timings: eval time = 1557.94 ms / 7 runs ( 222.56 ms per token, 4.49 tokens per second)
+llama_print_timings: total time = 30753.48 ms / 12 tokens
+
+llama_print_timings: load time = 28230.06 ms
+llama_print_timings: sample time = 74.34 ms / 61 runs ( 1.22 ms per token, 820.52 tokens per second)
+llama_print_timings: prompt eval time = 28821.26 ms / 9 tokens ( 3202.36 ms per token, 0.31 tokens per second)
+llama_print_timings: eval time = 21634.71 ms / 60 runs ( 360.58 ms per token, 2.77 tokens per second)
+llama_print_timings: total time = 51255.55 ms / 69 tokens
+
+llama_print_timings: load time = 28230.06 ms
+llama_print_timings: sample time = 98.03 ms / 68 runs ( 1.44 ms per token, 693.66 tokens per second)
+llama_print_timings: prompt eval time = 27749.35 ms / 5 tokens ( 5549.87 ms per token, 0.18 tokens per second)
+llama_print_timings: eval time = 26998.58 ms / 67 runs ( 402.96 ms per token, 2.48 tokens per second)
+llama_print_timings: total time = 56335.37 ms / 72 tokens
+
 ## reference
 
+- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
+- https://github.com/awinml/llama-cpp-python-bindings
 - https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
-
 """
 
 import json
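The expanded docstring documents the GGUF conversion, the llama-cli smoke tests, and the llama_print_timings blocks collected on the two machines. For context, the following is a hedged sketch of loading the same F16 GGUF with llama-cpp-python (the library the file's references point to) and streaming a chat completion; the model path is the one quoted in the docstring and is an assumption about the local filesystem, not verified here.

```python
from llama_cpp import Llama

# Load the converted GGUF; verbose=True prints llama_print_timings blocks
# like the ones recorded in the docstring above.
llm = Llama(
    model_path="/workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf",
    n_ctx=2048,
    verbose=True,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "I believe the meaning of life is"},
]

# stream=True yields incremental deltas, mirroring bot.generate(history, stream=True).
for chunk in llm.create_chat_completion(messages=messages, stream=True, max_tokens=128):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
print()
```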