xu song committed
Commit 8988bbf
1 Parent(s): b597747
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 4.36.1
+sdk_version: 4.39.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -2,6 +2,10 @@
 From https://github.com/OpenLMLab/MOSS/blob/main/moss_web_demo_gradio.py
 
 
+# Difficulties
+
+
+
 # Errors out on a single GPU
 python moss_web_demo_gradio.py --model_name fnlp/moss-moon-003-sft --gpu 0,1,2,3
 
@@ -9,6 +13,11 @@ python moss_web_demo_gradio.py --model_name fnlp/moss-moon-003-sft --gpu 0,1,2,3
 - the first utterance:
 - preview of code and tables
 - editable chatbot: https://github.com/gradio-app/gradio/issues/4444
+- a button,
+
+## Reference
+
+-
 """
 
 from transformers.generation.utils import logger
@@ -18,13 +27,9 @@ import argparse
 import warnings
 import torch
 import os
-# from moss_util import generate_query
-from models.qwen2_util import bot
-# generate_query = None
-
-# gr.ChatInterface
+# from models.hf_qwen2 import bot
+from models.cpp_qwen2 import bot
 
-# from gpt35 import build_message_for_gpt35, send_one_query
 
 #
 # def postprocess(self, y):
@@ -75,61 +80,87 @@ def parse_text(text):
 
 
 def generate_query(chatbot, history):
-    if history and history[-1][1] is None:  # time to generate a response
+    if history and history[-1]["role"] == "user":  # time to generate a response
+        gr.Warning('You should generate assistant-response.')
         return None, chatbot, history
     query = bot.generate_query(history)
     # chatbot.append((query, ""))
     chatbot.append((query, None))
-    history = history + [(query, None)]
+    history.append({"role": "user", "content": query})
    return query, chatbot, history
 
+
 def generate_response(query, chatbot, history):
     """
-    Automatic mode: query is None, or query = history[-1][0]
-    Manual mode: query can be any value
+    Automatic mode: query is None
+    Manual mode: query is the user input
     :param query:
     :param chatbot:
     :param history:
     :return:
     """
-    # messages = build_message_for_gpt35(query, history)
-    # response, success = send_one_query(query, messages, model="gpt-35-turbo")
-    # response = response["choices"][0]["message"]["content"]
+    if query and history[-1]["role"] != "user":
+        history.append({"role": "user", "content": query})
 
-    #
-    if history[-1][1] is not None or chatbot[-1][1] is not None:
+    if history[-1]["role"] != "user":
+        gr.Warning('You should generate or type user-input first.')
         return chatbot, history
 
-    if query is None:
-        query = history[-1][0]
-    response = bot.generate_response(query, history[:-1])
-    # chatbot.append((query, response))
-    history[-1] = (query, response)
+    response = bot.generate_response(history)
+    query = history[-1]["content"]
     chatbot[-1] = (query, response)
+    history.append({"role": "assistant", "content": response})
     print(f"chatbot is {chatbot}")
     print(f"history is {history}")
     return chatbot, history
 
 
+def generate():
+    """
+
+    :return:
+    """
+    pass
+
+
+def regenerate():
+    """
+    Delete the last round and regenerate.
+    :return:
+    """
+    pass
+
+
 def reset_user_input():
     return gr.update(value='')
 
 
-def reset_state():
-    return [], []
+def reset_state(system):
+    return [], [{"role": "system", "content": system}]
+
 
+system_list = [
+    "You are a helpful assistant.",
+    "你是一个导游。",
+    "你是一个英语老师。",
+    "你是一个程序员。",
+    "你是一个心理咨询师。",
+]
 
 """
 TODO: usage instructions
-
-avatar_images
 """
 with gr.Blocks() as demo:
-    gr.HTML("""<h1 align="center">欢迎使用 self chat 人工智能助手!</h1>""")
-
-    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-    system = gr.Textbox(show_label=False, placeholder="You are a helpful assistant.")
-    chatbot = gr.Chatbot(avatar_images=("assets/profile.png", "assets/bot.png"))
+    # Knowledge Distillation through Self Chatting
+    gr.HTML("""<h1 align="center">Distilling the Knowledge through Self Chatting</h1>""")
+    system = gr.Dropdown(
+        choices=system_list,
+        value=system_list[0],
+        allow_custom_value=True,
+        interactive=True,
+        label="System message"
+    )
+    chatbot = gr.Chatbot(avatar_images=("assets/man.png", "assets/bot.png"))
     with gr.Row():
         with gr.Column(scale=4):
            user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10)
@@ -156,18 +187,22 @@ with gr.Blocks() as demo:
                 # info="Will add more animals later!"
             ),
 
-    history = gr.State([])  # (message, bot_message)
+    history = gr.State([{"role": "system", "content": system_list[0]}])
+
+    system.change(reset_state, inputs=[system], outputs=[chatbot, history], show_progress="full")
 
     submit_btn.click(generate_response, [user_input, chatbot, history], [chatbot, history],
-                     show_progress=True)
+                     show_progress="full")
     # submit_btn.click(reset_user_input, [], [user_input])
 
-    clear_btn.click(reset_state, outputs=[chatbot, history], show_progress=True)
+    clear_btn.click(reset_state, inputs=[system], outputs=[chatbot, history], show_progress="full")
 
-    generate_query_btn.click(generate_query, [chatbot, history], outputs=[user_input, chatbot, history], show_progress=True)
+    generate_query_btn.click(generate_query, [chatbot, history], outputs=[user_input, chatbot, history],
+                             show_progress="full")
 
             gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature",
+                      info="Larger temperature increases the randomness"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
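
The app now threads a single message list (`[{"role": ..., "content": ...}, ...]`) through `generate_query` and `generate_response` instead of the old `(query, response)` tuples. Below is a minimal sketch (not part of this commit) of the resulting self-chat loop, assuming the `bot` object exported by `models/cpp_qwen2.py`:

```python
# Sketch only: driving one self-chat session headlessly, assuming `bot` exposes
# generate_query(messages) and generate_response(messages) as in models/cpp_qwen2.py.
from models.cpp_qwen2 import bot

history = [{"role": "system", "content": "You are a helpful assistant."}]
for _ in range(3):                                    # three user/assistant rounds
    query = bot.generate_query(history)               # model plays the user
    history.append({"role": "user", "content": query})
    response = bot.generate_response(history)         # model plays the assistant
    history.append({"role": "assistant", "content": response})
    print(f"user: {query}\nassistant: {response}")
```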
models/cpp_qwen2.py CHANGED
@@ -1,36 +1,49 @@
 """
 https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
 https://github.com/awinml/llama-cpp-python-bindings
+
+python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
+
+python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/
+
+
+./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
+
+./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
+
+./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
+
 """
 
 from simulator import Simulator
-from llama_cpp import Llama
-import llama_cpp.llama_tokenizer
+import llama_cpp
+# import llama_cpp.llama_tokenizer
 from transformers import AutoTokenizer
 
 
 class Qwen2Simulator(Simulator):
 
     def __init__(self, model_name_or_path=None):
-        # self.llm = llama_cpp.Llama.from_pretrained(
-        #     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
-        #     filename="*q8_0.gguf",  #
-        #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-        #         "Qwen/Qwen1.5-0.5B-Chat"
-        #     ),
-        #     verbose=False,
-        # )
-
-        self.hf_tokenizer = AutoTokenizer.from_pretrained("/workspace/czy/model_weights/Qwen1.5-0.5B-Chat/")
-        self.llm = Llama(
-            model_path="/workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf",
-            # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
-            # seed=1337,  # Uncomment to set a specific seed
-            # n_ctx=2048,  # Uncomment to increase the context window
+        self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Chat")
+        self.llm = llama_cpp.Llama.from_pretrained(
+            repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
+            filename="*fp16.gguf",
             tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
             verbose=False,
         )
 
+        ### local
+        # self.hf_tokenizer = AutoTokenizer.from_pretrained("/workspace/xusong/huggingface/models/Qwen2-0.5B-Chat/")
+        # self.llm = Llama(
+        #     model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Chat-GGUF/qwen2-0_5b-chat-q8_0.gguf",
+        #     # model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Chat/Qwen2-0.5B-Chat-F16.gguf",
+        #     # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
+        #     # seed=1337,  # Uncomment to set a specific seed
+        #     # n_ctx=2048,  # Uncomment to increase the context window
+        #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
+        #     verbose=False,
+        # )
+
 
     def generate_query(self, messages):
         """
@@ -62,29 +75,42 @@ class Qwen2Simulator(Simulator):
 
 
     def _generate(self, inputs):
+        """
+        qwen2-0.5b-chat has a bug: sometimes the generated user turn does not end with <|im_end|>, for example:
+        <|im_start|>system
+        you are a helpful assistant<|im_end|>
+        <|im_start|>user
+        hi, what your name<|im_end|>
+        <|im_start|>assistant
+        My name is Jordan<|im_end|>
+        <|im_start|>user        # the above is the input, the following is generated
+        how old are you?
+        <|im_start|>assistant
+        I am a 41-year-old man.<|im_end|>
+        """
         # stream=False
         output = self.llm(
             inputs,
             max_tokens=20,
-            temperature=0.7,
-            stop=["<|im_end|>"]
+            temperature=5,
+            stop=["<|im_end|>", "<|im_start|>"]
         )
         output_text = output["choices"][0]["text"]
         return output_text
 
 
 
-bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
+bot = Qwen2Simulator()
 
 
 if __name__ == "__main__":
 
-    messages = [
-        {"role": "system", "content": "you are a helpful assistant"},
-        {"role": "user", "content": "What is the capital of France?"}
-    ]
-    output = bot.generate_response(messages)
-    print(output)
+    # messages = [
+    #     {"role": "system", "content": "you are a helpful assistant"},
+    #     {"role": "user", "content": "What is the capital of France?"}
+    # ]
+    # output = bot.generate_response(messages)
+    # print(output)
 
     messages = [
         {"role": "system", "content": "you are a helpful assistant"},
models/{qwen2_util.py → hf_qwen2.py} RENAMED
@@ -4,43 +4,50 @@ from threading import Thread
 from simulator import Simulator
 
 from transformers import TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 class Qwen2Simulator(Simulator):
 
-    def generate_query(self, history):
-
-        inputs = ""
-        if history:
-            messages = []
-            for query, response in history:
-                messages += [
-                    {"role": "user", "content": query},
-                    {"role": "assistant", "content": response},
-                ]
-
-            inputs += self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=False,
-            )
+    def __init__(self, model_name_or_path):
+        """
+        When device_map is passed, low_cpu_mem_usage is automatically set to True
+        """
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.model.eval()
+        self.generation_kwargs = dict(
+            do_sample=True,
+            temperature=0.7,
+            # repetition_penalty=
+            max_length=500,
+            max_new_tokens=200
+        )
+
+    def generate_query(self, messages):
+        """
+        :param messages:
+        :return:
+        """
+        assert messages[-1]["role"] != "user"
+        inputs = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=False,
+        )
         inputs = inputs + "<|im_start|>user\n"
         input_ids = self.tokenizer.encode(inputs, return_tensors="pt").to(self.model.device)
         return self._generate(input_ids)
         # for new_text in self._stream_generate(input_ids):
         #     yield new_text
 
-    def generate_response(self, query, history):
-        messages = []
-        for _query, _response in history:
-            if _response is None:
-                pass
-            messages += [
-                {"role": "user", "content": _query},
-                {"role": "assistant", "content": _response},
-            ]
-        messages.append({"role": "user", "content": query})
-
+    def generate_response(self, messages):
+        assert messages[-1]["role"] == "user"
         input_ids = self.tokenizer.apply_chat_template(
             messages,
             tokenize=True,
@@ -52,7 +59,6 @@ class Qwen2Simulator(Simulator):
         #     yield new_text
 
     def _generate(self, input_ids):
-
         input_ids_length = input_ids.shape[-1]
         response = self.model.generate(input_ids=input_ids, **self.generation_kwargs)
         return self.tokenizer.decode(response[0][input_ids_length:], skip_special_tokens=True)
@@ -72,14 +78,22 @@ class Qwen2Simulator(Simulator):
             yield new_text
 
 
-# bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
-bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
+bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
+# bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
+
 
+if __name__ == "__main__":
+    # messages = [
+    #     {"role": "system", "content": "you are a helpful assistant"},
+    #     {"role": "user", "content": "hi, what your name"}
+    # ]
+    # output = bot.generate_response(messages)
+    # print(output)
 
-#
-# history = [["hi, what your name", "rhino"]]
-# generated_query = bot.generate_query(history)
-# for char in generated_query:
-#     print(char)
-#
-# bot.generate_response("1+2*3=", history)
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "hi, what your name"},
+        {"role": "assistant", "content": "My name is Jordan"}
+    ]
+    output = bot.generate_query(messages)
+    print(output)
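
The trick in `generate_query` is to render the finished turns with `add_generation_prompt=False` and then open a new `<|im_start|>user` block by hand, so the model continues by writing the user's next message rather than an assistant reply. A small sketch of just that prompt construction, assuming the Qwen2 chat template:

```python
# Sketch only: how generate_query builds a prompt that makes the model write the user turn.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
messages = [
    {"role": "system", "content": "you are a helpful assistant"},
    {"role": "user", "content": "hi, what your name"},
    {"role": "assistant", "content": "My name is Jordan"},
]
# Render completed turns only; no assistant generation prompt is appended.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
# Manually open the next user turn so sampling produces the user's message.
prompt += "<|im_start|>user\n"
print(prompt)
```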
requirements.txt CHANGED
@@ -2,3 +2,4 @@ huggingface_hub==0.22.2
 transformers
 torch
 accelerate
+llama-cpp-python