xu song committed
Commit 8988bbf
1 Parent(s): b597747
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 4.36.1
+sdk_version: 4.39.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -2,6 +2,10 @@
 From https://github.com/OpenLMLab/MOSS/blob/main/moss_web_demo_gradio.py
 
 
+# Difficulties
+
+
+
 # Errors out on a single GPU
 python moss_web_demo_gradio.py --model_name fnlp/moss-moon-003-sft --gpu 0,1,2,3
 
@@ -9,6 +13,11 @@ python moss_web_demo_gradio.py --model_name fnlp/moss-moon-003-sft --gpu 0,1,2,3
 - the first utterance:
 - preview of code and tables
 - editable chatbot: https://github.com/gradio-app/gradio/issues/4444
+- a button,
+
+## Reference
+
+-
 """
 
 from transformers.generation.utils import logger
@@ -18,13 +27,9 @@ import argparse
 import warnings
 import torch
 import os
-# from moss_util import generate_query
-from models.qwen2_util import bot
-# generate_query = None
-
-# gr.ChatInterface
+# from models.hf_qwen2 import bot
+from models.cpp_qwen2 import bot
 
-# from gpt35 import build_message_for_gpt35, send_one_query
 
 #
 # def postprocess(self, y):
@@ -75,61 +80,87 @@ def parse_text(text):
 
 
 def generate_query(chatbot, history):
-    if history and history[-1][1] is None:  # time to generate a response
+    if history and history[-1]["role"] == "user":  # time to generate a response
+        gr.Warning('You should generate assistant-response.')
         return None, chatbot, history
     query = bot.generate_query(history)
     # chatbot.append((query, ""))
     chatbot.append((query, None))
-    history = history + [(query, None)]
+    history.append({"role": "user", "content": query})
    return query, chatbot, history
 
+
 def generate_response(query, chatbot, history):
     """
-    Automatic mode: query is None, or query = history[-1][0]
-    Manual mode: query can be any value
+    Automatic mode: query is None
+    Manual mode: query is the user input
     :param query:
     :param chatbot:
     :param history:
     :return:
     """
-    # messages = build_message_for_gpt35(query, history)
-    # response, success = send_one_query(query, messages, model="gpt-35-turbo")
-    # response = response["choices"][0]["message"]["content"]
+    if query and history[-1]["role"] != "user":
+        history.append({"role": "user", "content": query})
 
-    #
-    if history[-1][1] is not None or chatbot[-1][1] is not None:
+    if history[-1]["role"] != "user":
+        gr.Warning('You should generate or type user-input first.')
         return chatbot, history
 
-    if query is None:
-        query = history[-1][0]
-    response = bot.generate_response(query, history[:-1])
-    # chatbot.append((query, response))
-    history[-1] = (query, response)
+    response = bot.generate_response(history)
+    query = history[-1]["content"]
     chatbot[-1] = (query, response)
+    history.append({"role": "assistant", "content": response})
     print(f"chatbot is {chatbot}")
     print(f"history is {history}")
     return chatbot, history
 
 
+def generate():
+    """
+
+    :return:
+    """
+    pass
+
+
+def regenerate():
+    """
+    Delete the last round and regenerate.
+    :return:
+    """
+    pass
+
+
 def reset_user_input():
     return gr.update(value='')
 
 
-def reset_state():
-    return [], []
+def reset_state(system):
+    return [], [{"role": "system", "content": system}]
+
 
+system_list = [
+    "You are a helpful assistant.",
+    "你是一个导游。",
+    "你是一个英语老师。",
+    "你是一个程序员。",
+    "你是一个心理咨询师。",
+]
 
 """
 TODO: usage instructions
-
-avatar_images
 """
 with gr.Blocks() as demo:
-    gr.HTML("""<h1 align="center">欢迎使用 self chat 人工智能助手!</h1>""")
-
-    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-    system = gr.Textbox(show_label=False, placeholder="You are a helpful assistant.")
-    chatbot = gr.Chatbot(avatar_images=("assets/profile.png", "assets/bot.png"))
+    # Knowledge Distillation through Self Chatting
+    gr.HTML("""<h1 align="center">Distilling the Knowledge through Self Chatting</h1>""")
+    system = gr.Dropdown(
+        choices=system_list,
+        value=system_list[0],
+        allow_custom_value=True,
+        interactive=True,
+        label="System message"
+    )
+    chatbot = gr.Chatbot(avatar_images=("assets/man.png", "assets/bot.png"))
     with gr.Row():
         with gr.Column(scale=4):
            user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10)
@@ -156,18 +187,22 @@ with gr.Blocks() as demo:
                 # info="Will add more animals later!"
             ),
 
-    history = gr.State([])  # (message, bot_message)
+    history = gr.State([{"role": "system", "content": system_list[0]}])
+
+    system.change(reset_state, inputs=[system], outputs=[chatbot, history], show_progress="full")
 
     submit_btn.click(generate_response, [user_input, chatbot, history], [chatbot, history],
-                     show_progress=True)
+                     show_progress="full")
     # submit_btn.click(reset_user_input, [], [user_input])
 
-    clear_btn.click(reset_state, outputs=[chatbot, history], show_progress=True)
+    clear_btn.click(reset_state, inputs=[system], outputs=[chatbot, history], show_progress="full")
 
-    generate_query_btn.click(generate_query, [chatbot, history], outputs=[user_input, chatbot, history], show_progress=True)
+    generate_query_btn.click(generate_query, [chatbot, history], outputs=[user_input, chatbot, history],
+                             show_progress="full")
 
             gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature",
+                      info="Larger temperature increases the randomness"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
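
The app now threads a single message list (`[{"role": ..., "content": ...}, ...]`) through `generate_query` and `generate_response` instead of the old `(query, response)` tuples. Below is a minimal sketch (not part of this commit) of the resulting self-chat loop, assuming the `bot` object exported by `models/cpp_qwen2.py`:

```python
# Sketch only: driving one self-chat session headlessly, assuming `bot` exposes
# generate_query(messages) and generate_response(messages) as in models/cpp_qwen2.py.
from models.cpp_qwen2 import bot

history = [{"role": "system", "content": "You are a helpful assistant."}]
for _ in range(3):                                    # three user/assistant rounds
    query = bot.generate_query(history)               # model plays the user
    history.append({"role": "user", "content": query})
    response = bot.generate_response(history)         # model plays the assistant
    history.append({"role": "assistant", "content": response})
    print(f"user: {query}\nassistant: {response}")
```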
models/cpp_qwen2.py CHANGED
@@ -1,36 +1,49 @@
 """
 https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
 https://github.com/awinml/llama-cpp-python-bindings
+
+python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
+
+python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/
+
+
+./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
+
+./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
+
+./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
+
 """
 
 from simulator import Simulator
-from llama_cpp import Llama
-import llama_cpp.llama_tokenizer
+import llama_cpp
+# import llama_cpp.llama_tokenizer
 from transformers import AutoTokenizer
 
 
 class Qwen2Simulator(Simulator):
 
     def __init__(self, model_name_or_path=None):
-        # self.llm = llama_cpp.Llama.from_pretrained(
-        #     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
-        #     filename="*q8_0.gguf",  #
-        #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-        #         "Qwen/Qwen1.5-0.5B-Chat"
-        #     ),
-        #     verbose=False,
-        # )
-
-        self.hf_tokenizer = AutoTokenizer.from_pretrained("/workspace/czy/model_weights/Qwen1.5-0.5B-Chat/")
-        self.llm = Llama(
-            model_path="/workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf",
-            # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
-            # seed=1337,  # Uncomment to set a specific seed
-            # n_ctx=2048,  # Uncomment to increase the context window
+        self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Chat")
+        self.llm = llama_cpp.Llama.from_pretrained(
+            repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
+            filename="*fp16.gguf",
             tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
             verbose=False,
         )
 
+        ### local
+        # self.hf_tokenizer = AutoTokenizer.from_pretrained("/workspace/xusong/huggingface/models/Qwen2-0.5B-Chat/")
+        # self.llm = Llama(
+        #     model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Chat-GGUF/qwen2-0_5b-chat-q8_0.gguf",
+        #     # model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Chat/Qwen2-0.5B-Chat-F16.gguf",
+        #     # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
+        #     # seed=1337,  # Uncomment to set a specific seed
+        #     # n_ctx=2048,  # Uncomment to increase the context window
+        #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
+        #     verbose=False,
+        # )
+
 
     def generate_query(self, messages):
         """
@@ -62,29 +75,42 @@ class Qwen2Simulator(Simulator):
 
 
     def _generate(self, inputs):
+        """
+        qwen2-0.5b-chat has a bug: sometimes the generated user turn does not end with <|im_end|>, for example:
+        <|im_start|>system
+        you are a helpful assistant<|im_end|>
+        <|im_start|>user
+        hi, what your name<|im_end|>
+        <|im_start|>assistant
+        My name is Jordan<|im_end|>
+        <|im_start|>user        # the above is the input, the following is generated
+        how old are you?
+        <|im_start|>assistant
+        I am a 41-year-old man.<|im_end|>
+        """
         # stream=False
         output = self.llm(
             inputs,
             max_tokens=20,
-            temperature=0.7,
-            stop=["<|im_end|>"]
+            temperature=5,
+            stop=["<|im_end|>", "<|im_start|>"]
         )
         output_text = output["choices"][0]["text"]
         return output_text
 
 
 
-bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
+bot = Qwen2Simulator()
 
 
 if __name__ == "__main__":
 
-    messages = [
-        {"role": "system", "content": "you are a helpful assistant"},
-        {"role": "user", "content": "What is the capital of France?"}
-    ]
-    output = bot.generate_response(messages)
-    print(output)
+    # messages = [
+    #     {"role": "system", "content": "you are a helpful assistant"},
+    #     {"role": "user", "content": "What is the capital of France?"}
+    # ]
+    # output = bot.generate_response(messages)
+    # print(output)
 
     messages = [
         {"role": "system", "content": "you are a helpful assistant"},
models/{qwen2_util.py → hf_qwen2.py} RENAMED
@@ -4,43 +4,50 @@ from threading import Thread
 from simulator import Simulator
 
 from transformers import TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 class Qwen2Simulator(Simulator):
 
-    def generate_query(self, history):
-
-        inputs = ""
-        if history:
-            messages = []
-            for query, response in history:
-                messages += [
-                    {"role": "user", "content": query},
-                    {"role": "assistant", "content": response},
-                ]
-
-            inputs += self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=False,
-            )
+    def __init__(self, model_name_or_path):
+        """
+        When device_map is passed, low_cpu_mem_usage is automatically set to True
+        """
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.model.eval()
+        self.generation_kwargs = dict(
+            do_sample=True,
+            temperature=0.7,
+            # repetition_penalty=
+            max_length=500,
+            max_new_tokens=200
+        )
+
+    def generate_query(self, messages):
+        """
+        :param messages:
+        :return:
+        """
+        assert messages[-1]["role"] != "user"
+        inputs = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=False,
+        )
         inputs = inputs + "<|im_start|>user\n"
         input_ids = self.tokenizer.encode(inputs, return_tensors="pt").to(self.model.device)
         return self._generate(input_ids)
         # for new_text in self._stream_generate(input_ids):
         #     yield new_text
 
-    def generate_response(self, query, history):
-        messages = []
-        for _query, _response in history:
-            if _response is None:
-                pass
-            messages += [
-                {"role": "user", "content": _query},
-                {"role": "assistant", "content": _response},
-            ]
-        messages.append({"role": "user", "content": query})
-
+    def generate_response(self, messages):
+        assert messages[-1]["role"] == "user"
         input_ids = self.tokenizer.apply_chat_template(
             messages,
             tokenize=True,
@@ -52,7 +59,6 @@ class Qwen2Simulator(Simulator):
         #     yield new_text
 
     def _generate(self, input_ids):
-
         input_ids_length = input_ids.shape[-1]
         response = self.model.generate(input_ids=input_ids, **self.generation_kwargs)
         return self.tokenizer.decode(response[0][input_ids_length:], skip_special_tokens=True)
@@ -72,14 +78,22 @@ class Qwen2Simulator(Simulator):
             yield new_text
 
 
-# bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
-bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
+bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
+# bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
+
 
+if __name__ == "__main__":
+    # messages = [
+    #     {"role": "system", "content": "you are a helpful assistant"},
+    #     {"role": "user", "content": "hi, what your name"}
+    # ]
+    # output = bot.generate_response(messages)
+    # print(output)
 
-#
-# history = [["hi, what your name", "rhino"]]
-# generated_query = bot.generate_query(history)
-# for char in generated_query:
-#     print(char)
-#
-# bot.generate_response("1+2*3=", history)
+    messages = [
+        {"role": "system", "content": "you are a helpful assistant"},
+        {"role": "user", "content": "hi, what your name"},
+        {"role": "assistant", "content": "My name is Jordan"}
+    ]
+    output = bot.generate_query(messages)
+    print(output)
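
The trick in `generate_query` is to render the finished turns with `add_generation_prompt=False` and then open a new `<|im_start|>user` block by hand, so the model continues by writing the user's next message rather than an assistant reply. A small sketch of just that prompt construction, assuming the Qwen2 chat template:

```python
# Sketch only: how generate_query builds a prompt that makes the model write the user turn.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
messages = [
    {"role": "system", "content": "you are a helpful assistant"},
    {"role": "user", "content": "hi, what your name"},
    {"role": "assistant", "content": "My name is Jordan"},
]
# Render completed turns only; no assistant generation prompt is appended.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
# Manually open the next user turn so sampling produces the user's message.
prompt += "<|im_start|>user\n"
print(prompt)
```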
requirements.txt CHANGED
@@ -2,3 +2,4 @@ huggingface_hub==0.22.2
 transformers
 torch
 accelerate
+llama-cpp-python