xu song committed
Commit a752f35 (1 parent: b70508d): update

Files changed:
- app.py +6 -6
- app_util.py +4 -0
- models/cpp_qwen2.py +3 -2
app.py
CHANGED
@@ -32,7 +32,7 @@ with gr.Blocks() as demo:
    # Knowledge Distillation through Self Chatting
    # Distilling the Knowledge from LLM through Self Chatting
    # Generating Synthetic Data through Self Chat
-   gr.HTML("""<h1 align="center">Generating Synthetic Data
+   gr.HTML("""<h1 align="center">Generating Synthetic Data Through Self-Chat</h1>""")
    with gr.Row():
        with gr.Column(scale=5):
            system = gr.Dropdown(

@@ -58,10 +58,10 @@ with gr.Blocks() as demo:
    undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm", )
    clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm", )  # 🧹 Clear History
    # stop_btn = gr.Button("停止生成", variant="stop", visible=False)
-   gr.Markdown(
-       "Self-chat is a demo, which makes the model talk to itself. "
-       "It is based on user simulator and response generator.",
-       visible=True)
+   # gr.Markdown(
+   #     "Self-chat is a demo, which makes the model talk to itself. "
+   #     "It is based on user simulator and response generator.",
+   #     visible=True)

    # also called chat-assistant
    with gradio.Tab("Response Generator", visible=False):

@@ -96,7 +96,7 @@ with gr.Blocks() as demo:
    )
    with gr.Accordion(label="Parameters", open=True):
        slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
-                                         value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max tokens")
+                                         value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max New tokens")
        slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
                                       value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
                                       info="Larger temperature increase the randomness")
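For reference, this is roughly what the "Parameters" widgets from the last hunk look like in runnable form. A minimal sketch assuming only that gradio is installed; the `config` stand-in and the echo callback are placeholders, not the repository's actual wiring:

```python
# Minimal, self-contained sketch of the sliders touched by this commit.
# `config` here is a stand-in for the repo's config module (assumed defaults).
import gradio as gr


class config:
    DEFAULT_MAX_NEW_TOKENS = 512   # assumed default, not from the repo
    DEFAULT_TEMPERATURE = 0.7      # assumed default, not from the repo


def echo_params(max_new_tokens, temperature):
    # Placeholder callback: just report the chosen sampling parameters.
    return f"max_new_tokens={int(max_new_tokens)}, temperature={temperature:.1f}"


with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center">Generating Synthetic Data Through Self-Chat</h1>""")
    with gr.Accordion(label="Parameters", open=True):
        slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
                                          value=config.DEFAULT_MAX_NEW_TOKENS,
                                          step=1, label="Max New tokens")
        slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
                                       value=config.DEFAULT_TEMPERATURE, step=0.1,
                                       label="Temperature",
                                       info="Larger temperature increases the randomness")
    out = gr.Textbox(label="Current settings")
    gr.Button("Show").click(echo_params,
                            [slider_max_new_tokens, slider_temperature], out)

if __name__ == "__main__":
    demo.launch()
```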
app_util.py
CHANGED
@@ -21,9 +21,13 @@ from models.cpp_qwen2 import bot
    system_list = [
        "You are a helpful assistant.",
        "你是一个导游。",
+       "你是一名投资经理。",
+       "你是一名医生。",
        "你是一个英语老师。",
        "你是一个程序员。",
        "你是一个心理咨询师。",
+       "你是一名AI写作助手。"
+       "你是一名作家,擅长写小说。"
    ]

    bot.pre_cache_system(system_list)
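The four added system prompts are an investment manager, a doctor, an AI writing assistant, and a novelist ("a writer who is good at writing novels"). One detail worth flagging: the two strings appended before the closing bracket are not separated by a comma, so Python's implicit string concatenation folds them into a single list element. A small sketch of the effect, with the presumably intended comma added in the second list:

```python
# Effect of the missing comma between the two appended entries.
merged = [
    "你是一名AI写作助手。"       # no trailing comma: Python concatenates this ...
    "你是一名作家,擅长写小说。"  # ... with this into ONE element
]
assert len(merged) == 1
assert merged[0] == "你是一名AI写作助手。你是一名作家,擅长写小说。"

# Presumably intended: two separate system prompts (comma added here).
intended = [
    "你是一名AI写作助手。",       # "You are an AI writing assistant."
    "你是一名作家,擅长写小说。",  # "You are a writer who is good at writing novels."
]
assert len(intended) == 2
```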
models/cpp_qwen2.py
CHANGED
@@ -125,7 +125,7 @@ class Qwen2Simulator(Simulator):
        # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx

        # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
-       cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
+       cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)  # 2G
        self.llm.set_cache(cache)

    def tokenize(self, text):

@@ -203,13 +203,14 @@ class Qwen2Simulator(Simulator):
        output = self.llm.create_completion(
            input_ids,
            stream=False,
-           max_tokens=
+           max_tokens=1,
            top_k=1
        )
        logger.info(f"cache size {self.llm.cache.cache_size}")

        # disable cache after
        llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
+       llama_cpp.Llama.save_state = lambda *args: None

    def post_cache(self, suffix_tokens):
        """ warmup for next turn generation
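Taken together, the cpp_qwen2.py changes read as a prompt-cache warmup pattern: attach a RAM cache of 2 << 30 bytes (2 GiB), run a completion capped at max_tokens=1 so the call mainly stores the prefix KV state, then monkey-patch LlamaRAMCache.__setitem__ and Llama.save_state to no-ops so later calls can reuse the cached prefix without writing new states. A minimal sketch under those assumptions; it uses only documented llama-cpp-python calls, and the model path is hypothetical:

```python
# Sketch of the caching pattern implied by the diff, not the repo's exact code.
# Assumes llama-cpp-python is installed and MODEL_PATH points to a real GGUF file.
import llama_cpp

MODEL_PATH = "qwen2-instruct-q4_k_m.gguf"  # hypothetical path

llm = llama_cpp.Llama(model_path=MODEL_PATH, n_ctx=2048, verbose=False)

# 1. Attach an in-RAM prompt cache capped at 2 GiB (2 << 30 bytes).
cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
llm.set_cache(cache)

# 2. Warm up: evaluate the system prompt but request only a single token,
#    so the completion call mostly just stores the prefix state in the cache.
system_prompt = "You are a helpful assistant."
input_ids = llm.tokenize(system_prompt.encode("utf-8"))
llm.create_completion(input_ids, stream=False, max_tokens=1, top_k=1)
print(f"cache size: {llm.cache.cache_size}")

# 3. Freeze the cache: later completions reuse the stored prefix state,
#    but no new states are written (and the save_state copy is skipped).
llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
llama_cpp.Llama.save_state = lambda *args: None
```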