xu song committed on
Commit
a752f35
1 Parent(s): b70508d
Files changed (3)
  1. app.py +6 -6
  2. app_util.py +4 -0
  3. models/cpp_qwen2.py +3 -2
app.py CHANGED
@@ -32,7 +32,7 @@ with gr.Blocks() as demo:
  # Knowledge Distillation through Self Chatting
  # Distilling the Knowledge from LLM through Self Chatting
  # Generating Synthetic Data through Self Chat
- gr.HTML("""<h1 align="center">Generating Synthetic Data through Self Chat</h1>""")
+ gr.HTML("""<h1 align="center">Generating Synthetic Data Through Self-Chat</h1>""")
  with gr.Row():
  with gr.Column(scale=5):
  system = gr.Dropdown(
@@ -58,10 +58,10 @@
  undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm", )
  clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
  # stop_btn = gr.Button("停止生成", variant="stop", visible=False)
- gr.Markdown(
-     "Self-chat is a demo, which makes the model talk to itself. "
-     "It is based on user simulator and response generator.",
-     visible=True)
+ # gr.Markdown(
+ #     "Self-chat is a demo, which makes the model talk to itself. "
+ #     "It is based on user simulator and response generator.",
+ #     visible=True)

  # 也叫 chat-assistant,
  with gradio.Tab("Response Generator", visible=False):
@@ -96,7 +96,7 @@
  )
  with gr.Accordion(label="Parameters", open=True):
  slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
- value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max tokens")
+ value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max New tokens")
  slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
  value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
  info="Larger temperature increase the randomness")
app_util.py CHANGED
@@ -21,9 +21,13 @@ from models.cpp_qwen2 import bot
  system_list = [
  "You are a helpful assistant.",
  "你是一个导游。",
+ "你是一名投资经理。",
+ "你是一名医生。",
  "你是一个英语老师。",
  "你是一个程序员。",
  "你是一个心理咨询师。",
+ "你是一名AI写作助手。"
+ "你是一名作家,擅长写小说。"
  ]

  bot.pre_cache_system(system_list)
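
The new role prompts (an investment manager, a doctor, an AI writing assistant, a novelist) are added so their prefixes can be warmed ahead of time by `bot.pre_cache_system`. That method's body is not shown in this commit; a plausible sketch, based on the cache-warmup logic visible in models/cpp_qwen2.py below (the method body and the chat-template string are assumptions):

```python
def pre_cache_system(self, system_list):
    """Run a throwaway 1-token completion per system prompt so its prefix
    KV state is stored in the RAM cache before any user request arrives."""
    for system in system_list:
        # Qwen2-style chat-template prefix; the exact format is an assumption here.
        prefix = f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n"
        input_ids = self.llm.tokenize(prefix.encode("utf-8"), add_bos=False, special=True)
        self.llm.create_completion(input_ids, stream=False, max_tokens=1, top_k=1)
```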
models/cpp_qwen2.py CHANGED
@@ -125,7 +125,7 @@ class Qwen2Simulator(Simulator):
  # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx

  # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
- cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
+ cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)  # 2G
  self.llm.set_cache(cache)

  def tokenize(self, text):
@@ -203,13 +203,14 @@ class Qwen2Simulator(Simulator):
  output = self.llm.create_completion(
  input_ids,
  stream=False,
- max_tokens=3,
+ max_tokens=1,
  top_k=1
  )
  logger.info(f"cache size {self.llm.cache.cache_size}")

  # disable cache after
  llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
+ llama_cpp.Llama.save_state = lambda *args: None

  def post_cache(self, suffix_tokens):
  """ warmup for next turn generation