xu song committed
Commit a752f35 (1 parent: b70508d): update

Files changed:
- app.py +6 -6
- app_util.py +4 -0
- models/cpp_qwen2.py +3 -2
app.py
CHANGED
@@ -32,7 +32,7 @@ with gr.Blocks() as demo:
    # Knowledge Distillation through Self Chatting
    # Distilling the Knowledge from LLM through Self Chatting
    # Generating Synthetic Data through Self Chat
-   gr.HTML("""<h1 align="center">Generating Synthetic Data
+   gr.HTML("""<h1 align="center">Generating Synthetic Data Through Self-Chat</h1>""")
    with gr.Row():
        with gr.Column(scale=5):
            system = gr.Dropdown(

@@ -58,10 +58,10 @@ with gr.Blocks() as demo:
    undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm", )
    clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm", )  # 🧹 Clear History
    # stop_btn = gr.Button("停止生成", variant="stop", visible=False)
-   gr.Markdown(
-       "Self-chat is a demo, which makes the model talk to itself. "
-       "It is based on user simulator and response generator.",
-       visible=True)
+   # gr.Markdown(
+   #     "Self-chat is a demo, which makes the model talk to itself. "
+   #     "It is based on user simulator and response generator.",
+   #     visible=True)

    # also called chat-assistant
    with gradio.Tab("Response Generator", visible=False):

@@ -96,7 +96,7 @@ with gr.Blocks() as demo:
    )
    with gr.Accordion(label="Parameters", open=True):
        slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
-                                         value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max tokens")
+                                         value=config.DEFAULT_MAX_NEW_TOKENS, step=1, label="Max New tokens")
        slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
                                       value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
                                       info="Larger temperature increase the randomness")
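For reference, this is roughly what the "Parameters" widgets from the last hunk look like in runnable form. A minimal sketch assuming only that gradio is installed; the `config` stand-in and the echo callback are placeholders, not the repository's actual wiring:

```python
# Minimal, self-contained sketch of the sliders touched by this commit.
# `config` here is a stand-in for the repo's config module (assumed defaults).
import gradio as gr


class config:
    DEFAULT_MAX_NEW_TOKENS = 512   # assumed default, not from the repo
    DEFAULT_TEMPERATURE = 0.7      # assumed default, not from the repo


def echo_params(max_new_tokens, temperature):
    # Placeholder callback: just report the chosen sampling parameters.
    return f"max_new_tokens={int(max_new_tokens)}, temperature={temperature:.1f}"


with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center">Generating Synthetic Data Through Self-Chat</h1>""")
    with gr.Accordion(label="Parameters", open=True):
        slider_max_new_tokens = gr.Slider(minimum=1, maximum=4096,
                                          value=config.DEFAULT_MAX_NEW_TOKENS,
                                          step=1, label="Max New tokens")
        slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
                                       value=config.DEFAULT_TEMPERATURE, step=0.1,
                                       label="Temperature",
                                       info="Larger temperature increases the randomness")
    out = gr.Textbox(label="Current settings")
    gr.Button("Show").click(echo_params,
                            [slider_max_new_tokens, slider_temperature], out)

if __name__ == "__main__":
    demo.launch()
```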
app_util.py
CHANGED
@@ -21,9 +21,13 @@ from models.cpp_qwen2 import bot
    system_list = [
        "You are a helpful assistant.",
        "你是一个导游。",
+       "你是一名投资经理。",
+       "你是一名医生。",
        "你是一个英语老师。",
        "你是一个程序员。",
        "你是一个心理咨询师。",
+       "你是一名AI写作助手。"
+       "你是一名作家,擅长写小说。"
    ]

    bot.pre_cache_system(system_list)
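The four added system prompts are an investment manager, a doctor, an AI writing assistant, and a novelist ("a writer who is good at writing novels"). One detail worth flagging: the two strings appended before the closing bracket are not separated by a comma, so Python's implicit string concatenation folds them into a single list element. A small sketch of the effect, with the presumably intended comma added in the second list:

```python
# Effect of the missing comma between the two appended entries.
merged = [
    "你是一名AI写作助手。"       # no trailing comma: Python concatenates this ...
    "你是一名作家,擅长写小说。"  # ... with this into ONE element
]
assert len(merged) == 1
assert merged[0] == "你是一名AI写作助手。你是一名作家,擅长写小说。"

# Presumably intended: two separate system prompts (comma added here).
intended = [
    "你是一名AI写作助手。",       # "You are an AI writing assistant."
    "你是一名作家,擅长写小说。",  # "You are a writer who is good at writing novels."
]
assert len(intended) == 2
```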
models/cpp_qwen2.py
CHANGED
@@ -125,7 +125,7 @@ class Qwen2Simulator(Simulator):
        # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx

        # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
-       cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
+       cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)  # 2G
        self.llm.set_cache(cache)

    def tokenize(self, text):

@@ -203,13 +203,14 @@ class Qwen2Simulator(Simulator):
        output = self.llm.create_completion(
            input_ids,
            stream=False,
-           max_tokens=
+           max_tokens=1,
            top_k=1
        )
        logger.info(f"cache size {self.llm.cache.cache_size}")

        # disable cache after
        llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
+       llama_cpp.Llama.save_state = lambda *args: None

    def post_cache(self, suffix_tokens):
        """ warmup for next turn generation
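Taken together, the cpp_qwen2.py changes read as a prompt-cache warmup pattern: attach a RAM cache of 2 << 30 bytes (2 GiB), run a completion capped at max_tokens=1 so the call mainly stores the prefix KV state, then monkey-patch LlamaRAMCache.__setitem__ and Llama.save_state to no-ops so later calls can reuse the cached prefix without writing new states. A minimal sketch under those assumptions; it uses only documented llama-cpp-python calls, and the model path is hypothetical:

```python
# Sketch of the caching pattern implied by the diff, not the repo's exact code.
# Assumes llama-cpp-python is installed and MODEL_PATH points to a real GGUF file.
import llama_cpp

MODEL_PATH = "qwen2-instruct-q4_k_m.gguf"  # hypothetical path

llm = llama_cpp.Llama(model_path=MODEL_PATH, n_ctx=2048, verbose=False)

# 1. Attach an in-RAM prompt cache capped at 2 GiB (2 << 30 bytes).
cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)
llm.set_cache(cache)

# 2. Warm up: evaluate the system prompt but request only a single token,
#    so the completion call mostly just stores the prefix state in the cache.
system_prompt = "You are a helpful assistant."
input_ids = llm.tokenize(system_prompt.encode("utf-8"))
llm.create_completion(input_ids, stream=False, max_tokens=1, top_k=1)
print(f"cache size: {llm.cache.cache_size}")

# 3. Freeze the cache: later completions reuse the stored prefix state,
#    but no new states are written (and the save_state copy is skipped).
llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
llama_cpp.Llama.save_state = lambda *args: None
```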