David Yeung committed on
Commit
936a3fd
1 Parent(s): c40d733

first commit Qwen 0.5B

Browse files
app.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+
6
+ import gradio as gr
7
+ import hanzidentifier
8
+ import re
9
+
10
+ import chinese_converter
11
+
12
+ # %%
13
+ #Load the LLM model and pipeline directly
14
# Load the chat LLM's tokenizer and weights once, at import time.
llm_model_name = "Qwen/Qwen1.5-0.5B-Chat"

tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForCausalLM.from_pretrained(llm_model_name)
22
+
23
+ # %%
24
+ # %%
25
+ # loading the vector encoder
26
# Text-embedding model used to encode queries for the vector store.
vec_model_name = "shibing624/text2vec-base-chinese"

model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}

huggingface_embeddings = HuggingFaceEmbeddings(
    model_name=vec_model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
36
+
37
+
38
+ # %%
39
# Open the Chroma store persisted under ``chroma/`` and print how many
# entries its collection reports.
persist_directory = "chroma/"
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=huggingface_embeddings,
)
print(vectordb._collection.count())
42
+
43
+ # %%
44
# UI captions, one list per widget. Each list is indexed by language:
# 0 = Simplified Chinese, 1 = Traditional Chinese, 2 = English.
(
    text_input_label,
    text_output_label,
    clear_label,
    submit_label,
) = (
    ["谜面", "謎面", "Riddle"],
    ["谜底", "謎底", "Answer"],
    ["清除", "清除", "Clear"],
    ["提交", "提交", "Submit"],
)
49
+
50
+ # %%
51
+ # helper functions for prompt processing for this LLM
52
+
53
def preprocess(text):
    r"""Escape newlines and tabs so the text contains no raw control breaks.

    Every newline character becomes the two-character sequence ``\n`` and
    every tab character becomes ``\t``.
    """
    escaped = text
    for raw, visible in (("\n", "\\n"), ("\t", "\\t")):
        escaped = escaped.replace(raw, visible)
    return escaped
56
+
57
def postprocess(text):
    r"""Turn escaped ``\n`` / ``\t`` sequences and ``%20`` back into whitespace.

    The replacements are applied in a fixed order: escaped newlines, then
    escaped tabs, then ``%20`` (replaced by a single space).
    """
    replacements = (("\\n", "\n"), ("\\t", "\t"), ("%20", " "))
    restored = text
    for escaped, plain in replacements:
        restored = restored.replace(escaped, plain)
    return restored
59
+
60
+
61
+ # get answer from LLM with prompt input
62
def answer(input_text, context=""):
    """Ask the chat model to solve a riddle and return its reply.

    Args:
        input_text: The riddle text, already formatted by the caller.
        context: Optional hint text inserted under the "提示" section of
            the prompt. Defaults to an empty string.

    Returns:
        The decoded reply as a string, with the prompt tokens and special
        tokens removed.
    """
    prompt = f"{input_text}\n提示:\n{context}\n谜底是什么?请解释。"
    prompt = prompt.strip()

    print(prompt)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Place the inputs on whatever device the model lives on rather than
    # hard-coding CPU, so the function keeps working if the model is moved.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Greedy decoding: do_sample=False makes sampling knobs such as top_p
    # irrelevant, so none are passed. The attention mask is forwarded
    # explicitly so generate() does not have to guess it.
    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=512,
        do_sample=False,
    )
    # generate() returns prompt + completion; keep only the completion.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response
95
+
96
+ # helper function for RAG
97
def helper_rag(text):
    """Return hint text for ``text`` from the vector store.

    Queries ``vectordb`` for the single closest entry and keeps its page
    content only when the relevance score is above 0.7. Each kept entry is
    followed by a newline; the result is an empty string when nothing
    qualifies.
    """
    matches = vectordb.similarity_search_with_relevance_scores(text, k=1)
    relevant = [
        document.page_content + "\n"
        for document, score in matches
        if score > 0.7
    ]
    return "".join(relevant)
106
+
107
+ # helper function for prompt
108
def helper_text(text_input, radio=None):
    """Solve one riddle typed by the user and return the text to display.

    Args:
        text_input: The riddle as typed. ``None`` (a cleared textbox) is
            treated like an empty string.
        radio: The UI language currently selected, or ``None`` when the
            function is called without it (e.g. for cached examples).

    Returns:
        The model's answer, or a format warning when the riddle contains
        neither "猜" nor "打". The result is converted to Traditional
        Chinese when the input is unambiguously Traditional or the UI
        language is "繁體中文".
    """
    # A cleared textbox can deliver None; fall through to the format
    # warning instead of raising inside the script detection.
    text_input = text_input or ""

    # hanzidentifier.is_traditional() is also true for text made only of
    # characters shared by both scripts, which would flip ordinary
    # Simplified input to Traditional output. Require an unambiguous
    # TRADITIONAL verdict; for ambiguous text the UI language decides.
    input_is_traditional = (
        hanzidentifier.identify(text_input) == hanzidentifier.TRADITIONAL
    )
    wants_traditional = input_is_traditional or radio == "繁體中文"

    if input_is_traditional:
        # The prompt sent to the model is always in Simplified Chinese.
        text_input = chinese_converter.to_simplified(text_input)

    # Let English riddles use "Hint:" in place of the Chinese cue word.
    text_input = re.sub(r'hint', "猜", text_input, flags=re.I)

    if not any(c in text_input for c in ["猜", "打"]):
        warning = "请给一个提示,提示格式,例子:猜一水果,打一字。"
        if wants_traditional:
            warning = chinese_converter.to_traditional(warning)
        return warning

    text = f"猜谜语:\n谜面:{text_input}\n"

    context = helper_rag(text)

    output = answer(text, context=context)

    print(output)

    # Apply the same rule as for the warning so both paths agree.
    if wants_traditional:
        output = chinese_converter.to_traditional(output)

    return output
140
+
141
+
142
+
143
+ # Gradio function for configure the language of UI
144
def change_language(radio, text_input, text_output, markdown,
                    markdown_msg1, markdown_msg2):
    """Build the Gradio updates that switch the UI to the chosen language.

    Args:
        radio: Selected language: "简体中文", "繁體中文" or "English". Any
            other value falls back to Simplified Chinese captions.
        text_input: Current content of the riddle textbox (may be ``None``).
        text_output: Current content of the answer textbox (may be ``None``).
        markdown: Current text of the title markdown.
        markdown_msg1: Current text of the introduction markdown.
        markdown_msg2: Current text of the disclaimer markdown.

    Returns:
        A list of seven updates, in this order: riddle textbox, answer
        textbox, clear button, submit button, title markdown, introduction
        markdown, disclaimer markdown.
    """
    # Per language: caption index, script converter (None = leave text
    # as-is), and whether the textbox contents are converted too.
    if radio == "简体中文":
        index, converter, convert_textboxes = 0, chinese_converter.to_simplified, True
    elif radio == "繁體中文":
        index, converter, convert_textboxes = 1, chinese_converter.to_traditional, True
    elif radio == "English":
        index, converter, convert_textboxes = 2, None, False
    else:
        index, converter, convert_textboxes = 0, chinese_converter.to_simplified, False

    def convert(value):
        # Cleared components hold None; pass empty values through instead
        # of handing them to the converter, which expects a string.
        if converter is None or not value:
            return value
        return converter(value)

    if convert_textboxes:
        text_input_update = gr.Textbox.update(
            value=convert(text_input), label=text_input_label[index])
        text_output_update = gr.Textbox.update(
            value=convert(text_output), label=text_output_label[index])
    else:
        # Only the captions change; the textbox contents are left alone.
        text_input_update = gr.Textbox.update(label=text_input_label[index])
        text_output_update = gr.Textbox.update(label=text_output_label[index])

    markdown_update = gr.Markdown.update(value=convert(markdown))
    markdown_msg1_update = gr.Markdown.update(value=convert(markdown_msg1))
    markdown_msg2_update = gr.Markdown.update(value=convert(markdown_msg2))

    clear_btn_update = gr.ClearButton.update(value=clear_label[index])
    submit_btn_update = gr.Button.update(value=submit_label[index])

    return [text_input_update, text_output_update, clear_btn_update,
            submit_btn_update, markdown_update,
            markdown_msg1_update, markdown_msg2_update]
181
+
182
+
183
def clear_text():
    """Return updates that empty the riddle and answer textboxes.

    The result is a two-item list: the update for the input textbox
    followed by the update for the output textbox, both setting the value
    to ``None``.
    """
    return [gr.Textbox.update(value=None) for _ in range(2)]
188
+
189
+
190
+ # %%
191
+ # css = """
192
+ # #markdown { background-image: url("file/data/DSC_0105.jpg");
193
+ # background-size: cover;
194
+ # }
195
+ # """
196
+
197
# Build the Gradio UI: a language selector, the riddle/answer textboxes with
# examples on the left, and the disclaimer/acknowledgement on the right.
with gr.Blocks() as demo:
    # Widgets start out with the captions at label index 0.
    index = 0
    example_list = [
        ["小家伙穿黄袍,花丛中把房造。飞到西来飞到东,人人夸他爱劳动。(猜一动物)"],
        ["一物生来身穿三百多件衣,每天脱一件,年底剩张皮。(猜一物品)"],
        ["A thousand threads, a million strands. Reaching the water, vanishing all at once. (Hint: natural phenomenon)"],
        ["无底洞(猜成语)"],
    ]
    radio = gr.Radio(
        ["简体中文", "繁體中文", "English"], show_label=False, value="简体中文"
    )
    markdown = gr.Markdown(
        """
# Chinese Lantern Riddles Solver with LLM
## 用大语言模型来猜灯谜
""", elem_id="markdown")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label=text_input_label[index],
                value="小家伙穿黄袍,花丛中把房造。飞到西来飞到东,人人夸他爱劳动。(猜一动物)",
                lines=2)
            with gr.Row():
                clear_btn = gr.ClearButton(value=clear_label[index], components=[text_input])
                submit_btn = gr.Button(value=submit_label[index], variant="primary")

            text_output = gr.Textbox(label=text_output_label[index])

            # cache_examples=True asks Gradio to cache helper_text's output
            # for each example.
            examples = gr.Examples(
                examples=example_list,
                inputs=text_input,
                outputs=text_output,
                fn=helper_text,
                cache_examples=True,
            )
            markdown_msg1 = gr.Markdown(
                """
灯谜是中华文化特色文娱活动,自北宋盛行。每年逢正月十五元宵节,将谜语贴在花灯上,让大家可一起猜谜。

Lantern riddle is a traditional Chinese cultural activity. Being popular since the Song Dynasty (960-1276), it \
is held in the Lantern Festival (15th day of the first lunar month). \
When people are viewing the flower lanterns, they can guess the riddles on the lanterns together.


"""
            )

        with gr.Column():
            # The model credited below must match llm_model_name, the model
            # this app actually loads.
            markdown_msg2 = gr.Markdown(
                """
![lantern](file/data/DSC_0105.jpg)

---
# 声明 Disclaimer

本应用输出的文本为机器基于模型生成的结果,不代表任何人观点,请谨慎辨别和参考。请在法律允许的范围内使用。

本应用调用了 [Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat) 对话语言大模型,\
使用本应用前请务必阅读和同意遵守其[使用授权许可证](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat/blob/main/LICENSE)。

本应用仅供非商业用途。

The outputs of this application are machine-generated with a statistical model. \
The outputs do not reflect any opinions of any human subjects. You must identify the outputs in caution. \
It is your responsibility to decide whether to accept the outputs. You must use the application in obedience to the Law.

This application utilizes [Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat) \
Conversational Large Language Model. Before using this application, you must read and accept to follow \
the [LICENSE](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat/blob/main/LICENSE).

This application is for non-commercial use only.

---

# 感谢 Acknowledgement

本应用调用了 [text2vec-base-chinese](https://huggingface.co/shibing624/text2vec-base-chinese) 生成 text vector embeddings.
该模型是以 [apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) 发行。

This application utilizes [text2vec-base-chinese](https://huggingface.co/shibing624/text2vec-base-chinese) to generate text vector embeddings.
The model is released under [apache-2.0](https://www.apache.org/licenses/LICENSE-2.0).
""")

    # Wire the widgets to their handlers.
    submit_btn.click(fn=helper_text, inputs=[text_input, radio], outputs=text_output)

    clear_btn.click(fn=clear_text, outputs=[text_input, text_output])
    radio.change(fn=change_language,
                 inputs=[radio, text_input, text_output,
                         markdown, markdown_msg1, markdown_msg2],
                 outputs=[text_input, text_output, clear_btn, submit_btn,
                          markdown, markdown_msg1, markdown_msg2])

demo.queue(api_open=False)
demo.launch(show_api=False)
297
+
298
+
299
+ # %%
300
+
301
+
302
+
chroma/28c81268-af8d-4e89-9744-aee894bcbde6/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0067068c3258f065668e2ad17382df3424f1c001807139881c2dc1691772ac7b
3
+ size 28908000
chroma/28c81268-af8d-4e89-9744-aee894bcbde6/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b630aa796b842f9cb3af4a5f7cbc9fd3fc793047cec78a183ac3eaa83d68497
3
+ size 100
chroma/28c81268-af8d-4e89-9744-aee894bcbde6/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1f937c780ce5ab483dc40c3bf8d8408b3fa3d1bf03b5d20488b0d5b8a906fde
3
+ size 520085
chroma/28c81268-af8d-4e89-9744-aee894bcbde6/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a76c43e6fc007afd7f87d90b5a60a482449450491f8b307c5617b83471cf7a7
3
+ size 36000
chroma/28c81268-af8d-4e89-9744-aee894bcbde6/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcde28a793504ace57c738a180d964a0c8ed59bf30194cdde6f16484f267ba06
3
+ size 80540
chroma/c2db2536-5ceb-4ce6-b43c-7b5a8994dc6a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fab3b1442c2e9319c9406cb2db197e9b806a2d4122f8c3575c62d4a0fff1dc5
3
+ size 3212000
chroma/c2db2536-5ceb-4ce6-b43c-7b5a8994dc6a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61f185c42c6b3379360fa19d3ff03ef53f86c938371ab55230c04b3aa051c549
3
+ size 100
chroma/c2db2536-5ceb-4ce6-b43c-7b5a8994dc6a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca24af71d714f0248ef6b2057c614abc943d509f5289bfdaf8d41d3f4d38d22c
3
+ size 55974
chroma/c2db2536-5ceb-4ce6-b43c-7b5a8994dc6a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9814e6d812bbb532273fec5061e2a38f55f8cab0dbe292ff0ba8d230076943ce
3
+ size 4000
chroma/c2db2536-5ceb-4ce6-b43c-7b5a8994dc6a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f5dc8c4171f8df8a126112a3ebe4e91ccb0895344eff083d43ad767f6effc54
3
+ size 8148
data/DSC_0105.jpg ADDED
data/riddles_data ADDED
The diff for this file is too large to render. See raw diff
 
gradio_cached_examples/10/log.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ 谜底,flag,username,timestamp
2
+ 这个谜面描述了一个小家伙穿着黄色的袍子,在花丛中建造房子。这个小家伙是蜜蜂,因为蜜蜂在花朵上采集花蜜,然后将这些花蜜转化为蜂蜜。所以答案是蜜蜂。,,,2024-02-19 01:00:57.388598
3
+ 谜底是日历。因为日历上的日期会随着时间的推移而改变,所以每过一年,就会有一层新的布料覆盖在上面,这就是所谓的“年轮”。因此,当最后一层布料被剥落时,就只剩下了一张没有了皮的日历。,,,2024-02-19 01:01:02.376888
4
+ "这个谜面描述了一个自然现象,即“千条线,万条线,掉到水里看不见”。这个现象通常指的是水流在地面上形成的小水滴或小水泡,这些小水滴或小水泡会随着水流的流动而消失不见。
5
+
6
+ 这个谜底是雨水,因为雨水是由水滴和小水泡组成的,当它们落在地面上时,由于受到重力的作用,就会被冲走。因此,这个谜底就是雨水。",,,2024-02-19 01:01:10.248405
7
+ 这个谜面是通过“无底洞”来比喻一个无法预测或理解的、非常深邃的地方,因此可以推断出成语“深不可测”。,,,2024-02-19 01:01:13.125556