new
- crazy_functions/game_fns/game_ascii_art.py +42 -0
- crazy_functions/game_fns/game_interactive_story.py +212 -0
- crazy_functions/game_fns/game_utils.py +35 -0
- crazy_functions/ipc_fns/mp.py +37 -0
- crazy_functions/pdf_fns/breakdown_txt.py +125 -0
- crazy_functions/vector_fns/__init__.py +0 -0
- crazy_functions/vector_fns/general_file_loader.py +70 -0
- crazy_functions/vector_fns/vector_database.py +338 -0
- crazy_functions/互动小游戏.py +40 -0
- crazy_functions/知识库问答.py +117 -0
- docs/GithubAction+AllCapacityBeta +53 -0
- docs/GithubAction+NoLocal+Vectordb +26 -0
- request_llms/bridge_qwen_local.py +59 -0
- request_llms/com_qwenapi.py +94 -0
- request_llms/requirements_qwen_local.txt +5 -0
- tests/test_vector_plugins.py +17 -0
- themes/cookies.py +0 -0
crazy_functions/game_fns/game_ascii_art.py
ADDED
@@ -0,0 +1,42 @@
from toolbox import CatchException, update_ui, update_ui_lastest_msg
from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing
import random


class MiniGame_ASCII_Art(GptAcademicGameBaseState):
    def step(self, prompt, chatbot, history):
        if self.step_cnt == 0:
            chatbot.append(["我画你猜(动物)", "请稍等..."])
        else:
            if prompt.strip() == 'exit':
                self.delete_game = True
                yield from update_ui_lastest_msg(lastmsg=f"谜底是{self.obj},游戏结束。", chatbot=chatbot, history=history, delay=0.)
                return
            chatbot.append([prompt, ""])
        yield from update_ui(chatbot=chatbot, history=history)

        if self.step_cnt == 0:
            self.lock_plugin(chatbot)
            self.cur_task = 'draw'

        if self.cur_task == 'draw':
            avail_obj = ["狗", "猫", "鸟", "鱼", "老鼠", "蛇"]
            self.obj = random.choice(avail_obj)
            inputs = "I want to play a game called Guess the ASCII art. You can draw the ASCII art and I will try to guess it. " + \
                     f"This time you draw a {self.obj}. Note that you must not indicate what you have drawn in the text, and you should only produce the ASCII art wrapped by ```. "
            raw_res = predict_no_ui_long_connection(inputs=inputs, llm_kwargs=self.llm_kwargs, history=[], sys_prompt="")
            self.cur_task = 'identify user guess'
            res = get_code_block(raw_res)
            history += ['', f'the answer is {self.obj}', inputs, res]
            yield from update_ui_lastest_msg(lastmsg=res, chatbot=chatbot, history=history, delay=0.)

        elif self.cur_task == 'identify user guess':
            if is_same_thing(self.obj, prompt, self.llm_kwargs):
                self.delete_game = True
                yield from update_ui_lastest_msg(lastmsg="你猜对了!", chatbot=chatbot, history=history, delay=0.)
            else:
                self.cur_task = 'identify user guess'
                yield from update_ui_lastest_msg(lastmsg="猜错了,再试试,输入“exit”获取答案。", chatbot=chatbot, history=history, delay=0.)
crazy_functions/game_fns/game_interactive_story.py
ADDED
@@ -0,0 +1,212 @@
prompts_hs = """ 请以“{headstart}”为开头,编写一个小说的第一幕。

- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。
- 出现人物时,给出人物的名字。
- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
- 字数要求:第一幕的字数少于300字,且少于2个段落。
"""

prompts_interact = """ 小说的前文回顾:
「
{previously_on_story}
」

你是一个作家,根据以上的情节,给出4种不同的后续剧情发展方向,每个发展方向都简明扼要地用一句话说明。稍后,我将在这4个选择中,挑选一种剧情发展。

输出格式例如:
1. 后续剧情发展1
2. 后续剧情发展2
3. 后续剧情发展3
4. 后续剧情发展4
"""


prompts_resume = """小说的前文回顾:
「
{previously_on_story}
」

你是一个作家,我们正在互相讨论,确定后续剧情的发展。
在以下的剧情发展中,
「
{choice}
」
我认为更合理的是:{user_choice}。
请在前文的基础上(不要重复前文),围绕我选定的剧情情节,编写小说的下一幕。

- 禁止杜撰不符合我选择的剧情。
- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。
- 不要重复前文。
- 出现人物时,给出人物的名字。
- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
- 小说的下一幕字数少于300字,且少于2个段落。
"""


prompts_terminate = """小说的前文回顾:
「
{previously_on_story}
」

你是一个作家,我们正在互相讨论,确定后续剧情的发展。
现在,故事该结束了,我认为最合理的故事结局是:{user_choice}。

请在前文的基础上(不要重复前文),编写小说的最后一幕。

- 不要重复前文。
- 出现人物时,给出人物的名字。
- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
- 字数要求:最后一幕的字数少于1000字。
"""


from toolbox import CatchException, update_ui, update_ui_lastest_msg
from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing
import random


class MiniGame_ResumeStory(GptAcademicGameBaseState):
    story_headstart = [
        '先行者知道,他现在是全宇宙中唯一的一个人了。',
        '深夜,一个年轻人穿过天安门广场向纪念堂走去。在二十二世纪编年史中,计算机把他的代号定为M102。',
        '他知道,这最后一课要提前讲了。又一阵剧痛从肝部袭来,几乎使他晕厥过去。',
        '在距地球五万光年的远方,在银河系的中心,一场延续了两万年的星际战争已接近尾声。那里的太空中渐渐隐现出一个方形区域,仿佛灿烂的群星的背景被剪出一个方口。',
        '伊依一行三人乘坐一艘游艇在南太平洋上做吟诗航行,他们的目的地是南极,如果几天后能顺利到达那里,他们将钻出地壳去看诗云。',
        '很多人生来就会莫名其妙地迷上一样东西,仿佛他的出生就是要和这东西约会似的,正是这样,圆圆迷上了肥皂泡。'
    ]


    def begin_game_step_0(self, prompt, chatbot, history):
        # init game at step 0
        self.headstart = random.choice(self.story_headstart)
        self.story = []
        chatbot.append(["互动写故事", f"这次的故事开头是:{self.headstart}"])
        self.sys_prompt_ = '你是一个想象力丰富的杰出作家。正在与你的朋友互动,一起写故事,因此你每次写的故事段落应少于300字(结局除外)。'


    def generate_story_image(self, story_paragraph):
        try:
            from crazy_functions.图片生成 import gen_image
            prompt_ = predict_no_ui_long_connection(inputs=story_paragraph, llm_kwargs=self.llm_kwargs, history=[], sys_prompt='你需要根据用户给出的小说段落,进行简短的环境描写。要求:80字以内。')
            image_url, image_path = gen_image(self.llm_kwargs, prompt_, '512x512', model="dall-e-2", quality='standard', style='natural')
            return f'<br/><div align="center"><img src="file={image_path}"></div>'
        except Exception:
            return ''

    def step(self, prompt, chatbot, history):

        """
        First, handle game initialization and other special cases
        """
        if self.step_cnt == 0:
            self.begin_game_step_0(prompt, chatbot, history)
            self.lock_plugin(chatbot)
            self.cur_task = 'head_start'
        else:
            if prompt.strip() == 'exit' or prompt.strip() == '结束剧情':
                # should we terminate the game here?
                self.delete_game = True
                yield from update_ui_lastest_msg(lastmsg="游戏结束。", chatbot=chatbot, history=history, delay=0.)
                return
            if '剧情收尾' in prompt:
                self.cur_task = 'story_terminate'
            # # well, game resumes
            # chatbot.append([prompt, ""])
            # update ui, don't keep the user waiting
            yield from update_ui(chatbot=chatbot, history=history)


        """
        Main game logic
        """
        if self.cur_task == 'head_start':
            """
            This is the first step of the game: write the opening scene
            """
            inputs_ = prompts_hs.format(headstart=self.headstart)
            history_ = []
            story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs_, '故事开头', self.llm_kwargs,
                chatbot, history_, self.sys_prompt_
            )
            self.story.append(story_paragraph)
            # # illustration
            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>' + self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)

            # # build the guidance for the next plot choices
            previously_on_story = ""
            for s in self.story:
                previously_on_story += s + '\n'
            inputs_ = prompts_interact.format(previously_on_story=previously_on_story)
            history_ = []
            self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs_, '请在以下几种故事走向中,选择一种(当然,您也可以选择给出其他故事走向):', self.llm_kwargs,
                chatbot,
                history_,
                self.sys_prompt_
            )
            self.cur_task = 'user_choice'


        elif self.cur_task == 'user_choice':
            """
            Decide the next scene of the story according to the user's choice
            """
            if '请在以下几种故事走向中,选择一种' in chatbot[-1][0]: chatbot.pop(-1)
            previously_on_story = ""
            for s in self.story:
                previously_on_story += s + '\n'
            inputs_ = prompts_resume.format(previously_on_story=previously_on_story, choice=self.next_choices, user_choice=prompt)
            history_ = []
            story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs_, f'下一段故事(您的选择是:{prompt})。', self.llm_kwargs,
                chatbot, history_, self.sys_prompt_
            )
            self.story.append(story_paragraph)
            # # illustration
            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>' + self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)

            # # build the guidance for the next plot choices
            previously_on_story = ""
            for s in self.story:
                previously_on_story += s + '\n'
            inputs_ = prompts_interact.format(previously_on_story=previously_on_story)
            history_ = []
            self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs_,
                '请在以下几种故事走向中,选择一种。当然,您也可以给出您心中的其他故事走向。另外,如果您希望剧情立即收尾,请输入剧情走向,并以“剧情收尾”四个字提示程序。', self.llm_kwargs,
                chatbot,
                history_,
                self.sys_prompt_
            )
            self.cur_task = 'user_choice'


        elif self.cur_task == 'story_terminate':
            """
            Wrap up the story according to the user's choice
            """
            previously_on_story = ""
            for s in self.story:
                previously_on_story += s + '\n'
            inputs_ = prompts_terminate.format(previously_on_story=previously_on_story, user_choice=prompt)
            history_ = []
            story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs_, f'故事收尾(您的选择是:{prompt})。', self.llm_kwargs,
                chatbot, history_, self.sys_prompt_
            )
            # # illustration
            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>' + self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)

            # terminate game
            self.delete_game = True
            return
crazy_functions/game_fns/game_utils.py
ADDED
@@ -0,0 +1,35 @@
from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError
from request_llms.bridge_all import predict_no_ui_long_connection

def get_code_block(reply):
    import re
    pattern = r"```([\s\S]*?)```"  # regex pattern to match code blocks
    matches = re.findall(pattern, reply)  # find all code blocks in text
    if len(matches) == 1:
        return "```" + matches[0] + "```"  # code block
    raise RuntimeError("GPT is not generating proper code.")

def is_same_thing(a, b, llm_kwargs):
    from pydantic import BaseModel, Field
    class IsSameThing(BaseModel):
        is_same_thing: bool = Field(description="determine whether two objects are same thing.", default=False)

    def run_gpt_fn(inputs, sys_prompt, history=[]):
        return predict_no_ui_long_connection(
            inputs=inputs, llm_kwargs=llm_kwargs,
            history=history, sys_prompt=sys_prompt, observe_window=[]
        )

    gpt_json_io = GptJsonIO(IsSameThing)
    inputs_01 = "Identify whether the user input and the target is the same thing: \n target object: {a} \n user input object: {b} \n\n\n".format(a=a, b=b)
    inputs_01 += "\n\n\n Note that the user may describe the target object with a different language, e.g. cat and 猫 are the same thing."
    analyze_res_cot_01 = run_gpt_fn(inputs_01, "", [])

    inputs_02 = inputs_01 + gpt_json_io.format_instructions
    analyze_res = run_gpt_fn(inputs_02, "", [inputs_01, analyze_res_cot_01])

    try:
        res = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn)
        return res.is_same_thing
    except JsonStringError:
        return False
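
A minimal usage sketch of the two helpers above: `get_code_block` can be exercised offline, while `is_same_thing` needs a live LLM, so that call is left commented out; `llm_kwargs` is a stand-in for the dict the gpt_academic framework normally supplies.

# Illustrative sketch only (assumes a gpt_academic checkout on sys.path).
from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing

reply = "Here is my drawing:\n```\n /\\_/\\\n( o.o )\n```\nGood luck!"
art = get_code_block(reply)  # returns the single ```-wrapped block, raises RuntimeError otherwise
print(art)
# is_same_thing("猫", "cat", llm_kwargs)  # cross-lingual match; makes two LLM calls, then parses JSON
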
crazy_functions/ipc_fns/mp.py
ADDED
@@ -0,0 +1,37 @@
import platform
import pickle
import multiprocessing

def run_in_subprocess_wrapper_func(v_args):
    func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args)
    import sys
    try:
        result = func(*args, **kwargs)
        return_dict['result'] = result
    except Exception:
        exc_info = sys.exc_info()
        exception_dict['exception'] = exc_info

def run_in_subprocess_with_timeout(func, timeout=60):
    if platform.system() == 'Linux':
        def wrapper(*args, **kwargs):
            return_dict = multiprocessing.Manager().dict()
            exception_dict = multiprocessing.Manager().dict()
            v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict))
            process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,))
            process.start()
            process.join(timeout)
            if process.is_alive():
                process.terminate()
                raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务')
            process.close()
            if 'exception' in exception_dict:
                # ooops, the subprocess ran into an exception
                exc_info = exception_dict['exception']
                raise exc_info[1].with_traceback(exc_info[2])
            if 'result' in return_dict.keys():
                # if the subprocess ran successfully, return the result
                return return_dict['result']
        return wrapper
    else:
        return func
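
A quick sketch of how this wrapper is used (breakdown_txt.py below wraps its splitter the same way): wrap a long-running, picklable function and calls are killed after the timeout on Linux; on other platforms the function is returned unchanged. The toy function here is illustrative only.

# Illustrative sketch only: guard a slow function with a 5-second budget.
from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout

def slow_sum(n):  # must be picklable, i.e. defined at module level
    return sum(range(n))

slow_sum_guarded = run_in_subprocess_with_timeout(slow_sum, timeout=5)

if __name__ == '__main__':
    print(slow_sum_guarded(10_000_000))  # returns normally well within the budget
    # a call exceeding 5 seconds raises TimeoutError instead of hanging the UI
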
crazy_functions/pdf_fns/breakdown_txt.py
ADDED
@@ -0,0 +1,125 @@
from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout

def force_breakdown(txt, limit, get_token_fn):
    """ When the text cannot be split by punctuation or blank lines, fall back to brute-force cutting.
    """
    for i in reversed(range(len(txt))):
        if get_token_fn(txt[:i]) < limit:
            return txt[:i], txt[i:]
    return "Tiktoken未知错误", "Tiktoken未知错误"


def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
    """ To speed up the computation, we use a special trick: when remain_txt_to_cut grows beyond `_max`,
    the text after `_max` is moved into remain_txt_to_cut_storage;
    when remain_txt_to_cut shrinks below `_min`, part of the text is taken back out of remain_txt_to_cut_storage.
    """
    _min = int(5e4)
    _max = int(1e5)
    # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
    if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
        remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
        remain_txt_to_cut_storage = ""
    if len(remain_txt_to_cut) > _max:
        remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
        remain_txt_to_cut = remain_txt_to_cut[:_max]
    return remain_txt_to_cut, remain_txt_to_cut_storage


def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
    """ Split the text into fragments.
    """
    res = []
    total_len = len(txt_tocut)
    fin_len = 0
    remain_txt_to_cut = txt_tocut
    remain_txt_to_cut_storage = ""
    # to speed up the computation, move everything beyond `_max` into temporary storage
    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)

    while True:
        if get_token_fn(remain_txt_to_cut) <= limit:
            # the remaining text is within the token limit, no further cutting needed
            res.append(remain_txt_to_cut); fin_len += len(remain_txt_to_cut)
            break
        else:
            # the remaining text exceeds the token limit, keep cutting
            lines = remain_txt_to_cut.split('\n')

            # estimate a cut position
            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
            estimated_line_cut = int(estimated_line_cut)

            # search backwards from the estimate for a suitable cut position (cnt)
            cnt = 0
            for cnt in reversed(range(estimated_line_cut)):
                if must_break_at_empty_line:
                    # first try to cut at a blank line (\n\n)
                    if lines[cnt] != "":
                        continue
                prev = "\n".join(lines[:cnt])
                post = "\n".join(lines[cnt:])
                if get_token_fn(prev) < limit:
                    break

            if cnt == 0:
                # no suitable cut position was found
                if break_anyway:
                    # brute-force cutting is allowed
                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
                else:
                    # brute-force cutting is not allowed, raise an error
                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")

            # append to the result list
            res.append(prev); fin_len += len(prev)
            # prepare for the next iteration
            remain_txt_to_cut = post
            remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
            process = fin_len / total_len
            print(f'正在文本切分 {int(process*100)}%')
            if len(remain_txt_to_cut.strip()) == 0:
                break
    return res


def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
    """ Try several strategies in turn to split the text so that each fragment satisfies the token limit.
    """
    from request_llms.bridge_all import model_info
    enc = model_info[llm_model]['tokenizer']
    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
    try:
        # attempt 1: cut at blank lines (\n\n)
        return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
    except RuntimeError:
        try:
            # attempt 2: cut at newlines (\n)
            return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
        except RuntimeError:
            try:
                # attempt 3: cut at English periods (.)
                res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False)  # the Chinese period here is intentional, it serves as a marker
                return [r.replace('。\n', '.') for r in res]
            except RuntimeError:
                try:
                    # attempt 4: cut at Chinese periods (。)
                    res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
                    return [r.replace('。。\n', '。') for r in res]
                except RuntimeError:
                    # attempt 5: nothing else works, cut anywhere
                    return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)

breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)

if __name__ == '__main__':
    from crazy_functions.crazy_utils import read_and_clean_pdf_text
    file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")

    from request_llms.bridge_all import model_info
    for i in range(5):
        file_content += file_content

    print(len(file_content))
    TOKEN_LIMIT_PER_FRAGMENT = 2500
    res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)
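
The splitting strategy can also be observed in isolation by driving `cut` with any token-counting function; the character-count stand-in below avoids loading a real tokenizer and is illustrative only.

# Illustrative sketch only: 1 token per character instead of the tiktoken-based counter.
from crazy_functions.pdf_fns.breakdown_txt import cut

text = ("第一段。" * 30 + "\n\n") * 4          # four paragraphs separated by blank lines
fragments = cut(limit=150, get_token_fn=len, txt_tocut=text, must_break_at_empty_line=True)
assert all(len(f) <= 150 for f in fragments)  # every fragment respects the limit
print(len(fragments), "fragments")
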
crazy_functions/vector_fns/__init__.py
ADDED
File without changes
crazy_functions/vector_fns/general_file_loader.py
ADDED
@@ -0,0 +1,70 @@
# From project chatglm-langchain


from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List

class ChineseTextSplitter(CharacterTextSplitter):
    def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text1(self, text: str) -> List[str]:
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub(r'\s', ' ', text)
            text = text.replace("\n\n", "")
        sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def split_text(self, text: str) -> List[str]:  # this logic could be refined further
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub(r'\s', " ", text)
            text = re.sub("\n\n", "", text)

        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # single-character sentence terminators
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # English ellipsis
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # Chinese ellipsis
        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
        # a closing quote only ends a sentence when a terminator precedes it, so the \n goes after the quote; the rules above carefully preserve the quotes
        text = text.rstrip()  # drop any trailing \n at the end of the paragraph
        # many rule sets also handle semicolons, dashes and English double quotes; they are deliberately ignored here and can be added with small tweaks if needed
        ls = [i for i in text.split("\n") if i]
        for ele in ls:
            if len(ele) > self.sentence_size:
                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
                ele1_ls = ele1.split("\n")
                for ele_ele1 in ele1_ls:
                    if len(ele_ele1) > self.sentence_size:
                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
                        ele2_ls = ele_ele2.split("\n")
                        for ele_ele2 in ele2_ls:
                            if len(ele_ele2) > self.sentence_size:
                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                                ele2_id = ele2_ls.index(ele_ele2)
                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
                                    ele2_id + 1:]
                        ele_id = ele1_ls.index(ele_ele1)
                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]

                id = ls.index(ele)
                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
        return ls

def load_file(filepath, sentence_size):
    loader = UnstructuredFileLoader(filepath, mode="elements")
    textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
    docs = loader.load_and_split(text_splitter=textsplitter)
    # write_check_file(filepath, docs)
    return docs
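
The splitter can be exercised directly on a raw string, without the unstructured loader (illustrative sketch; requires langchain to be installed):

# Illustrative sketch only: split a short Chinese passage into sentences.
from crazy_functions.vector_fns.general_file_loader import ChineseTextSplitter

splitter = ChineseTextSplitter(pdf=False, sentence_size=100)
for sentence in splitter.split_text("你好!这是第一句。这是第二句,它稍微长一点。"):
    print(sentence)
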
crazy_functions/vector_fns/vector_database.py
ADDED
@@ -0,0 +1,338 @@
# From project chatglm-langchain

import threading
from toolbox import Singleton
import os
import shutil
import uuid
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from typing import List, Tuple
import numpy as np
from crazy_functions.vector_fns.general_file_loader import load_file

embedding_model_dict = {
    "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
    "ernie-base": "nghuyong/ernie-3.0-base-zh",
    "text2vec-base": "shibing624/text2vec-base-chinese",
    "text2vec": "GanymedeNil/text2vec-large-chinese",
}

# Embedding model name
EMBEDDING_MODEL = "text2vec"

# Embedding running device
EMBEDDING_DEVICE = "cpu"

# context-based prompt template; "{question}" and "{context}" must be kept
PROMPT_TEMPLATE = """已知信息:
{context}

根据上述已知信息,简洁和专业的来回答用户的问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”,不允许在答案中添加编造成分,答案请使用中文。 问题是:{question}"""

# sentence length used when splitting text
SENTENCE_SIZE = 100

# context length of a single matched chunk
CHUNK_SIZE = 250

# LLM input history length
LLM_HISTORY_LEN = 3

# return top-k text chunks from the vector store
VECTOR_SEARCH_TOP_K = 5

# relevance score threshold for knowledge retrieval; the value range is roughly 0-1100; 0 disables it; in testing, values below 500 gave more precise matches
VECTOR_SEARCH_SCORE_THRESHOLD = 0

NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")

FLAG_USER_NAME = uuid.uuid4().hex

# whether to enable cross-origin requests; defaults to False, set to True to enable
# is open cross domain
OPEN_CROSS_DOMAIN = False

def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4
) -> List[Tuple[Document, float]]:

    def seperate_list(ls: List[int]) -> List[List[int]]:
        lists = []
        ls1 = [ls[0]]
        for i in range(1, len(ls)):
            if ls[i - 1] + 1 == ls[i]:
                ls1.append(ls[i])
            else:
                lists.append(ls1)
                ls1 = [ls[i]]
        lists.append(ls1)
        return lists

    scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
    docs = []
    id_set = set()
    store_len = len(self.index_to_docstore_id)
    for j, i in enumerate(indices[0]):
        if i == -1 or 0 < self.score_threshold < scores[0][j]:
            # This happens when not enough docs are returned.
            continue
        _id = self.index_to_docstore_id[i]
        doc = self.docstore.search(_id)
        if not self.chunk_conent:
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            doc.metadata["score"] = int(scores[0][j])
            docs.append(doc)
            continue
        id_set.add(i)
        docs_len = len(doc.page_content)
        for k in range(1, max(i, store_len - i)):
            break_flag = False
            for l in [i + k, i - k]:
                if 0 <= l < len(self.index_to_docstore_id):
                    _id0 = self.index_to_docstore_id[l]
                    doc0 = self.docstore.search(_id0)
                    if docs_len + len(doc0.page_content) > self.chunk_size:
                        break_flag = True
                        break
                    elif doc0.metadata["source"] == doc.metadata["source"]:
                        docs_len += len(doc0.page_content)
                        id_set.add(l)
            if break_flag:
                break
    if not self.chunk_conent:
        return docs
    if len(id_set) == 0 and self.score_threshold > 0:
        return []
    id_list = sorted(list(id_set))
    id_lists = seperate_list(id_list)
    for id_seq in id_lists:
        for id in id_seq:
            if id == id_seq[0]:
                _id = self.index_to_docstore_id[id]
                doc = self.docstore.search(_id)
            else:
                _id0 = self.index_to_docstore_id[id]
                doc0 = self.docstore.search(_id0)
                doc.page_content += " " + doc0.page_content
        if not isinstance(doc, Document):
            raise ValueError(f"Could not find document for id {_id}, got {doc}")
        doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]])
        doc.metadata["score"] = int(doc_score)
        docs.append(doc)
    return docs


class LocalDocQA:
    llm: object = None
    embeddings: object = None
    top_k: int = VECTOR_SEARCH_TOP_K
    chunk_size: int = CHUNK_SIZE
    chunk_conent: bool = True
    score_threshold: int = VECTOR_SEARCH_SCORE_THRESHOLD

    def init_cfg(self,
                 top_k=VECTOR_SEARCH_TOP_K,
                 ):

        self.llm = None
        self.top_k = top_k

    def init_knowledge_vector_store(self,
                                    filepath,
                                    vs_path: str or os.PathLike = None,
                                    sentence_size=SENTENCE_SIZE,
                                    text2vec=None):
        loaded_files = []
        failed_files = []
        if isinstance(filepath, str):
            if not os.path.exists(filepath):
                print("路径不存在")
                return None
            elif os.path.isfile(filepath):
                file = os.path.split(filepath)[-1]
                try:
                    docs = load_file(filepath, SENTENCE_SIZE)
                    print(f"{file} 已成功加载")
                    loaded_files.append(filepath)
                except Exception as e:
                    print(e)
                    print(f"{file} 未能成功加载")
                    return None
            elif os.path.isdir(filepath):
                docs = []
                for file in tqdm(os.listdir(filepath), desc="加载文件"):
                    fullfilepath = os.path.join(filepath, file)
                    try:
                        docs += load_file(fullfilepath, SENTENCE_SIZE)
                        loaded_files.append(fullfilepath)
                    except Exception as e:
                        print(e)
                        failed_files.append(file)

                if len(failed_files) > 0:
                    print("以下文件未能成功加载:")
                    for file in failed_files:
                        print(f"{file}\n")

        else:
            docs = []
            for file in filepath:
                docs += load_file(file, SENTENCE_SIZE)
                print(f"{file} 已成功加载")
                loaded_files.append(file)

        if len(docs) > 0:
            print("文件加载完毕,正在生成向量库")
            if vs_path and os.path.isdir(vs_path):
                try:
                    self.vector_store = FAISS.load_local(vs_path, text2vec)
                    self.vector_store.add_documents(docs)
                except Exception:
                    self.vector_store = FAISS.from_documents(docs, text2vec)
            else:
                self.vector_store = FAISS.from_documents(docs, text2vec)  # docs is a list of Document

            self.vector_store.save_local(vs_path)
            return vs_path, loaded_files
        else:
            raise RuntimeError("文件加载失败,请检查文件格式是否正确")

    def get_loaded_file(self, vs_path):
        ds = self.vector_store.docstore
        return set([ds._dict[k].metadata['source'].split(vs_path)[-1] for k in ds._dict])


    # query                the query text
    # vs_path              path of the knowledge base
    # chunk_conent         whether to enable context linking
    # score_threshold      score threshold for search matching
    # vector_search_top_k  number of chunks to retrieve from the knowledge base, 5 by default
    # chunk_size           context length linked to a single matched chunk
    def get_knowledge_based_conent_test(self, query, vs_path, chunk_conent,
                                        score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
                                        vector_search_top_k=VECTOR_SEARCH_TOP_K, chunk_size=CHUNK_SIZE,
                                        text2vec=None):
        self.vector_store = FAISS.load_local(vs_path, text2vec)
        self.vector_store.chunk_conent = chunk_conent
        self.vector_store.score_threshold = score_threshold
        self.vector_store.chunk_size = chunk_size

        embedding = self.vector_store.embedding_function.embed_query(query)
        related_docs_with_score = similarity_search_with_score_by_vector(self.vector_store, embedding, k=vector_search_top_k)

        if not related_docs_with_score:
            response = {"query": query,
                        "source_documents": []}
            return response, ""
        # prompt = f"{query}. You should answer this question using information from following documents: \n\n"
        prompt = f"{query}. 你必须利用以下文档中包含的信息回答这个问题: \n\n---\n\n"
        prompt += "\n\n".join([f"({k}): " + doc.page_content for k, doc in enumerate(related_docs_with_score)])
        prompt += "\n\n---\n\n"
        prompt = prompt.encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
        # print(prompt)
        response = {"query": query, "source_documents": related_docs_with_score}
        return response, prompt




def construct_vector_store(vs_id, vs_path, files, sentence_size, history, one_conent, one_content_segmentation, text2vec):
    for file in files:
        assert os.path.exists(file), "输入文件不存在:" + file
    import nltk
    if NLTK_DATA_PATH not in nltk.data.path: nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
    local_doc_qa = LocalDocQA()
    local_doc_qa.init_cfg()
    filelist = []
    if not os.path.exists(os.path.join(vs_path, vs_id)):
        os.makedirs(os.path.join(vs_path, vs_id))
    for file in files:
        file_name = file.name if not isinstance(file, str) else file
        filename = os.path.split(file_name)[-1]
        shutil.copyfile(file_name, os.path.join(vs_path, vs_id, filename))
        filelist.append(os.path.join(vs_path, vs_id, filename))
    vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(filelist, os.path.join(vs_path, vs_id), sentence_size, text2vec)

    if len(loaded_files):
        file_status = f"已添加 {'、'.join([os.path.split(i)[-1] for i in loaded_files if i])} 内容至知识库,并已加载知识库,请开始提问"
    else:
        pass
        # file_status = "文件未成功加载,请重新上传文件"
    # print(file_status)
    return local_doc_qa, vs_path

@Singleton
class knowledge_archive_interface():
    def __init__(self) -> None:
        self.threadLock = threading.Lock()
        self.current_id = ""
        self.kai_path = None
        self.qa_handle = None
        self.text2vec_large_chinese = None

    def get_chinese_text2vec(self):
        if self.text2vec_large_chinese is None:
            # < ------------------- warm up the text vectorization module --------------- >
            from toolbox import ProxyNetworkActivate
            print('Checking Text2vec ...')
            from langchain.embeddings.huggingface import HuggingFaceEmbeddings
            with ProxyNetworkActivate('Download_LLM'):  # temporarily activate the proxy network
                self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")

        return self.text2vec_large_chinese


    def feed_archive(self, file_manifest, vs_path, id="default"):
        self.threadLock.acquire()
        # import uuid
        self.current_id = id
        self.qa_handle, self.kai_path = construct_vector_store(
            vs_id=self.current_id,
            vs_path=vs_path,
            files=file_manifest,
            sentence_size=100,
            history=[],
            one_conent="",
            one_content_segmentation="",
            text2vec=self.get_chinese_text2vec(),
        )
        self.threadLock.release()

    def get_current_archive_id(self):
        return self.current_id

    def get_loaded_file(self, vs_path):
        return self.qa_handle.get_loaded_file(vs_path)

    def answer_with_archive_by_id(self, txt, id, vs_path):
        self.threadLock.acquire()
        if not self.current_id == id:
            self.current_id = id
            self.qa_handle, self.kai_path = construct_vector_store(
                vs_id=self.current_id,
                vs_path=vs_path,
                files=[],
                sentence_size=100,
                history=[],
                one_conent="",
                one_content_segmentation="",
                text2vec=self.get_chinese_text2vec(),
            )
        VECTOR_SEARCH_SCORE_THRESHOLD = 0
        VECTOR_SEARCH_TOP_K = 4
        CHUNK_SIZE = 512
        resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
            query=txt,
            vs_path=self.kai_path,
            score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
            vector_search_top_k=VECTOR_SEARCH_TOP_K,
            chunk_conent=True,
            chunk_size=CHUNK_SIZE,
            text2vec=self.get_chinese_text2vec(),
        )
        self.threadLock.release()
        return resp, prompt
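
End to end, the singleton is used as sketched below; 知识库问答.py later in this commit wires the same two calls into the UI. The store path and file list are placeholders, and the optional vector-db dependencies must be installed.

# Illustrative sketch only: build a knowledge archive, then query it.
from crazy_functions.vector_fns.vector_database import knowledge_archive_interface

kai = knowledge_archive_interface()
kai.feed_archive(file_manifest=["./README.md"], vs_path="./my_vs_store", id="default")
resp, prompt = kai.answer_with_archive_by_id("如何安装?", "default", "./my_vs_store")
print(prompt)  # the retrieval-augmented prompt that would be sent to the LLM
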
crazy_functions/互动小游戏.py
ADDED
@@ -0,0 +1,40 @@
from toolbox import CatchException, update_ui, update_ui_lastest_msg
from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing

@CatchException
def 随机小游戏(prompt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    from crazy_functions.game_fns.game_interactive_story import MiniGame_ResumeStory
    # clear the history
    history = []
    # pick the game
    cls = MiniGame_ResumeStory
    # if a game instance was initialized before, resume it; otherwise initialize a new one
    state = cls.sync_state(chatbot,
                           llm_kwargs,
                           cls,
                           plugin_name='MiniGame_ResumeStory',
                           callback_fn='crazy_functions.互动小游戏->随机小游戏',
                           lock_plugin=True
                           )
    yield from state.continue_game(prompt, chatbot, history)


@CatchException
def 随机小游戏1(prompt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    from crazy_functions.game_fns.game_ascii_art import MiniGame_ASCII_Art
    # clear the history
    history = []
    # pick the game
    cls = MiniGame_ASCII_Art
    # if a game instance was initialized before, resume it; otherwise initialize a new one
    state = cls.sync_state(chatbot,
                           llm_kwargs,
                           cls,
                           plugin_name='MiniGame_ASCII_Art',
                           callback_fn='crazy_functions.互动小游戏->随机小游戏1',
                           lock_plugin=True
                           )
    yield from state.continue_game(prompt, chatbot, history)
crazy_functions/知识库问答.py
ADDED
@@ -0,0 +1,117 @@
from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg, get_log_folder, get_user
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything

install_msg = """

1. python -m pip install torch --index-url https://download.pytorch.org/whl/cpu

2. python -m pip install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade

3. python -m pip install unstructured[all-docs] --upgrade

4. python -c 'import nltk; nltk.download("punkt")'
"""

@CatchException
def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """
    txt             text entered by the user in the input field, e.g. a passage to translate, or a path containing files to process
    llm_kwargs      GPT model parameters such as temperature and top_p, usually passed through as-is
    plugin_kwargs   plugin parameters, currently unused
    chatbot         handle of the chat display box, used to show output to the user
    history         chat history (context)
    system_prompt   silent system prompt for GPT
    web_port        port on which the software is currently running
    """
    history = []  # clear the history to avoid input overflow

    # < -------------------- read parameters --------------- >
    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
    kai_id = plugin_kwargs.get("advanced_arg", 'default')

    chatbot.append((f"向`{kai_id}`知识库中添加文件。", "[Local Message] 从一批文件(txt, md, tex)中读取数据构建知识库, 然后进行问答。"))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # resolve deps
    try:
        # from zh_langchain import construct_vector_store
        # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
        from crazy_functions.vector_fns.vector_database import knowledge_archive_interface
    except Exception as e:
        chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        # from .crazy_utils import try_install_deps
        # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
        # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
        return

    # < -------------------- read files --------------- >
    file_manifest = []
    spl = ["txt", "doc", "docx", "email", "epub", "html", "json", "md", "msg", "pdf", "ppt", "pptx", "rtf"]
    for sp in spl:
        _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}')
        file_manifest += file_manifest_tmp

    if len(file_manifest) == 0:
        chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"])
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # < ------------------- warm up the text vectorization module --------------- >
    chatbot.append(['<br/>'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
    print('Checking Text2vec ...')
    from langchain.embeddings.huggingface import HuggingFaceEmbeddings
    with ProxyNetworkActivate('Download_LLM'):  # temporarily activate the proxy network
        HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")

    # < ------------------- build the knowledge base --------------- >
    chatbot.append(['<br/>'.join(file_manifest), "正在构建知识库..."])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
    print('Establishing knowledge archive ...')
    with ProxyNetworkActivate('Download_LLM'):  # temporarily activate the proxy network
        kai = knowledge_archive_interface()
        vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store')
        kai.feed_archive(file_manifest=file_manifest, vs_path=vs_path, id=kai_id)
    kai_files = kai.get_loaded_file(vs_path=vs_path)
    kai_files = '<br/>'.join(kai_files)
    # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"])
    # yield from update_ui(chatbot=chatbot, history=history)
    # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id()
    # chatbot._cookies['lock_plugin'] = 'crazy_functions.知识库文件注入->读取知识库作答'
    # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"])
    chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI early, since the GPT request takes a while

@CatchException
def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
    # resolve deps
    try:
        # from zh_langchain import construct_vector_store
        # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
        from crazy_functions.vector_fns.vector_database import knowledge_archive_interface
    except Exception as e:
        chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        # from .crazy_utils import try_install_deps
        # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
        # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
        return

    # < ------------------- access the knowledge base --------------- >
    kai = knowledge_archive_interface()

    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
    kai_id = plugin_kwargs.get("advanced_arg", 'default')
    vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store')
    resp, prompt = kai.answer_with_archive_by_id(txt, kai_id, vs_path)

    chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI early, since the GPT request takes a while
    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
        inputs=prompt, inputs_show_user=txt,
        llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
        sys_prompt=system_prompt
    )
    history.extend((prompt, gpt_say))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
docs/GithubAction+AllCapacityBeta
ADDED
@@ -0,0 +1,53 @@
# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacity --network=host --build-arg http_proxy=http://localhost:10881 --build-arg https_proxy=http://localhost:10881 .
# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacityBeta --network=host .
# docker run -it --net=host gpt-academic-all-capacity bash

# Build from an NVIDIA base image for GPU support (the CUDA version shown by the host's nvidia-smi must be >= 11.3)
FROM fuqingxu/11.3.1-runtime-ubuntu20.04-with-texlive:latest

# use python3 as the system default python
WORKDIR /gpt
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8

# # optional step: switch the pip mirror (the following three lines can be removed)
# RUN echo '[global]' > /etc/pip.conf && \
#     echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \
#     echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf

# install pytorch
RUN python3 -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113
# install pip dependencies
RUN python3 -m pip install openai numpy arxiv rich
RUN python3 -m pip install colorama Markdown pygments pymupdf
RUN python3 -m pip install python-docx moviepy pdfminer
RUN python3 -m pip install zh_langchain==0.2.1 pypinyin
RUN python3 -m pip install rarfile py7zr
RUN python3 -m pip install aliyun-python-sdk-core==2.13.3 pyOpenSSL webrtcvad scipy git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git
# clone the repository
WORKDIR /gpt
RUN git clone --depth=1 https://github.com/binary-husky/gpt_academic.git
WORKDIR /gpt/gpt_academic
RUN git clone --depth=1 https://github.com/OpenLMLab/MOSS.git request_llms/moss

RUN python3 -m pip install -r requirements.txt
RUN python3 -m pip install -r request_llms/requirements_moss.txt
RUN python3 -m pip install -r request_llms/requirements_qwen.txt
RUN python3 -m pip install -r request_llms/requirements_chatglm.txt
RUN python3 -m pip install -r request_llms/requirements_newbing.txt
RUN python3 -m pip install nougat-ocr

# warm up the Tiktoken module
RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'

# install the extra dependencies of the knowledge-base plugin
RUN apt-get update && apt-get install libgl1 -y
RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
RUN pip3 install unstructured[all-docs] --upgrade
RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()'
RUN rm -rf /usr/local/lib/python3.8/dist-packages/tests


# COPY .cache /root/.cache
# COPY config_private.py config_private.py
# launch
CMD ["python3", "-u", "main.py"]
docs/GithubAction+NoLocal+Vectordb
ADDED
@@ -0,0 +1,26 @@
# This Dockerfile is for builds without local models; if you need local models such as chatglm, see docs/Dockerfile+ChatGLM
# How to build: edit `config.py` first, then run: docker build -t gpt-academic-nolocal-vs -f docs/GithubAction+NoLocal+Vectordb .
# How to run: docker run --rm -it --net=host gpt-academic-nolocal-vs
FROM python:3.11

# set the working directory
WORKDIR /gpt

# copy the project files
COPY . .

# install dependencies
RUN pip3 install -r requirements.txt

# install the extra dependencies of the knowledge-base plugin
RUN apt-get update && apt-get install libgl1 -y
RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
RUN pip3 install unstructured[all-docs] --upgrade
RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()'

# optional step to warm up the modules
RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'

# launch
CMD ["python3", "-u", "main.py"]
request_llms/bridge_qwen_local.py
ADDED
@@ -0,0 +1,59 @@
model_name = "Qwen_Local"
cmd_to_install = "`pip install -r request_llms/requirements_qwen_local.txt`"

from toolbox import ProxyNetworkActivate, get_conf
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns



# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Local Model
# ------------------------------------------------------------------------------------------------------------------------
class GetQwenLMHandle(LocalLLMHandle):

    def load_model_info(self):
        # 🏃‍♂️🏃‍♂️🏃‍♂️ runs in the child process
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # 🏃‍♂️🏃‍♂️🏃‍♂️ runs in the child process
        # from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation import GenerationConfig
        with ProxyNetworkActivate('Download_LLM'):
            model_id = get_conf('QWEN_LOCAL_MODEL_SELECTION')
            self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, resume_download=True)
            # use fp16
            model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval()
            model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # generation length, top_p and other hyper-parameters can be customized here
            self._model = model

        return self._model, self._tokenizer

    def llm_stream_generator(self, **kwargs):
        # 🏃‍♂️🏃‍♂️🏃‍♂️ runs in the child process
        def adaptor(kwargs):
            query = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            return query, max_length, top_p, temperature, history

        query, max_length, top_p, temperature, history = adaptor(kwargs)

        for response in self._model.chat_stream(self._tokenizer, query, history=history):
            yield response

    def try_to_import_special_deps(self, **kwargs):
        # import something that will raise an error if the user has not installed requirement_*.txt
        # 🏃‍♂️🏃‍♂️🏃‍♂️ runs in the main process
        import importlib
        importlib.import_module('modelscope')


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetQwenLMHandle, model_name)
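
The checkpoint to load is read from the `QWEN_LOCAL_MODEL_SELECTION` config key; a minimal config sketch follows. The model id shown is only an example of a Qwen chat checkpoint, not a value mandated by this commit.

# Illustrative sketch only: config entry consumed by load_model_and_tokenizer above.
QWEN_LOCAL_MODEL_SELECTION = "Qwen/Qwen-1_8B-Chat"  # example HuggingFace model id
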
request_llms/com_qwenapi.py
ADDED
@@ -0,0 +1,94 @@
from http import HTTPStatus
from toolbox import get_conf
import threading
import logging

timeout_bot_msg = '[Local Message] Request timeout. Network error.'

class QwenRequestInstance():
    def __init__(self):
        import dashscope
        self.time_to_yield_event = threading.Event()
        self.time_to_exit_event = threading.Event()
        self.result_buf = ""

        def validate_key():
            DASHSCOPE_API_KEY = get_conf("DASHSCOPE_API_KEY")
            if DASHSCOPE_API_KEY == '': return False
            return True

        if not validate_key():
            raise RuntimeError('请配置 DASHSCOPE_API_KEY')
        dashscope.api_key = get_conf("DASHSCOPE_API_KEY")


    def generate(self, inputs, llm_kwargs, history, system_prompt):
        # import _thread as thread
        from dashscope import Generation
        QWEN_MODEL = {
            'qwen-turbo': Generation.Models.qwen_turbo,
            'qwen-plus': Generation.Models.qwen_plus,
            'qwen-max': Generation.Models.qwen_max,
        }[llm_kwargs['llm_model']]
        top_p = llm_kwargs.get('top_p', 0.8)
        if top_p == 0: top_p += 1e-5
        if top_p == 1: top_p -= 1e-5

        self.result_buf = ""
        responses = Generation.call(
            model=QWEN_MODEL,
            messages=generate_message_payload(inputs, llm_kwargs, history, system_prompt),
            top_p=top_p,
            temperature=llm_kwargs.get('temperature', 1.0),
            result_format='message',
            stream=True,
            incremental_output=True
        )

        for response in responses:
            if response.status_code == HTTPStatus.OK:
                if response.output.choices[0].finish_reason == 'stop':
                    yield self.result_buf
                    break
                elif response.output.choices[0].finish_reason == 'length':
                    self.result_buf += "[Local Message] 生成长度过长,后续输出被截断"
                    yield self.result_buf
                    break
                else:
                    self.result_buf += response.output.choices[0].message.content
                    yield self.result_buf
            else:
                self.result_buf += f"[Local Message] 请求错误:状态码:{response.status_code},错误码:{response.code},消息:{response.message}"
                yield self.result_buf
                break
        logging.info(f'[raw_input] {inputs}')
        logging.info(f'[response] {self.result_buf}')
        return self.result_buf


def generate_message_payload(inputs, llm_kwargs, history, system_prompt):
    conversation_cnt = len(history) // 2
    if system_prompt == '': system_prompt = 'Hello!'
    messages = [{"role": "user", "content": system_prompt}, {"role": "assistant", "content": "Certainly!"}]
    if conversation_cnt:
        for index in range(0, 2*conversation_cnt, 2):
            what_i_have_asked = {}
            what_i_have_asked["role"] = "user"
            what_i_have_asked["content"] = history[index]
            what_gpt_answer = {}
            what_gpt_answer["role"] = "assistant"
            what_gpt_answer["content"] = history[index+1]
            if what_i_have_asked["content"] != "":
                if what_gpt_answer["content"] == "":
                    continue
                if what_gpt_answer["content"] == timeout_bot_msg:
                    continue
                messages.append(what_i_have_asked)
                messages.append(what_gpt_answer)
            else:
                messages[-1]['content'] = what_gpt_answer['content']
    what_i_ask_now = {}
    what_i_ask_now["role"] = "user"
    what_i_ask_now["content"] = inputs
    messages.append(what_i_ask_now)
    return messages
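
`generate_message_payload` can be checked without calling DashScope; note how the system prompt is injected as a leading user/assistant exchange because the payload format used here has no system role.

# Illustrative sketch only: inspect the payload offline.
from request_llms.com_qwenapi import generate_message_payload

msgs = generate_message_payload(
    inputs="今天天气如何?",
    llm_kwargs={},  # unused by this function
    history=["你好", "你好,有什么可以帮您?"],
    system_prompt="You are a helpful assistant.",
)
for m in msgs:
    print(m["role"], ":", m["content"])
# user : You are a helpful assistant.   <- system prompt as a user turn
# assistant : Certainly!
# user : 你好
# assistant : 你好,有什么可以帮您?
# user : 今天天气如何?
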
request_llms/requirements_qwen_local.txt
ADDED
@@ -0,0 +1,5 @@
modelscope
transformers_stream_generator
auto-gptq
optimum
urllib3<2
tests/test_vector_plugins.py
ADDED
@@ -0,0 +1,17 @@
"""
Tests for the project's plugins. How to run: python tests/test_vector_plugins.py
"""


import os, sys
def validate_path(): dir_name = os.path.dirname(__file__); root_dir_assume = os.path.abspath(dir_name + '/..'); os.chdir(root_dir_assume); sys.path.append(root_dir_assume)
validate_path()  # switch to the project root directory

if __name__ == "__main__":
    from tests.test_utils import plugin_test

    plugin_test(plugin='crazy_functions.知识库问答->知识库文件注入', main_input="./README.md")

    plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="What is the installation method?")

    plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="远程云服务器部署?")
themes/cookies.py
ADDED
File without changes