Spaces:
Build error
Build error
修复完成后的文件显示问题
Browse files- crazy_functions/crazy_utils.py +73 -0
- crazy_functions/批量翻译PDF文档_多线程.py +33 -124
crazy_functions/crazy_utils.py
CHANGED
@@ -1,6 +1,79 @@
|
|
1 |
|
2 |
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
5 |
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
6 |
if get_token_fn(txt_tocut) <= limit:
|
|
|
1 |
|
2 |
|
3 |
|
4 |
+
def request_gpt_model_in_new_thread_with_ui_alive(inputs, inputs_show_user, top_p, temperature, chatbot, history, sys_prompt, refresh_interval=0.2):
    """
    Run a single GPT request on a worker thread while keeping the UI refreshed.

    This is a generator and is meant to be driven with ``yield from``: it
    yields UI updates while the request is in flight and hands the final
    answer back through the generator's return value.

    Args:
        inputs: prompt forwarded to ``predict_no_ui_long_connection``.
        inputs_show_user: text appended to ``chatbot`` as the visible question.
        top_p: forwarded unchanged to ``predict_no_ui_long_connection``.
        temperature: forwarded unchanged to ``predict_no_ui_long_connection``.
        chatbot: list of ``[question, answer]`` pairs; mutated in place.
        history: forwarded unchanged to ``predict_no_ui_long_connection``.
        sys_prompt: forwarded unchanged to ``predict_no_ui_long_connection``.
        refresh_interval: seconds to sleep between two UI refreshes.

    Yields:
        ``(chatbot, [], msg)`` tuples, one per UI refresh.

    Returns:
        Whatever ``predict_no_ui_long_connection`` returned. An exception
        raised by the worker is re-raised here by ``future.result()``.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection
    # Give the user immediate feedback
    chatbot.append([inputs_show_user, ""]); msg = '正常'
    yield chatbot, [], msg
    executor = ThreadPoolExecutor(max_workers=16)
    # Two-slot list shared with the worker through `observe_window`:
    # slot 0 is displayed below as the partial answer, slot 1 is a timestamp
    # refreshed by this loop ("feeding the watchdog").
    # NOTE(review): presumably the worker fills slot 0 and monitors slot 1 -
    # confirm in request_llm.bridge_chatgpt.
    mutable = ["", time.time()]
    try:
        future = executor.submit(
            predict_no_ui_long_connection,
            inputs=inputs, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt, observe_window=mutable,
        )
        while True:
            # Sleep, then yield once so the front-end page gets refreshed
            time.sleep(refresh_interval)
            # Feed the watchdog
            mutable[1] = time.time()
            if future.done():
                break
            chatbot[-1] = [chatbot[-1][0], mutable[0]]; msg = "正常"
            yield chatbot, [], msg
        return future.result()
    finally:
        # Release the pool on every exit path, including the consumer closing
        # this generator early. wait=False so an abandoned request cannot
        # block the caller while it finishes.
        executor.shutdown(wait=False)
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(inputs_array, inputs_show_user_array, top_p, temperature, chatbot, history_array, sys_prompt_array, refresh_interval=0.2, max_workers=10, scroller_max_len=30):
    """
    Run many GPT requests concurrently while showing per-request progress.

    This is a generator and is meant to be driven with ``yield from``: it
    yields UI updates while requests are in flight and hands the collected
    answers back through the generator's return value.

    Args:
        inputs_array: one prompt per request.
        inputs_show_user_array: one display text per request; paired with the
            responses in the returned list.
        top_p: forwarded unchanged to ``predict_no_ui_long_connection``.
        temperature: forwarded unchanged to ``predict_no_ui_long_connection``.
        chatbot: list of ``[question, answer]`` pairs; mutated in place.
        history_array: one history per request (same length as ``inputs_array``).
        sys_prompt_array: one system prompt per request (same length as
            ``inputs_array``).
        refresh_interval: seconds to sleep between two UI refreshes.
        max_workers: size of the thread pool.
        scroller_max_len: number of trailing characters of each partial
            answer shown in the progress display.

    Yields:
        ``(chatbot, [], msg)`` tuples, one per UI refresh.

    Returns:
        A flat list ``[show_user_0, response_0, show_user_1, response_1, ...]``.
        An exception raised by a worker is re-raised by ``f.result()``.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection
    assert len(inputs_array) == len(history_array)
    assert len(inputs_array) == len(sys_prompt_array)
    executor = ThreadPoolExecutor(max_workers=max_workers)
    try:
        # Give the user immediate feedback
        chatbot.append(["请开始多线程操作。", ""]); msg = '正常'
        yield chatbot, [], msg
        # One [partial_text, heartbeat_timestamp] slot per request, handed to
        # the worker through `observe_window`.
        # NOTE(review): presumably the worker fills slot 0 and monitors slot 1 -
        # confirm in request_llm.bridge_chatgpt.
        mutable = [["", time.time()] for _ in inputs_array]

        def _req_gpt(index, inputs, history, sys_prompt):
            return predict_no_ui_long_connection(
                inputs=inputs, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt, observe_window=mutable[index]
            )

        # Start the asynchronous tasks
        futures = [
            executor.submit(_req_gpt, index, inputs, history, sys_prompt)
            for index, (inputs, history, sys_prompt) in enumerate(zip(inputs_array, history_array, sys_prompt_array))
        ]
        cnt = 0
        while True:
            # Sleep, then yield once so the front-end page gets refreshed
            time.sleep(refresh_interval); cnt += 1
            worker_done = [h.done() for h in futures]
            if all(worker_done):
                break
            # Feed the watchdog of every request
            for slot in mutable:
                slot[1] = time.time()
            # Build a short scrolling preview of each partial answer; characters
            # that would disturb the rendered output are replaced by dots.
            observe_win = []
            for slot in mutable:
                tail = slot[0][-scroller_max_len:].\
                    replace('\n','').replace('```','...').replace(' ','.').replace('<br/>','.....').replace('$','.')
                observe_win.append("[ ...`" + tail + "`... ]")
            stat_str = ''.join([f'执行中: {obs}\n\n' if not done else '已完成\n\n' for done, obs in zip(worker_done, observe_win)])
            # The trailing run of 1-10 dots animates so the page visibly changes
            chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + '.' * (cnt % 10 + 1)]; msg = "正常"
            yield chatbot, [], msg
        # All asynchronous tasks have finished; collect the results.
        # NOTE: zip stops at the shorter sequence, so a short
        # inputs_show_user_array would drop trailing responses.
        gpt_response_collection = []
        for inputs_show_user, f in zip(inputs_show_user_array, futures):
            gpt_response_collection.extend([inputs_show_user, f.result()])
        return gpt_response_collection
    finally:
        # Release the pool on every exit path, including the consumer closing
        # this generator early or a worker exception propagating.
        executor.shutdown(wait=False)
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
78 |
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
79 |
if get_token_fn(txt_tocut) <= limit:
|
crazy_functions/批量翻译PDF文档_多线程.py
CHANGED
@@ -1,66 +1,25 @@
|
|
1 |
-
from toolbox import CatchException, report_execption, write_results_to_file
|
2 |
-
import re
|
3 |
-
import unicodedata
|
4 |
|
5 |
-
|
6 |
-
def is_paragraph_break(match):
|
7 |
-
"""
|
8 |
-
根据给定的匹配结果来判断换行符是否表示段落分隔。
|
9 |
-
如果换行符前为句子结束标志(句号,感叹号,问号),且下一个字符为大写字母,则换行符更有可能表示段落分隔。
|
10 |
-
也可以根据之前的内容长度来判断段落是否已经足够长。
|
11 |
-
"""
|
12 |
-
prev_char, next_char = match.groups()
|
13 |
-
|
14 |
-
# 句子结束标志
|
15 |
-
sentence_endings = ".!?"
|
16 |
-
|
17 |
-
# 设定一个最小段落长度阈值
|
18 |
-
min_paragraph_length = 140
|
19 |
-
|
20 |
-
if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
|
21 |
-
return "\n\n"
|
22 |
-
else:
|
23 |
-
return " "
|
24 |
-
|
25 |
-
|
26 |
-
def normalize_text(text):
|
27 |
-
"""
|
28 |
-
通过把连字(ligatures)等文本特殊符号转换为其基本形式来对文本进行归一化处理。
|
29 |
-
例如,将连字 "fi" 转换为 "f" 和 "i"。
|
30 |
-
"""
|
31 |
-
# 对文本进行归一化处理,分解连字
|
32 |
-
normalized_text = unicodedata.normalize("NFKD", text)
|
33 |
-
|
34 |
-
# 替换其他特殊字符
|
35 |
-
cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
|
36 |
-
|
37 |
-
return cleaned_text
|
38 |
-
|
39 |
-
|
40 |
-
def clean_text(raw_text):
|
41 |
"""
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
"""
|
47 |
-
# 对文本进行归一化处理
|
48 |
-
normalized_text = normalize_text(raw_text)
|
49 |
-
|
50 |
-
# 替换跨行的连词
|
51 |
-
text = re.sub(r'(\w+-\n\w+)',
|
52 |
-
lambda m: m.group(1).replace('-\n', ''), normalized_text)
|
53 |
-
|
54 |
-
# 根据前后相邻字符的特点,找到原文本中的换行符
|
55 |
-
newlines = re.compile(r'(\S)\n(\S)')
|
56 |
-
|
57 |
-
# 根据 heuristic 规则,用空格或段落分隔符替换原换行符
|
58 |
-
final_text = re.sub(newlines, lambda m: m.group(
|
59 |
-
1) + is_paragraph_break(m) + m.group(2), text)
|
60 |
-
|
61 |
-
return final_text.strip()
|
62 |
-
|
63 |
-
def read_and_clean_pdf_text(fp):
|
64 |
import fitz, re
|
65 |
import numpy as np
|
66 |
# file_content = ""
|
@@ -170,69 +129,7 @@ def 批量翻译PDF文档(txt, top_p, temperature, chatbot, history, sys_prompt,
|
|
170 |
yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt)
|
171 |
|
172 |
|
173 |
-
def request_gpt_model_in_new_thread_with_ui_alive(inputs, inputs_show_user, top_p, temperature, chatbot, history, sys_prompt, refresh_interval=0.2):
|
174 |
-
import time
|
175 |
-
from concurrent.futures import ThreadPoolExecutor
|
176 |
-
from request_llm.bridge_chatgpt import predict_no_ui_long_connection
|
177 |
-
# 用户反馈
|
178 |
-
chatbot.append([inputs_show_user, ""]); msg = '正常'
|
179 |
-
yield chatbot, [], msg
|
180 |
-
executor = ThreadPoolExecutor(max_workers=16)
|
181 |
-
mutable = ["", time.time()]
|
182 |
-
future = executor.submit(lambda:
|
183 |
-
predict_no_ui_long_connection(inputs=inputs, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt, observe_window=mutable)
|
184 |
-
)
|
185 |
-
while True:
|
186 |
-
# yield一次以刷新前端页面
|
187 |
-
time.sleep(refresh_interval)
|
188 |
-
# “喂狗”(看门狗)
|
189 |
-
mutable[1] = time.time()
|
190 |
-
if future.done(): break
|
191 |
-
chatbot[-1] = [chatbot[-1][0], mutable[0]]; msg = "正常"
|
192 |
-
yield chatbot, [], msg
|
193 |
-
return future.result()
|
194 |
|
195 |
-
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(inputs_array, inputs_show_user_array, top_p, temperature, chatbot, history_array, sys_prompt_array, refresh_interval=0.2, max_workers=10, scroller_max_len=30):
|
196 |
-
import time
|
197 |
-
from concurrent.futures import ThreadPoolExecutor
|
198 |
-
from request_llm.bridge_chatgpt import predict_no_ui_long_connection
|
199 |
-
assert len(inputs_array) == len(history_array)
|
200 |
-
assert len(inputs_array) == len(sys_prompt_array)
|
201 |
-
executor = ThreadPoolExecutor(max_workers=max_workers)
|
202 |
-
n_frag = len(inputs_array)
|
203 |
-
# 异步原子
|
204 |
-
mutable = [["", time.time()] for _ in range(n_frag)]
|
205 |
-
def _req_gpt(index, inputs, history, sys_prompt):
|
206 |
-
gpt_say = predict_no_ui_long_connection(
|
207 |
-
inputs=inputs, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt, observe_window=mutable[index]
|
208 |
-
)
|
209 |
-
return gpt_say
|
210 |
-
# 异步任务开始
|
211 |
-
futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip(range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)]
|
212 |
-
cnt = 0
|
213 |
-
while True:
|
214 |
-
# yield一次以刷新前端页面
|
215 |
-
time.sleep(refresh_interval); cnt += 1
|
216 |
-
worker_done = [h.done() for h in futures]
|
217 |
-
if all(worker_done): executor.shutdown(); break
|
218 |
-
# 更好的UI视觉效果
|
219 |
-
observe_win = []
|
220 |
-
# 每个线程都要“喂狗”(看门狗)
|
221 |
-
for thread_index, _ in enumerate(worker_done): mutable[thread_index][1] = time.time()
|
222 |
-
# 在前端打印些好玩的东西
|
223 |
-
for thread_index, _ in enumerate(worker_done):
|
224 |
-
print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\
|
225 |
-
replace('\n','').replace('```','...').replace(' ','.').replace('<br/>','.....').replace('$','.')+"`... ]"
|
226 |
-
observe_win.append(print_something_really_funny)
|
227 |
-
stat_str = ''.join([f'执行中: {obs}\n\n' if not done else '已完成\n\n' for done, obs in zip(worker_done, observe_win)])
|
228 |
-
chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt%10+1))]; msg = "正常"
|
229 |
-
yield chatbot, [], msg
|
230 |
-
# 异步任务结束
|
231 |
-
gpt_response_collection = []
|
232 |
-
for inputs_show_user, f in zip(inputs_show_user_array, futures):
|
233 |
-
gpt_res = f.result()
|
234 |
-
gpt_response_collection.extend([inputs_show_user, gpt_res])
|
235 |
-
return gpt_response_collection
|
236 |
|
237 |
def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt):
|
238 |
import time
|
@@ -241,7 +138,7 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, histor
|
|
241 |
import fitz
|
242 |
import tiktoken
|
243 |
TOKEN_LIMIT_PER_FRAGMENT = 1600
|
244 |
-
|
245 |
for index, fp in enumerate(file_manifest):
|
246 |
# 读取PDF文件
|
247 |
file_content, page_one = read_and_clean_pdf_text(fp)
|
@@ -277,7 +174,19 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, histor
|
|
277 |
|
278 |
final = ["", paper_meta_info + '\n\n---\n\n---\n\n---\n\n']
|
279 |
final.extend(gpt_response_collection)
|
280 |
-
|
|
|
|
|
281 |
chatbot.append((f"{fp}完成了吗?", res)); msg = "完成"
|
282 |
yield chatbot, history, msg
|
283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from toolbox import CatchException, report_execption, write_results_to_file
|
2 |
+
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
|
3 |
+
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
|
4 |
|
5 |
+
def read_and_clean_pdf_text(fp):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
"""
|
7 |
+
**输入参数说明**
|
8 |
+
- `fp`:需要读取和清理文本的pdf文件路径
|
9 |
+
|
10 |
+
**输出参数说明**
|
11 |
+
- `meta_txt`:清理后的文本内容字符串
|
12 |
+
- `page_one_meta`:第一页清理后的文本内容列表
|
13 |
+
|
14 |
+
**函数功能**
|
15 |
+
读取pdf文件并清理其中的文本内容,清理规则包括:
|
16 |
+
- 提取所有块元的文本信息,并合并为一个字符串
|
17 |
+
- 去除短块(字符数小于100)并替换为回车符
|
18 |
+
- 清理多余的空行
|
19 |
+
- 合并小写字母开头的段落块并替换为空格
|
20 |
+
- 清除重复的换行
|
21 |
+
- 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
|
22 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
import fitz, re
|
24 |
import numpy as np
|
25 |
# file_content = ""
|
|
|
129 |
yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt)
|
130 |
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt):
|
135 |
import time
|
|
|
138 |
import fitz
|
139 |
import tiktoken
|
140 |
TOKEN_LIMIT_PER_FRAGMENT = 1600
|
141 |
+
generated_conclusion_files = []
|
142 |
for index, fp in enumerate(file_manifest):
|
143 |
# 读取PDF文件
|
144 |
file_content, page_one = read_and_clean_pdf_text(fp)
|
|
|
174 |
|
175 |
final = ["", paper_meta_info + '\n\n---\n\n---\n\n---\n\n']
|
176 |
final.extend(gpt_response_collection)
|
177 |
+
create_report_file_name = f"{os.path.basename(fp)}.trans.md"
|
178 |
+
res = write_results_to_file(final, file_name=create_report_file_name)
|
179 |
+
generated_conclusion_files.append(f'./gpt_log/{create_report_file_name}')
|
180 |
chatbot.append((f"{fp}完成了吗?", res)); msg = "完成"
|
181 |
yield chatbot, history, msg
|
182 |
|
183 |
+
# 准备文件的下载
|
184 |
+
import shutil
|
185 |
+
for pdf_path in generated_conclusion_files:
|
186 |
+
# 重命名文件
|
187 |
+
rename_file = f'./gpt_log/总结论文-{os.path.basename(pdf_path)}'
|
188 |
+
if os.path.exists(rename_file): os.remove(rename_file)
|
189 |
+
shutil.copyfile(pdf_path, rename_file);
|
190 |
+
if os.path.exists(pdf_path): os.remove(pdf_path)
|
191 |
+
chatbot.append(("给出输出文件清单", str(generated_conclusion_files)))
|
192 |
+
yield chatbot, history, msg
|