File size: 13,400 Bytes
e8cf757
ea031ab
e8cf757
 
fc33168
0b3f7b8
e8cf757
dcaa7a1
fc33168
 
e8cf757
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcaa7a1
fc33168
0b3f7b8
dcaa7a1
fc33168
 
 
 
 
 
 
 
 
 
 
 
 
 
dcaa7a1
 
 
 
fc33168
 
 
85d85d8
dcaa7a1
 
fc33168
 
 
 
 
 
 
 
 
 
85d85d8
0b3f7b8
 
 
 
 
 
 
fc33168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcaa7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b3f7b8
 
 
 
dcaa7a1
 
 
 
 
 
 
 
 
0b3f7b8
dcaa7a1
 
 
 
fc33168
 
 
 
85d85d8
dcaa7a1
0b3f7b8
dcaa7a1
0666fec
dcaa7a1
 
 
 
 
 
49a6ff6
0666fec
dcaa7a1
 
 
0b3f7b8
 
dcaa7a1
 
 
079916f
0666fec
dcaa7a1
 
 
 
 
 
 
 
 
 
 
 
 
0666fec
dcaa7a1
 
 
 
 
 
 
 
 
 
0666fec
dcaa7a1
 
 
0666fec
dcaa7a1
 
0666fec
dcaa7a1
 
85d85d8
e8cf757
dcaa7a1
fc33168
85d85d8
 
fc33168
85d85d8
dcaa7a1
e6cf553
 
0b3f7b8
 
85d85d8
 
 
fc33168
 
 
 
85d85d8
 
0b3f7b8
 
0666fec
85d85d8
 
 
fc33168
85d85d8
 
0b3f7b8
fc33168
 
0666fec
85d85d8
 
0b3f7b8
fc33168
0b3f7b8
85d85d8
 
fc33168
 
 
 
 
 
 
d32a52c
e8cf757
 
fc33168
 
49a6ff6
0b3f7b8
929c0af
85d85d8
e8cf757
 
 
 
 
0b3f7b8
 
 
 
 
e8cf757
929c0af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
from toolbox import CatchException, report_execption, write_results_to_file
from toolbox import update_ui
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from colorful import *

def read_and_clean_pdf_text(fp):
    """
    这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好,不建议任何人去读这个函数

    **输入参数说明**
    - `fp`:需要读取和清理文本的pdf文件路径

    **输出参数说明**
    - `meta_txt`:清理后的文本内容字符串
    - `page_one_meta`:第一页清理后的文本内容列表

    **函数功能**
    读取pdf文件并清理其中的文本内容,清理规则包括:
    - 提取所有块元的文本信息,并合并为一个字符串
    - 去除短块(字符数小于100)并替换为回车符
    - 清理多余的空行
    - 合并小写字母开头的段落块并替换为空格
    - 清除重复的换行
    - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
    """
    import fitz, copy
    import re
    import numpy as np
    fc = 0
    fs = 1
    fb = 2
    REMOVE_FOOT_NOTE = True
    REMOVE_FOOT_FFSIZE_PERCENT = 0.95 
    def primary_ffsize(l):
        fsize_statiscs = {}
        for wtf in l['spans']:
            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
            fsize_statiscs[wtf['size']] += len(wtf['text'])
        return max(fsize_statiscs, key=fsize_statiscs.get)
        
    def ffsize_same(a,b):
        return abs((a-b)/max(a,b)) < 0.02
    # file_content = ""
    with fitz.open(fp) as doc:
        meta_txt = []
        meta_font = []

        meta_line = []
        meta_span = []
        for index, page in enumerate(doc):
            # file_content += page.get_text()
            text_areas = page.get_text("dict")  # 获取页面上的文本信息
            for t in text_areas['blocks']:
                if 'lines' in t:
                    pf = 998
                    for l in t['lines']:
                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
                        pf = primary_ffsize(l)
                        meta_line.append([txt_line, pf, l['bbox'], l])
                        for wtf in l['spans']: # for l in t['lines']:
                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
                    # meta_line.append(["NEW_BLOCK", pf])
            # 块元提取                           for each word segment with in line                       for each line         cross-line words                          for each block
            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
            meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
                             for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
            if index == 0:
                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
        # 获取正文主字体
        fsize_statiscs = {}
        for span in meta_span:
            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
            fsize_statiscs[span[1]] += span[2]
        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
        if REMOVE_FOOT_NOTE:
            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT

        # 切分和重新整合
        mega_sec = []
        sec = []
        for index, line in enumerate(meta_line):
            if index == 0: 
                sec.append(line[fc])
                continue
            if REMOVE_FOOT_NOTE:
                if meta_line[index][fs] <= give_up_fize_threshold:
                    continue
            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
                # 尝试识别段落
                if meta_line[index][fc].endswith('.') and\
                    (meta_line[index-1][fc] != 'NEW_BLOCK') and \
                    (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
                    sec[-1] += line[fc]
                    sec[-1] += "\n\n"
                else:
                    sec[-1] += " "
                    sec[-1] += line[fc]
            else:
                if (index+1 < len(meta_line)) and \
                    meta_line[index][fs] > main_fsize:
                    # 单行 + 字体大
                    mega_sec.append(copy.deepcopy(sec))
                    sec = []
                    sec.append("# " + line[fc])
                else:
                    # 尝试识别section
                    if meta_line[index-1][fs] > meta_line[index][fs]:
                        sec.append("\n" + line[fc])
                    else:
                        sec.append(line[fc])
        mega_sec.append(copy.deepcopy(sec))

        finals = []
        for ms in mega_sec:
            final = " ".join(ms)
            final = final.replace('- ', ' ')
            finals.append(final)
        meta_txt = finals

        def 把字符太少的块清除为回车(meta_txt):
            for index, block_txt in enumerate(meta_txt):
                if len(block_txt) < 100:
                    meta_txt[index] = '\n'
            return meta_txt
        meta_txt = 把字符太少的块清除为回车(meta_txt)

        def 清理多余的空行(meta_txt):
            for index in reversed(range(1, len(meta_txt))):
                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
                    meta_txt.pop(index)
            return meta_txt
        meta_txt = 清理多余的空行(meta_txt)

        def 合并小写开头的段落块(meta_txt):
            def starts_with_lowercase_word(s):
                pattern = r"^[a-z]+"
                match = re.match(pattern, s)
                if match:
                    return True
                else:
                    return False
            for _ in range(100):
                for index, block_txt in enumerate(meta_txt):
                    if starts_with_lowercase_word(block_txt):
                        if meta_txt[index-1] != '\n':
                            meta_txt[index-1] += ' '
                        else:
                            meta_txt[index-1] = ''
                        meta_txt[index-1] += meta_txt[index]
                        meta_txt[index] = '\n'
            return meta_txt
        meta_txt = 合并小写开头的段落块(meta_txt)
        meta_txt = 清理多余的空行(meta_txt)

        meta_txt = '\n'.join(meta_txt)
        # 清除重复的换行
        for _ in range(5):
            meta_txt = meta_txt.replace('\n\n', '\n')

        # 换行 -> 双换行
        meta_txt = meta_txt.replace('\n', '\n\n')

        for f in finals:
            print亮黄(f)
            print亮绿('***************************')

    return meta_txt, page_one_meta


@CatchException
def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
    import glob
    import os

    # 基本信息:功能、贡献者
    chatbot.append([
        "函数插件功能?",
        "批量总结PDF文档。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

    # 尝试导入依赖,如果缺少依赖,则给出安装建议
    try:
        import fitz
        import tiktoken
    except:
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}",
                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf tiktoken```。")
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
        return

    # 清空历史,以免输入溢出
    history = []

    # 检测输入参数,如没有给定输入参数,直接退出
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "":
            txt = '空空如也的输入栏'
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
        return

    # 搜索需要处理的文件清单
    file_manifest = [f for f in glob.glob(
        f'{project_folder}/**/*.pdf', recursive=True)]

    # 如果没找到任何文件
    if len(file_manifest) == 0:
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}", b=f"找不到任何.tex或.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
        return

    # 开始正式执行任务
    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt)


def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt):
    import os
    import tiktoken
    TOKEN_LIMIT_PER_FRAGMENT = 1600
    generated_conclusion_files = []
    for index, fp in enumerate(file_manifest):

        # 读取PDF文件
        file_content, page_one = read_and_clean_pdf_text(fp)

        # 递归地切割PDF文件
        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
        from toolbox import get_conf
        enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
        def get_token_num(txt): return len(enc.encode(txt))
        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
            txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
            txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)

        # 为了更好的效果,我们剥离Introduction之后的部分(如果有)
        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
        
        # 单线,获取文章meta信息
        paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=f"以下是一篇学术论文的基础信息,请从中提取出“标题”、“收录会议或期刊”、“作者”、“摘要”、“编号”、“作者邮箱”这六个部分。请用markdown格式输出,最后用中文翻译摘要部分。请提取:{paper_meta}",
            inputs_show_user=f"请从{fp}中提取出“标题”、“收录会议或期刊”等基本信息。",
            llm_kwargs=llm_kwargs,
            chatbot=chatbot, history=[],
            sys_prompt="Your job is to collect information from materials。",
        )

        # 多线,翻译
        gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
            inputs_array=[
                f"以下是你需要翻译的论文片段:\n{frag}" for frag in paper_fragments],
            inputs_show_user_array=[f"\n---\n 原文: \n\n {frag.replace('#', '')}  \n---\n 翻译:\n " for frag in paper_fragments],
            llm_kwargs=llm_kwargs,
            chatbot=chatbot,
            history_array=[[paper_meta] for _ in paper_fragments],
            sys_prompt_array=[
                "请你作为一个学术翻译,负责把学术论文的片段准确翻译成中文。" for _ in paper_fragments],
            max_workers=16  # OpenAI所允许的最大并行过载
        )

        # 整理报告的格式
        for i,k in enumerate(gpt_response_collection): 
            if i%2==0:
                gpt_response_collection[i] = f"\n\n---\n\n ## 原文[{i//2}/{len(gpt_response_collection)//2}]: \n\n {paper_fragments[i//2].replace('#', '')}  \n\n---\n\n ## 翻译[{i//2}/{len(gpt_response_collection)//2}]:\n "
            else:
                gpt_response_collection[i] = gpt_response_collection[i]
        final = ["一、论文概况\n\n---\n\n", paper_meta_info.replace('# ', '### ') + '\n\n---\n\n', "二、论文翻译", ""]
        final.extend(gpt_response_collection)
        create_report_file_name = f"{os.path.basename(fp)}.trans.md"
        res = write_results_to_file(final, file_name=create_report_file_name)

        # 更新UI
        generated_conclusion_files.append(f'./gpt_log/{create_report_file_name}')
        chatbot.append((f"{fp}完成了吗?", res))
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

    # 准备文件的下载
    import shutil
    for pdf_path in generated_conclusion_files:
        # 重命名文件
        rename_file = f'./gpt_log/总结论文-{os.path.basename(pdf_path)}'
        if os.path.exists(rename_file):
            os.remove(rename_file)
        shutil.copyfile(pdf_path, rename_file)
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
    chatbot.append(("给出输出文件清单", str(generated_conclusion_files)))
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面