Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	改善chatpdf的功能
Browse files- crazy_functional.py +0 -7
- crazy_functions/crazy_utils.py +168 -0
- crazy_functions/批量翻译PDF文档_多线程.py +1 -166
- crazy_functions/理解PDF文档内容.py +56 -132
- version +2 -2
    	
        crazy_functional.py
    CHANGED
    
    | @@ -76,7 +76,6 @@ def get_crazy_functions(): | |
| 76 | 
             
                from crazy_functions.总结word文档 import 总结word文档
         | 
| 77 | 
             
                from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
         | 
| 78 | 
             
                from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
         | 
| 79 | 
            -
                from crazy_functions.理解PDF文档内容 import 理解PDF文档内容
         | 
| 80 | 
             
                from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
         | 
| 81 | 
             
                from crazy_functions.Latex全文润色 import Latex中文润色
         | 
| 82 | 
             
                from crazy_functions.Latex全文翻译 import Latex中译英
         | 
| @@ -108,11 +107,6 @@ def get_crazy_functions(): | |
| 108 | 
             
                        "Color": "stop",
         | 
| 109 | 
             
                        "Function": HotReload(总结word文档)
         | 
| 110 | 
             
                    },
         | 
| 111 | 
            -
                    # "[测试功能] 理解PDF文档内容(Tk文件选择接口,仅本地)": {
         | 
| 112 | 
            -
                    #     # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
         | 
| 113 | 
            -
                    #     "AsButton": False,  # 加入下拉菜单中
         | 
| 114 | 
            -
                    #     "Function": HotReload(理解PDF文档内容)
         | 
| 115 | 
            -
                    # },
         | 
| 116 | 
             
                    "[测试功能] 理解PDF文档内容(通用接口,读取文件输入区)": {
         | 
| 117 | 
             
                        # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
         | 
| 118 | 
             
                        "Color": "stop",
         | 
| @@ -131,7 +125,6 @@ def get_crazy_functions(): | |
| 131 | 
             
                        "AsButton": False,  # 加入下拉菜单中
         | 
| 132 | 
             
                        "Function": HotReload(Latex中文润色)
         | 
| 133 | 
             
                    },
         | 
| 134 | 
            -
             | 
| 135 | 
             
                    "[测试功能] Latex项目全文中译英(输入路径或上传压缩包)": {
         | 
| 136 | 
             
                        # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
         | 
| 137 | 
             
                        "Color": "stop",
         | 
|  | |
| 76 | 
             
                from crazy_functions.总结word文档 import 总结word文档
         | 
| 77 | 
             
                from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
         | 
| 78 | 
             
                from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
         | 
|  | |
| 79 | 
             
                from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
         | 
| 80 | 
             
                from crazy_functions.Latex全文润色 import Latex中文润色
         | 
| 81 | 
             
                from crazy_functions.Latex全文翻译 import Latex中译英
         | 
|  | |
| 107 | 
             
                        "Color": "stop",
         | 
| 108 | 
             
                        "Function": HotReload(总结word文档)
         | 
| 109 | 
             
                    },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 110 | 
             
                    "[测试功能] 理解PDF文档内容(通用接口,读取文件输入区)": {
         | 
| 111 | 
             
                        # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
         | 
| 112 | 
             
                        "Color": "stop",
         | 
|  | |
| 125 | 
             
                        "AsButton": False,  # 加入下拉菜单中
         | 
| 126 | 
             
                        "Function": HotReload(Latex中文润色)
         | 
| 127 | 
             
                    },
         | 
|  | |
| 128 | 
             
                    "[测试功能] Latex项目全文中译英(输入路径或上传压缩包)": {
         | 
| 129 | 
             
                        # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
         | 
| 130 | 
             
                        "Color": "stop",
         | 
    	
        crazy_functions/crazy_utils.py
    CHANGED
    
    | @@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit): | |
| 360 | 
             
                        # 这个中文的句号是故意的,作为一个标识而存在
         | 
| 361 | 
             
                        res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
         | 
| 362 | 
             
                        return [r.replace('。\n', '.') for r in res]
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 360 | 
             
                        # 这个中文的句号是故意的,作为一个标识而存在
         | 
| 361 | 
             
                        res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
         | 
| 362 | 
             
                        return [r.replace('。\n', '.') for r in res]
         | 
| 363 | 
            +
             | 
| 364 | 
            +
             | 
| 365 | 
            +
             | 
| 366 | 
            +
            def read_and_clean_pdf_text(fp):
                """
                Read a PDF and regroup its text into paragraph-like blocks.

                The function relies on font-size and line-width heuristics, so the
                result is best-effort.

                Args:
                    fp: Path of the PDF file to read (passed to ``fitz.open``).

                Returns:
                    A tuple ``(meta_txt, page_one_meta)``:

                    - ``meta_txt``: the cleaned text as a single string in which
                      every newline has been doubled.
                    - ``page_one_meta``: list with one string per text block of the
                      first page (lines joined by a space, ``'- '`` removed).

                Raises:
                    ValueError: if the document contains no text spans at all
                        (for example an empty or image-only PDF).

                Processing steps:
                    1. Collect every text line with its dominant font size and bbox.
                    2. Determine the main body font size (the size covering the most
                       characters) and skip lines at or below 95% of it.
                    3. Join consecutive lines of similar font size, start a new
                       section on larger-font lines, and mark paragraph ends.
                    4. Replace blocks shorter than 100 characters with a newline.
                    5. Merge blocks that start with a lowercase letter into the
                       preceding block.
                    6. Collapse repeated newlines, then double every newline.
                """
                import re
                import fitz
                from colorful import print亮黄, print亮绿

                # Field indices inside a ``meta_line`` record.
                fc = 0  # text content of the line
                fs = 1  # dominant font size of the line
                fb = 2  # bounding box; width is computed as bbox[2] - bbox[0]
                REMOVE_FOOT_NOTE = True
                REMOVE_FOOT_FFSIZE_PERCENT = 0.95

                def primary_ffsize(line):
                    """Return the font size that covers the most characters in ``line``."""
                    chars_per_size = {}
                    for span in line['spans']:
                        chars_per_size[span['size']] = chars_per_size.get(span['size'], 0) + len(span['text'])
                    return max(chars_per_size, key=chars_per_size.get)

                def ffsize_same(a, b):
                    """Return True when two font sizes differ by less than 2%."""
                    largest = max(a, b)
                    if largest == 0:
                        # Avoid dividing by zero for degenerate zero-size spans.
                        return a == b
                    return abs((a - b) / largest) < 0.02

                # ---- 1. Extraction: only this part needs the document to stay open ----
                meta_line = []      # [text, dominant font size, bbox] per line
                meta_span = []      # [text, font size, character count] per span
                page_one_meta = []
                with fitz.open(fp) as doc:
                    for index, page in enumerate(doc):
                        text_areas = page.get_text("dict")  # structured text of the page
                        text_blocks = [t for t in text_areas['blocks'] if 'lines' in t]
                        for t in text_blocks:
                            for l in t['lines']:
                                if not l['spans']:
                                    # A line without spans has no text and no font size.
                                    continue
                                txt_line = "".join([wtf['text'] for wtf in l['spans']])
                                meta_line.append([txt_line, primary_ffsize(l), l['bbox']])
                                for wtf in l['spans']:
                                    meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
                        if index == 0:
                            # One string per block: spans concatenated, lines joined by a
                            # space, and the '- ' left at line breaks removed.
                            page_one_meta = [
                                " ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace('- ', '')
                                for t in text_blocks
                            ]

                if not meta_span:
                    raise ValueError(f"No extractable text found in PDF: {fp!r}")

                # ---- 2. Main body font size: the size covering the most characters ----
                fsize_statiscs = {}
                for _, size, n_chars in meta_span:
                    fsize_statiscs[size] = fsize_statiscs.get(size, 0) + n_chars
                main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
                if REMOVE_FOOT_NOTE:
                    give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT

                # ---- 3. Split into sections and re-join lines ----
                mega_sec = []
                sec = []
                for index, line in enumerate(meta_line):
                    if index == 0:
                        sec.append(line[fc])
                        continue
                    if REMOVE_FOOT_NOTE and line[fs] <= give_up_fize_threshold:
                        # Small print (footnote-sized text) is dropped.
                        continue
                    # NOTE(review): the previous line is taken from meta_line even when
                    # that line itself was skipped above; kept as in the original.
                    prev = meta_line[index - 1]
                    if ffsize_same(line[fs], prev[fs]):
                        # Same font as the previous line: try to detect a paragraph end,
                        # i.e. a line ending with '.' that is clearly narrower than the
                        # line before it.
                        is_narrow = (line[fb][2] - line[fb][0]) < (prev[fb][2] - prev[fb][0]) * 0.7
                        if line[fc].endswith('.') and (prev[fc] != 'NEW_BLOCK') and is_narrow:
                            sec[-1] += line[fc]
                            sec[-1] += "\n\n"
                        else:
                            sec[-1] += " "
                            sec[-1] += line[fc]
                    elif (index + 1 < len(meta_line)) and line[fs] > main_fsize:
                        # Font changed and is larger than the body font: start a section.
                        mega_sec.append(sec)
                        sec = ["# " + line[fc]]
                    elif prev[fs] > line[fs]:
                        # Font got smaller: try to mark a section boundary.
                        sec.append("\n" + line[fc])
                    else:
                        sec.append(line[fc])
                mega_sec.append(sec)

                blocks = [" ".join(ms).replace('- ', ' ') for ms in mega_sec]

                # ---- 4-5. Block-level clean-up ----
                def clear_short_blocks(items):
                    """Replace every block shorter than 100 characters with a newline."""
                    for index, block_txt in enumerate(items):
                        if len(block_txt) < 100:
                            items[index] = '\n'
                    return items

                def drop_repeated_blank_blocks(items):
                    """Remove a newline-only block that directly follows another one."""
                    for index in reversed(range(1, len(items))):
                        if items[index] == '\n' and items[index - 1] == '\n':
                            items.pop(index)
                    return items

                def merge_lowercase_blocks(items):
                    """Append blocks starting with a lowercase letter to the previous block."""
                    lowercase_start = re.compile(r"^[a-z]+")
                    # A block may need several passes to travel back across blank blocks;
                    # keep the original cap of 100 passes but stop once nothing changes.
                    for _ in range(100):
                        merged_any = False
                        # Start at 1: the first block has no predecessor, and index -1
                        # would wrongly append it to the LAST block.
                        for index in range(1, len(items)):
                            if not lowercase_start.match(items[index]):
                                continue
                            if items[index - 1] != '\n':
                                items[index - 1] += ' '
                            else:
                                items[index - 1] = ''
                            items[index - 1] += items[index]
                            items[index] = '\n'
                            merged_any = True
                        if not merged_any:
                            break
                    return items

                blocks = clear_short_blocks(blocks)
                blocks = drop_repeated_blank_blocks(blocks)
                blocks = merge_lowercase_blocks(blocks)
                blocks = drop_repeated_blank_blocks(blocks)

                # ---- 6. Final string: collapse repeated newlines, then double them ----
                meta_txt = '\n'.join(blocks)
                for _ in range(5):
                    meta_txt = meta_txt.replace('\n\n', '\n')
                meta_txt = meta_txt.replace('\n', '\n\n')

                # Console dump of the cleaned blocks (diagnostic output).
                for f in blocks:
                    print亮黄(f)
                    print亮绿('***************************')

                return meta_txt, page_one_meta
         | 
    	
        crazy_functions/批量翻译PDF文档_多线程.py
    CHANGED
    
    | @@ -2,174 +2,9 @@ from toolbox import CatchException, report_execption, write_results_to_file | |
| 2 | 
             
            from toolbox import update_ui
         | 
| 3 | 
             
            from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
         | 
| 4 | 
             
            from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
         | 
|  | |
| 5 | 
             
            from colorful import *
         | 
| 6 |  | 
| 7 | 
            -
            def read_and_clean_pdf_text(fp):
         | 
| 8 | 
            -
                """
         | 
| 9 | 
            -
                这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好,不建议任何人去读这个函数
         | 
| 10 | 
            -
             | 
| 11 | 
            -
                **输入参数说明**
         | 
| 12 | 
            -
                - `fp`:需要读取和清理文本的pdf文件路径
         | 
| 13 | 
            -
             | 
| 14 | 
            -
                **输出参数说明**
         | 
| 15 | 
            -
                - `meta_txt`:清理后的文本内容字符串
         | 
| 16 | 
            -
                - `page_one_meta`:第一页清理后的文本内容列表
         | 
| 17 | 
            -
             | 
| 18 | 
            -
                **函数功能**
         | 
| 19 | 
            -
                读取pdf文件并清理其中的文本内容,清理规则包括:
         | 
| 20 | 
            -
                - 提取所有块元的文本信息,并合并为一个字符串
         | 
| 21 | 
            -
                - 去除短块(字符数小于100)并替换为回车符
         | 
| 22 | 
            -
                - 清理多余的空行
         | 
| 23 | 
            -
                - 合并小写字母开头的段落块并替换为空格
         | 
| 24 | 
            -
                - 清除重复的换行
         | 
| 25 | 
            -
                - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
         | 
| 26 | 
            -
                """
         | 
| 27 | 
            -
                import fitz, copy
         | 
| 28 | 
            -
                import re
         | 
| 29 | 
            -
                import numpy as np
         | 
| 30 | 
            -
                fc = 0
         | 
| 31 | 
            -
                fs = 1
         | 
| 32 | 
            -
                fb = 2
         | 
| 33 | 
            -
                REMOVE_FOOT_NOTE = True
         | 
| 34 | 
            -
                REMOVE_FOOT_FFSIZE_PERCENT = 0.95 
         | 
| 35 | 
            -
                def primary_ffsize(l):
         | 
| 36 | 
            -
                    fsize_statiscs = {}
         | 
| 37 | 
            -
                    for wtf in l['spans']:
         | 
| 38 | 
            -
                        if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
         | 
| 39 | 
            -
                        fsize_statiscs[wtf['size']] += len(wtf['text'])
         | 
| 40 | 
            -
                    return max(fsize_statiscs, key=fsize_statiscs.get)
         | 
| 41 | 
            -
                    
         | 
| 42 | 
            -
                def ffsize_same(a,b):
         | 
| 43 | 
            -
                    return abs((a-b)/max(a,b)) < 0.02
         | 
| 44 | 
            -
                # file_content = ""
         | 
| 45 | 
            -
                with fitz.open(fp) as doc:
         | 
| 46 | 
            -
                    meta_txt = []
         | 
| 47 | 
            -
                    meta_font = []
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                    meta_line = []
         | 
| 50 | 
            -
                    meta_span = []
         | 
| 51 | 
            -
                    for index, page in enumerate(doc):
         | 
| 52 | 
            -
                        # file_content += page.get_text()
         | 
| 53 | 
            -
                        text_areas = page.get_text("dict")  # 获取页面上的文本信息
         | 
| 54 | 
            -
                        for t in text_areas['blocks']:
         | 
| 55 | 
            -
                            if 'lines' in t:
         | 
| 56 | 
            -
                                pf = 998
         | 
| 57 | 
            -
                                for l in t['lines']:
         | 
| 58 | 
            -
                                    txt_line = "".join([wtf['text'] for wtf in l['spans']])
         | 
| 59 | 
            -
                                    pf = primary_ffsize(l)
         | 
| 60 | 
            -
                                    meta_line.append([txt_line, pf, l['bbox'], l])
         | 
| 61 | 
            -
                                    for wtf in l['spans']: # for l in t['lines']:
         | 
| 62 | 
            -
                                        meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
         | 
| 63 | 
            -
                                # meta_line.append(["NEW_BLOCK", pf])
         | 
| 64 | 
            -
                        # 块元提取                           for each word segment with in line                       for each line         cross-line words                          for each block
         | 
| 65 | 
            -
                        meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
         | 
| 66 | 
            -
                            '- ', '') for t in text_areas['blocks'] if 'lines' in t])
         | 
| 67 | 
            -
                        meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
         | 
| 68 | 
            -
                                         for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
         | 
| 69 | 
            -
                        if index == 0:
         | 
| 70 | 
            -
                            page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
         | 
| 71 | 
            -
                                '- ', '') for t in text_areas['blocks'] if 'lines' in t]
         | 
| 72 | 
            -
                    # 获取正文主字体
         | 
| 73 | 
            -
                    fsize_statiscs = {}
         | 
| 74 | 
            -
                    for span in meta_span:
         | 
| 75 | 
            -
                        if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
         | 
| 76 | 
            -
                        fsize_statiscs[span[1]] += span[2]
         | 
| 77 | 
            -
                    main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
         | 
| 78 | 
            -
                    if REMOVE_FOOT_NOTE:
         | 
| 79 | 
            -
                        give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
         | 
| 80 | 
            -
             | 
| 81 | 
            -
                    # 切分和重新整合
         | 
| 82 | 
            -
                    mega_sec = []
         | 
| 83 | 
            -
                    sec = []
         | 
| 84 | 
            -
                    for index, line in enumerate(meta_line):
         | 
| 85 | 
            -
                        if index == 0: 
         | 
| 86 | 
            -
                            sec.append(line[fc])
         | 
| 87 | 
            -
                            continue
         | 
| 88 | 
            -
                        if REMOVE_FOOT_NOTE:
         | 
| 89 | 
            -
                            if meta_line[index][fs] <= give_up_fize_threshold:
         | 
| 90 | 
            -
                                continue
         | 
| 91 | 
            -
                        if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
         | 
| 92 | 
            -
                            # 尝试识别段落
         | 
| 93 | 
            -
                            if meta_line[index][fc].endswith('.') and\
         | 
| 94 | 
            -
                                (meta_line[index-1][fc] != 'NEW_BLOCK') and \
         | 
| 95 | 
            -
                                (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
         | 
| 96 | 
            -
                                sec[-1] += line[fc]
         | 
| 97 | 
            -
                                sec[-1] += "\n\n"
         | 
| 98 | 
            -
                            else:
         | 
| 99 | 
            -
                                sec[-1] += " "
         | 
| 100 | 
            -
                                sec[-1] += line[fc]
         | 
| 101 | 
            -
                        else:
         | 
| 102 | 
            -
                            if (index+1 < len(meta_line)) and \
         | 
| 103 | 
            -
                                meta_line[index][fs] > main_fsize:
         | 
| 104 | 
            -
                                # 单行 + 字体大
         | 
| 105 | 
            -
                                mega_sec.append(copy.deepcopy(sec))
         | 
| 106 | 
            -
                                sec = []
         | 
| 107 | 
            -
                                sec.append("# " + line[fc])
         | 
| 108 | 
            -
                            else:
         | 
| 109 | 
            -
                                # 尝试识别section
         | 
| 110 | 
            -
                                if meta_line[index-1][fs] > meta_line[index][fs]:
         | 
| 111 | 
            -
                                    sec.append("\n" + line[fc])
         | 
| 112 | 
            -
                                else:
         | 
| 113 | 
            -
                                    sec.append(line[fc])
         | 
| 114 | 
            -
                    mega_sec.append(copy.deepcopy(sec))
         | 
| 115 | 
            -
             | 
| 116 | 
            -
                    finals = []
         | 
| 117 | 
            -
                    for ms in mega_sec:
         | 
| 118 | 
            -
                        final = " ".join(ms)
         | 
| 119 | 
            -
                        final = final.replace('- ', ' ')
         | 
| 120 | 
            -
                        finals.append(final)
         | 
| 121 | 
            -
                    meta_txt = finals
         | 
| 122 | 
            -
             | 
| 123 | 
            -
                    def 把字符太少的块清除为回车(meta_txt):
         | 
| 124 | 
            -
                        for index, block_txt in enumerate(meta_txt):
         | 
| 125 | 
            -
                            if len(block_txt) < 100:
         | 
| 126 | 
            -
                                meta_txt[index] = '\n'
         | 
| 127 | 
            -
                        return meta_txt
         | 
| 128 | 
            -
                    meta_txt = 把字符太少的块清除为回车(meta_txt)
         | 
| 129 | 
            -
             | 
| 130 | 
            -
                    def 清理多余的空行(meta_txt):
         | 
| 131 | 
            -
                        for index in reversed(range(1, len(meta_txt))):
         | 
| 132 | 
            -
                            if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
         | 
| 133 | 
            -
                                meta_txt.pop(index)
         | 
| 134 | 
            -
                        return meta_txt
         | 
| 135 | 
            -
                    meta_txt = 清理多余的空行(meta_txt)
         | 
| 136 | 
            -
             | 
| 137 | 
            -
                    def 合并小写开头的段落块(meta_txt):
         | 
| 138 | 
            -
                        def starts_with_lowercase_word(s):
         | 
| 139 | 
            -
                            pattern = r"^[a-z]+"
         | 
| 140 | 
            -
                            match = re.match(pattern, s)
         | 
| 141 | 
            -
                            if match:
         | 
| 142 | 
            -
                                return True
         | 
| 143 | 
            -
                            else:
         | 
| 144 | 
            -
                                return False
         | 
| 145 | 
            -
                        for _ in range(100):
         | 
| 146 | 
            -
                            for index, block_txt in enumerate(meta_txt):
         | 
| 147 | 
            -
                                if starts_with_lowercase_word(block_txt):
         | 
| 148 | 
            -
                                    if meta_txt[index-1] != '\n':
         | 
| 149 | 
            -
                                        meta_txt[index-1] += ' '
         | 
| 150 | 
            -
                                    else:
         | 
| 151 | 
            -
                                        meta_txt[index-1] = ''
         | 
| 152 | 
            -
                                    meta_txt[index-1] += meta_txt[index]
         | 
| 153 | 
            -
                                    meta_txt[index] = '\n'
         | 
| 154 | 
            -
                        return meta_txt
         | 
| 155 | 
            -
                    meta_txt = 合并小写开头的段落块(meta_txt)
         | 
| 156 | 
            -
                    meta_txt = 清理多余的空行(meta_txt)
         | 
| 157 | 
            -
             | 
| 158 | 
            -
                    meta_txt = '\n'.join(meta_txt)
         | 
| 159 | 
            -
                    # 清除重复的换行
         | 
| 160 | 
            -
                    for _ in range(5):
         | 
| 161 | 
            -
                        meta_txt = meta_txt.replace('\n\n', '\n')
         | 
| 162 | 
            -
             | 
| 163 | 
            -
                    # 换行 -> 双换行
         | 
| 164 | 
            -
                    meta_txt = meta_txt.replace('\n', '\n\n')
         | 
| 165 | 
            -
             | 
| 166 | 
            -
                    for f in finals:
         | 
| 167 | 
            -
                        print亮黄(f)
         | 
| 168 | 
            -
                        print亮绿('***************************')
         | 
| 169 | 
            -
             | 
| 170 | 
            -
                return meta_txt, page_one_meta
         | 
| 171 | 
            -
             | 
| 172 | 
            -
             | 
| 173 | 
             
            @CatchException
         | 
| 174 | 
             
            def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
         | 
| 175 | 
             
                import glob
         | 
|  | |
| 2 | 
             
            from toolbox import update_ui
         | 
| 3 | 
             
            from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
         | 
| 4 | 
             
            from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
         | 
| 5 | 
            +
            from .crazy_utils import read_and_clean_pdf_text
         | 
| 6 | 
             
            from colorful import *
         | 
| 7 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 8 | 
             
            @CatchException
         | 
| 9 | 
             
            def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
         | 
| 10 | 
             
                import glob
         | 
    	
        crazy_functions/理解PDF文档内容.py
    CHANGED
    
    | @@ -1,142 +1,66 @@ | |
| 1 | 
             
            from toolbox import update_ui
         | 
| 2 | 
             
            from toolbox import CatchException, report_execption
         | 
| 3 | 
            -
            import  | 
| 4 | 
            -
            import unicodedata
         | 
| 5 | 
             
            from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
         | 
| 6 | 
             
            fast_debug = False
         | 
| 7 |  | 
| 8 | 
            -
            def is_paragraph_break(match):
         | 
| 9 | 
            -
                """
         | 
| 10 | 
            -
                根据给定的匹配结果来判断换行符是否表示段落分隔。
         | 
| 11 | 
            -
                如果换行符前为句子结束标志(句号,感叹号,问号),且下一个字符为大写字母,则换行符更有可能表示段落分隔。
         | 
| 12 | 
            -
                也可以根据之前的内容长度来判断段落是否已经足够长。
         | 
| 13 | 
            -
                """
         | 
| 14 | 
            -
                prev_char, next_char = match.groups()
         | 
| 15 | 
            -
             | 
| 16 | 
            -
                # 句子结束标志
         | 
| 17 | 
            -
                sentence_endings = ".!?"
         | 
| 18 | 
            -
             | 
| 19 | 
            -
                # 设定一个最小段落长度阈值
         | 
| 20 | 
            -
                min_paragraph_length = 140
         | 
| 21 | 
            -
             | 
| 22 | 
            -
                if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
         | 
| 23 | 
            -
                    return "\n\n" 
         | 
| 24 | 
            -
                else:
         | 
| 25 | 
            -
                    return " "
         | 
| 26 | 
            -
             | 
| 27 | 
            -
            def normalize_text(text):
         | 
| 28 | 
            -
                """
         | 
| 29 | 
            -
                通过把连字(ligatures)等文本特殊符号转换为其基本形式来对文本进行归一化处理。
         | 
| 30 | 
            -
                例如,将连字 "fi" 转换为 "f" 和 "i"。
         | 
| 31 | 
            -
                """
         | 
| 32 | 
            -
                # 对文本进行归一化处理,分解连字
         | 
| 33 | 
            -
                normalized_text = unicodedata.normalize("NFKD", text)
         | 
| 34 | 
            -
             | 
| 35 | 
            -
                # 替换其他特殊字符
         | 
| 36 | 
            -
                cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
         | 
| 37 | 
            -
             | 
| 38 | 
            -
                return cleaned_text
         | 
| 39 | 
            -
             | 
| 40 | 
            -
            def clean_text(raw_text):
         | 
| 41 | 
            -
                """
         | 
| 42 | 
            -
                对从 PDF 提取出的原始文本进行清洗和格式化处理。
         | 
| 43 | 
            -
                1. 对原始文本进行归一化处理。
         | 
| 44 | 
            -
                2. 替换跨行的连词,例如 “Espe-\ncially” 转换为 “Especially”。
         | 
| 45 | 
            -
                3. 根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换。
         | 
| 46 | 
            -
                """
         | 
| 47 | 
            -
                # 对文本进行归一化处理
         | 
| 48 | 
            -
                normalized_text = normalize_text(raw_text)
         | 
| 49 | 
            -
             | 
| 50 | 
            -
                # 替换跨行的连词
         | 
| 51 | 
            -
                text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)
         | 
| 52 | 
            -
             | 
| 53 | 
            -
                # 根据前后相邻字符的特点,找到原文本中的换行符
         | 
| 54 | 
            -
                newlines = re.compile(r'(\S)\n(\S)')
         | 
| 55 | 
            -
             | 
| 56 | 
            -
                # 根据 heuristic 规则,用空格或段落分隔符替换原换行符
         | 
| 57 | 
            -
                final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text)
         | 
| 58 | 
            -
             | 
| 59 | 
            -
                return final_text.strip()
         | 
| 60 |  | 
| 61 | 
             
            def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
         | 
| 62 | 
            -
                import  | 
| 63 | 
             
                print('begin analysis on:', file_name)
         | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
                 | 
| 72 | 
            -
                 | 
| 73 | 
            -
                 | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
             | 
| 107 | 
            -
             | 
| 108 | 
            -
             | 
| 109 | 
            -
                 | 
| 110 | 
            -
             | 
| 111 | 
            -
                 | 
| 112 | 
            -
             | 
| 113 | 
            -
             | 
| 114 | 
            -
             | 
| 115 | 
            -
                 | 
| 116 | 
            -
             | 
| 117 | 
            -
                import tkinter as tk
         | 
| 118 | 
            -
                from tkinter import filedialog
         | 
| 119 | 
            -
             | 
| 120 | 
            -
                root = tk.Tk()
         | 
| 121 | 
            -
                root.withdraw()
         | 
| 122 | 
            -
                txt = filedialog.askopenfilename()
         | 
| 123 | 
            -
             | 
| 124 | 
            -
                # 尝试导入依赖,如果缺少依赖,则给出安装建议
         | 
| 125 | 
            -
                try:
         | 
| 126 | 
            -
                    import fitz
         | 
| 127 | 
            -
                except:
         | 
| 128 | 
            -
                    report_execption(chatbot, history, 
         | 
| 129 | 
            -
                        a = f"解析项目: {txt}", 
         | 
| 130 | 
            -
                        b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
         | 
| 131 | 
            -
                    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         | 
| 132 | 
            -
                    return
         | 
| 133 | 
            -
             | 
| 134 | 
            -
                # 清空历史,以免输入溢出
         | 
| 135 | 
            -
                history = []
         | 
| 136 | 
            -
             | 
| 137 | 
            -
                # 开始正式执行任务
         | 
| 138 | 
            -
                yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
         | 
| 139 | 
            -
             | 
| 140 |  | 
| 141 |  | 
| 142 | 
             
            @CatchException
         | 
| @@ -146,7 +70,7 @@ def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chat | |
| 146 | 
             
                # 基本信息:功能、贡献者
         | 
| 147 | 
             
                chatbot.append([
         | 
| 148 | 
             
                    "函数插件功能?",
         | 
| 149 | 
            -
                    "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe | 
| 150 | 
             
                yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         | 
| 151 |  | 
| 152 | 
             
                # 尝试导入依赖,如果缺少依赖,则给出安装建议
         | 
|  | |
| 1 | 
             
            from toolbox import update_ui
         | 
| 2 | 
             
            from toolbox import CatchException, report_execption
         | 
| 3 | 
            +
            from .crazy_utils import read_and_clean_pdf_text
         | 
|  | |
| 4 | 
             
            from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
         | 
| 5 | 
             
            fast_debug = False
         | 
| 6 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 7 |  | 
| 8 | 
             
            def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
         | 
| 9 | 
            +
                import tiktoken
         | 
| 10 | 
             
                print('begin analysis on:', file_name)
         | 
| 11 | 
            +
                file_content, page_one = read_and_clean_pdf_text(file_name)
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                # 递归地切割PDF文件,每一块(尽量是完整的一个section,比如introduction,experiment等,必要时再进行切割)
         | 
| 14 | 
            +
                # 的长度必须小于 2500 个 Token
         | 
| 15 | 
            +
                TOKEN_LIMIT_PER_FRAGMENT = 2500
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         | 
| 18 | 
            +
                from toolbox import get_conf
         | 
| 19 | 
            +
                enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
         | 
| 20 | 
            +
                def get_token_num(txt): return len(enc.encode(txt))
         | 
| 21 | 
            +
                paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         | 
| 22 | 
            +
                    txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         | 
| 23 | 
            +
                page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         | 
| 24 | 
            +
                    txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
         | 
| 25 | 
            +
                # 为了更好的效果,我们剥离Introduction之后的部分(如果有)
         | 
| 26 | 
            +
                paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
         | 
| 27 | 
            +
                
         | 
| 28 | 
            +
                ############################## <第一步,从摘要中提取高价值信息,放到history中> ##################################
         | 
| 29 | 
            +
                final_results = []
         | 
| 30 | 
            +
                final_results.append(paper_meta)
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                ############################## <第二步,迭代地历遍整个文章,提取精炼信息> ##################################
         | 
| 33 | 
            +
                i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # 用户提示
         | 
| 34 | 
            +
                chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # 更新UI
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                iteration_results = []
         | 
| 37 | 
            +
                last_iteration_result = paper_meta  # 初始值是摘要
         | 
| 38 | 
            +
                MAX_WORD_TOTAL = 4096
         | 
| 39 | 
            +
                n_fragment = len(paper_fragments)
         | 
| 40 | 
            +
                if n_fragment >= 20: print('文章极长,不能达到预期效果')
         | 
| 41 | 
            +
                for i in range(n_fragment):
         | 
| 42 | 
            +
                    NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
         | 
| 43 | 
            +
                    i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
         | 
| 44 | 
            +
                    i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
         | 
| 45 | 
            +
                    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问
         | 
| 46 | 
            +
                                                                                       llm_kwargs, chatbot, 
         | 
| 47 | 
            +
                                                                                       history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果
         | 
| 48 | 
            +
                                                                                       sys_prompt="Extract the main idea of this section."  # 提示
         | 
| 49 | 
            +
                                                                                    ) 
         | 
| 50 | 
            +
                    iteration_results.append(gpt_say)
         | 
| 51 | 
            +
                    last_iteration_result = gpt_say
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                ############################## <第三步,整理history> ##################################
         | 
| 54 | 
            +
                final_results.extend(iteration_results)
         | 
| 55 | 
            +
                final_results.append(f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。')
         | 
| 56 | 
            +
                # 接下来两句话只显示在界面上,不起实际作用
         | 
| 57 | 
            +
                i_say_show_user = f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
         | 
| 58 | 
            +
                chatbot.append([i_say_show_user, gpt_say])
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                ############################## <第四步,设置一个token上限,防止回答时Token溢出> ##################################
         | 
| 61 | 
            +
                from .crazy_utils import input_clipping
         | 
| 62 | 
            +
                _, final_results = input_clipping("", final_results, max_token_limit=3200)
         | 
| 63 | 
            +
                yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 64 |  | 
| 65 |  | 
| 66 | 
             
            @CatchException
         | 
|  | |
| 70 | 
             
                # 基本信息:功能、贡献者
         | 
| 71 | 
             
                chatbot.append([
         | 
| 72 | 
             
                    "函数插件功能?",
         | 
| 73 | 
            +
                    "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe, binary-husky"])
         | 
| 74 | 
             
                yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         | 
| 75 |  | 
| 76 | 
             
                # 尝试导入依赖,如果缺少依赖,则给出安装建议
         | 
    	
        version
    CHANGED
    
    | @@ -1,5 +1,5 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
            -
              "version": 2. | 
| 3 | 
             
              "show_feature": true,
         | 
| 4 | 
            -
              "new_feature": " | 
| 5 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
            +
              "version": 2.68,
         | 
| 3 | 
             
              "show_feature": true,
         | 
| 4 | 
            +
              "new_feature": "改善理解pdf(chatpdf)功能 <-> 如果一键更新失败,可前往github手动更新"
         | 
| 5 | 
             
            }
         |