gpt-academic

Build error

App Files Community

qingxu99 committed on Apr 13, 2023

Commit

fc222bf

•

1 Parent(s): 0b1d833

Lua工程解析+修正注释

Browse files

Files changed (3) hide show

crazy_functional.py +7 -9
crazy_functions/crazy_utils.py +18 -8
crazy_functions/理解PDF文档内容.py +7 -6

crazy_functional.py CHANGED Viewed

@@ -16,7 +16,7 @@ def get_crazy_functions():
     from crazy_functions.高级功能函数模板 import 高阶功能模板函数
     from crazy_functions.代码重写为全英文_多线程 import 全项目切换英文
     from crazy_functions.Latex全文润色 import Latex英文润色
     function_plugins = {
         "解析整个Python项目": {
@@ -47,6 +47,11 @@ def get_crazy_functions():
             "AsButton": False,  # 加入下拉菜单中
             "Function": HotReload(解析一个Rect项目)
         },
         "读Tex论文写摘要": {
             "Color": "stop",    # 按钮颜色
             "Function": HotReload(读文章写摘要)
@@ -156,14 +161,7 @@ def get_crazy_functions():
     except Exception as err:
         print(f'[下载arxiv论文并翻译摘要] 插件导入失败 {str(err)}')
-    from crazy_functions.解析项目源代码 import 解析一个Lua项目
-    function_plugins.update({
-        "解析整个Lua项目": {
-            "Color": "stop",    # 按钮颜色
-            "AsButton": False,  # 加入下拉菜单中
-            "Function": HotReload(解析一个Lua项目)
-        },
-    })
     ###################### 第n组插件 ###########################
     return function_plugins

     from crazy_functions.高级功能函数模板 import 高阶功能模板函数
     from crazy_functions.代码重写为全英文_多线程 import 全项目切换英文
     from crazy_functions.Latex全文润色 import Latex英文润色
+    from crazy_functions.解析项目源代码 import 解析一个Lua项目
     function_plugins = {
         "解析整个Python项目": {
             "AsButton": False,  # 加入下拉菜单中
             "Function": HotReload(解析一个Rect项目)
         },
+        "解析整个Lua项目": {
+            "Color": "stop",    # 按钮颜色
+            "AsButton": False,  # 加入下拉菜单中
+            "Function": HotReload(解析一个Lua项目)
+        },
         "读Tex论文写摘要": {
             "Color": "stop",    # 按钮颜色
             "Function": HotReload(读文章写摘要)
     except Exception as err:
         print(f'[下载arxiv论文并翻译摘要] 插件导入失败 {str(err)}')
     ###################### 第n组插件 ###########################
     return function_plugins

crazy_functions/crazy_utils.py CHANGED Viewed

@@ -387,12 +387,15 @@ def read_and_clean_pdf_text(fp):
     import re
     import numpy as np
     from colorful import print亮黄, print亮绿
-    fc = 0
-    fs = 1
-    fb = 2
-    REMOVE_FOOT_NOTE = True
-    REMOVE_FOOT_FFSIZE_PERCENT = 0.95
     def primary_ffsize(l):
         fsize_statiscs = {}
         for wtf in l['spans']:
             if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
@@ -400,14 +403,18 @@ def read_and_clean_pdf_text(fp):
         return max(fsize_statiscs, key=fsize_statiscs.get)
     def ffsize_same(a,b):
         return abs((a-b)/max(a,b)) < 0.02
-    # file_content = ""
     with fitz.open(fp) as doc:
         meta_txt = []
         meta_font = []
         meta_line = []
         meta_span = []
         for index, page in enumerate(doc):
             # file_content += page.get_text()
             text_areas = page.get_text("dict")  # 获取页面上的文本信息
@@ -429,7 +436,8 @@ def read_and_clean_pdf_text(fp):
             if index == 0:
                 page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                     '- ', '') for t in text_areas['blocks'] if 'lines' in t]
-        # 获取正文主字体
         fsize_statiscs = {}
         for span in meta_span:
             if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
@@ -438,7 +446,7 @@ def read_and_clean_pdf_text(fp):
         if REMOVE_FOOT_NOTE:
             give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
-        # 切分和重新整合
         mega_sec = []
         sec = []
         for index, line in enumerate(meta_line):
@@ -480,6 +488,7 @@ def read_and_clean_pdf_text(fp):
             finals.append(final)
         meta_txt = finals
         def 把字符太少的块清除为回车(meta_txt):
             for index, block_txt in enumerate(meta_txt):
                 if len(block_txt) < 100:
@@ -523,6 +532,7 @@ def read_and_clean_pdf_text(fp):
         # 换行 -> 双换行
         meta_txt = meta_txt.replace('\n', '\n\n')
         for f in finals:
             print亮黄(f)
             print亮绿('***************************')

     import re
     import numpy as np
     from colorful import print亮黄, print亮绿
+    fc = 0  # Index 0 文本
+    fs = 1  # Index 1 字体
+    fb = 2  # Index 2 框框
+    REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 （比正文字体小，如参考文献、脚注、图注等）
+    REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 小于正文的？时，判定为不是正文（有些文章的正文部分字体大小不是100%统一的，有肉眼不可见的小变化）
     def primary_ffsize(l):
+        """
+        提取文本块主字体
+        """
         fsize_statiscs = {}
         for wtf in l['spans']:
             if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
         return max(fsize_statiscs, key=fsize_statiscs.get)
     def ffsize_same(a,b):
+        """
+        提取字体大小是否近似相等
+        """
         return abs((a-b)/max(a,b)) < 0.02
     with fitz.open(fp) as doc:
         meta_txt = []
         meta_font = []
         meta_line = []
         meta_span = []
+        ############################## <第 1 步，搜集初始信息> ##################################
         for index, page in enumerate(doc):
             # file_content += page.get_text()
             text_areas = page.get_text("dict")  # 获取页面上的文本信息
             if index == 0:
                 page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                     '- ', '') for t in text_areas['blocks'] if 'lines' in t]
+        ############################## <第 2 步，获取正文主字体> ##################################
         fsize_statiscs = {}
         for span in meta_span:
             if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
         if REMOVE_FOOT_NOTE:
             give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
+        ############################## <第 3 步，切分和重新整合> ##################################
         mega_sec = []
         sec = []
         for index, line in enumerate(meta_line):
             finals.append(final)
         meta_txt = finals
+        ############################## <第 4 步，乱七八糟的后处理> ##################################
         def 把字符太少的块清除为回车(meta_txt):
             for index, block_txt in enumerate(meta_txt):
                 if len(block_txt) < 100:
         # 换行 -> 双换行
         meta_txt = meta_txt.replace('\n', '\n\n')
+        ############################## <第 5 步，展示分割效果> ##################################
         for f in finals:
             print亮黄(f)
             print亮绿('***************************')

crazy_functions/理解PDF文档内容.py CHANGED Viewed

@@ -8,11 +8,12 @@ fast_debug = False
 def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
     import tiktoken
     print('begin analysis on:', file_name)
-    file_content, page_one = read_and_clean_pdf_text(file_name)
-    ############################## <第零步，从摘要中提取高价值信息，放到history中> ##################################
     # 递归地切割PDF文件，每一块（尽量是完整的一个section，比如introduction，experiment等，必要时再进行切割）
     # 的长度必须小于 2500 个 Token
     TOKEN_LIMIT_PER_FRAGMENT = 2500
     from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
@@ -26,11 +27,11 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     # 为了更好的效果，我们剥离Introduction之后的部分（如果有）
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
-    ############################## <第一步，从摘要中提取高价值信息，放到history中> ##################################
     final_results = []
     final_results.append(paper_meta)
-    ############################## <第二步，迭代地历遍整个文章，提取精炼信息> ##################################
     i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # 用户提示
     chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # 更新UI
@@ -51,14 +52,14 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
         iteration_results.append(gpt_say)
         last_iteration_result = gpt_say
-    ############################## <第三步，整理history> ##################################
     final_results.extend(iteration_results)
     final_results.append(f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。')
     # 接下来两句话只显示在界面上，不起实际作用
     i_say_show_user = f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
     chatbot.append([i_say_show_user, gpt_say])
-    ############################## <第四步，设置一个token上限，防止回答时Token溢出> ##################################
     from .crazy_utils import input_clipping
     _, final_results = input_clipping("", final_results, max_token_limit=3200)
     yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了

 def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
     import tiktoken
     print('begin analysis on:', file_name)
+    ############################## <第 0 步，切割PDF> ##################################
     # 递归地切割PDF文件，每一块（尽量是完整的一个section，比如introduction，experiment等，必要时再进行切割）
     # 的长度必须小于 2500 个 Token
+    file_content, page_one = read_and_clean_pdf_text(file_name) # （尝试）按照章节切割PDF
     TOKEN_LIMIT_PER_FRAGMENT = 2500
     from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
     # 为了更好的效果，我们剥离Introduction之后的部分（如果有）
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+    ############################## <第 1 步，从摘要中提取高价值信息，放到history中> ##################################
     final_results = []
     final_results.append(paper_meta)
+    ############################## <第 2 步，迭代地历遍整个文章，提取精炼信息> ##################################
     i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # 用户提示
     chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # 更新UI
         iteration_results.append(gpt_say)
         last_iteration_result = gpt_say
+    ############################## <第 3 步，整理history> ##################################
     final_results.extend(iteration_results)
     final_results.append(f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。')
     # 接下来两句话只显示在界面上，不起实际作用
     i_say_show_user = f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
     chatbot.append([i_say_show_user, gpt_say])
+    ############################## <第 4 步，设置一个token上限，防止回答时Token溢出> ##################################
     from .crazy_utils import input_clipping
     _, final_results = input_clipping("", final_results, max_token_limit=3200)
     yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了