|
from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import input_clipping
|
|
def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): |
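    """
    Summarize each PDF in file_manifest: read and clean the text, split it into
    token-limited fragments, summarize fragment by fragment with rolling context,
    then request one final structured summary. Progress is streamed to the chatbot
    UI and the collected results are written to a downloadable file.
    """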
|
    file_write_buffer = []
    for file_name in file_manifest:
        print('begin analysis on:', file_name)

        # Split the PDF into cleaned body text and the first page (the first page carries the metadata)
        file_content, page_one = read_and_clean_pdf_text(file_name)
        # Drop any characters that cannot survive a UTF-8 round trip
        file_content = file_content.encode('utf-8', 'ignore').decode()
        page_one = str(page_one).encode('utf-8', 'ignore').decode()

        # Recursively split the text into fragments that respect the token budget
        TOKEN_LIMIT_PER_FRAGMENT = 2500
        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
        page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])

        # Everything on the first page before the Introduction heading is treated as paper metadata
        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]

        final_results = [paper_meta]

        i_say_show_user = '首先你在中文语境下通读整篇论文。'  # "First, read through the entire paper in a Chinese context."
        gpt_say = "[Local Message] 收到。"  # "Received."
        chatbot.append([i_say_show_user, gpt_say])
        yield from update_ui(chatbot=chatbot, history=[])

        # Summarize fragment by fragment; each summary is fed back as one-step rolling context
        iteration_results = []
        last_iteration_result = paper_meta
        MAX_WORD_TOTAL = int(4096 * 0.7)
        n_fragment = len(paper_fragments)
        if n_fragment >= 20:
            print('文章极长,不能达到预期效果')  # the paper is extremely long; summary quality may degrade
        NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment  # per-fragment character budget
        for i in range(n_fragment):
            i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
            i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,
                                                                               llm_kwargs, chatbot,
                                                                               history=["The main idea of the previous section is?", last_iteration_result],
                                                                               sys_prompt="Extract the main idea of this section in Chinese."
                                                                               )
            iteration_results.append(gpt_say)
            last_iteration_result = gpt_say

        final_results.extend(iteration_results)
        final_results.append('Please conclude the paper discussed above.')
|
        # One final query: paper metadata plus a four-point structured summary
        NUM_OF_WORD = 1000
        i_say = """
1. Mark the title of the paper (with Chinese translation)
2. List all the authors' names (use English)
3. Mark the first author's affiliation (output Chinese translation only)
4. Mark the keywords of this article (use English)
5. Link to the paper and the Github code link (fill in Github:None if unavailable)
6. Summarize according to the following four points. Be sure to use Chinese answers (proper nouns need to be marked in English)
   - (1):What is the research background of this article?
   - (2):What are the past methods? What are the problems with them? Is the approach well motivated?
   - (3):What is the research methodology proposed in this paper?
   - (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
Follow the output format below:
1. Title: xxx\n\n
2. Authors: xxx\n\n
3. Affiliation: xxx\n\n
4. Keywords: xxx\n\n
5. Urls: xxx or xxx , xxx \n\n
6. Summary: \n\n
   - (1):xxx;\n
   - (2):xxx;\n
   - (3):xxx;\n
   - (4):xxx.\n\n
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible,
do not have too much repetitive information, numerical values using the original numbers.
"""
|
        file_write_buffer.extend(final_results)
        # Clip the accumulated context so the final request fits within the model's limit
        i_say, final_results = input_clipping(i_say, final_results, max_token_limit=2000)
        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=i_say, inputs_show_user='开始最终总结',
            llm_kwargs=llm_kwargs, chatbot=chatbot, history=final_results,
            sys_prompt=f"Extract the main idea of this paper with less than {NUM_OF_WORD} Chinese characters"
        )
        final_results.append(gpt_say)
        file_write_buffer.extend([i_say, gpt_say])

        # Clip once more before pushing the full history back to the UI
        _, final_results = input_clipping("", final_results, max_token_limit=3200)
        yield from update_ui(chatbot=chatbot, history=final_results)

    # After all PDFs are processed, write the buffered results to a file and offer it for download
    res = write_history_to_file(file_write_buffer)
    promote_file_to_downloadzone(res, chatbot=chatbot)
    yield from update_ui(chatbot=chatbot, history=final_results)
|
|
@CatchException |
|
def 批量总结PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): |
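    """
    Plugin entry point: announce the plugin, check the PyMuPDF dependency and the
    input path, collect every .pdf file under the target folder, and delegate the
    batch to 解析PDF.
    """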
|
    import glob, os

    # Report the plugin's functionality in the UI ("Batch-summarize PDF documents")
    chatbot.append([
        "函数插件功能?",
        "批量总结PDF文档。函数插件贡献者: ValeriaWong,Eralien"])
    yield from update_ui(chatbot=chatbot, history=history)
|
    # PDF parsing depends on PyMuPDF (imported as fitz)
    try:
        import fitz
    except ImportError:
        report_exception(chatbot, history,
                         a=f"解析项目: {txt}",
                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
        yield from update_ui(chatbot=chatbot, history=history)
        return
|
    # Clear the history to avoid carrying stale context into this run
    history = []

    # Validate the input path
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)
        return
|
    # Collect every PDF under the project folder (recursively)
    file_manifest = glob.glob(f'{project_folder}/**/*.pdf', recursive=True)
    if len(file_manifest) == 0:
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
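
# A minimal sketch of how a plugin like this is typically registered with the host
# application (assuming the conventions of gpt_academic's crazy_functional.py; the
# exact registry keys below are an assumption, not taken from this file):
#
#   from toolbox import HotReload
#   from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
#   function_plugins = {
#       "批量总结PDF文档": {
#           "Color": "Stop",                         # hypothetical UI hint
#           "Function": HotReload(批量总结PDF文档),   # wrap for hot reloading
#       },
#   }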
|
|