Spaces:
Running
Running
| from toolbox import update_ui | |
| from toolbox import CatchException, report_execption, write_results_to_file, get_conf | |
| import re, requests, unicodedata, os | |
| from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive | |
def _sanitize_filename(name):
    """Return *name* with characters that are unsafe in a file name replaced.

    Reserved punctuation is swapped for its full-width look-alike so the
    title stays readable, newlines are dropped, and runs of spaces are
    collapsed to a single space.
    """
    # Unicode escapes are used on purpose: a text pipeline that normalises
    # full-width characters to ASCII would otherwise turn these replacements
    # back into no-ops.
    table = str.maketrans({
        '?': '\uff1f',   # full-width question mark
        ':': '\uff1a',   # full-width colon
        '"': '\u201c',   # left double quotation mark
        '/': '\uff0f',   # full-width solidus (would otherwise create a sub-path)
        '\\': '\uff3c',  # full-width reverse solidus
        '*': '\uff0a',   # full-width asterisk
        '<': '\uff1c',   # full-width less-than
        '>': '\uff1e',   # full-width greater-than
        '|': '\uff5c',   # full-width vertical line
        '\n': None,      # drop newlines entirely
    })
    return re.sub(r' {2,}', ' ', name.translate(table))


def download_arxiv_(url_pdf):
    """Download an arXiv paper as a PDF into ``./gpt_log/arxiv/``.

    Args:
        url_pdf: An arXiv ``abs``/``pdf`` URL, or a bare identifier containing
            a dot and no slash (e.g. ``1712.00559``), which is expanded to
            ``https://arxiv.org/abs/<id>``.

    Returns:
        ``(file_path, other_info)`` where ``file_path`` is the saved PDF and
        ``other_info`` is the metadata dict produced by ``get_name``; or
        ``None`` when the input is neither an arXiv URL nor a bare identifier.

    Raises:
        requests.HTTPError: if the PDF request returns an error status, so an
            error page is never stored under a ``.pdf`` name.
    """
    if 'arxiv.org' not in url_pdf:
        if ('.' in url_pdf) and ('/' not in url_pdf):
            # Looks like a bare paper id: build the abs URL and start over.
            new_url = 'https://arxiv.org/abs/' + url_pdf
            print('下载编号:', url_pdf, '自动定位:', new_url)
            return download_arxiv_(new_url)
        print('不能识别的URL!')
        return None

    # Normalise to the PDF URL, then derive the abstract URL from it.
    if 'abs' in url_pdf:
        url_pdf = url_pdf.replace('abs', 'pdf')
        url_pdf = url_pdf + '.pdf'
    url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')

    title, other_info = get_name(_url_=url_abs)

    # Prefix the file name with the year and any recognised venue keyword
    # found in the paper's comment field.
    if '2' in other_info['year']:
        title = other_info['year'] + ' ' + title
    known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
    for k in known_conf:
        if k in other_info['comment']:
            title = k + ' ' + title

    download_dir = './gpt_log/arxiv/'
    os.makedirs(download_dir, exist_ok=True)
    title_str = _sanitize_filename(title)
    file_path = download_dir + title_str

    print('下载中')
    proxies, = get_conf('proxies')
    r = requests.get(url_pdf, proxies=proxies)
    # Fail loudly on 4xx/5xx instead of writing the error body to disk.
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)
    print('下载完成')

    return file_path, other_info
def get_name(_url_):
    """Fetch an arXiv abstract page and extract its title and metadata.

    Args:
        _url_: URL of the abstract page to request.

    Returns:
        ``(file_name, other_details)``. ``file_name`` is the text of the
        page's ``<title>`` element with ``'.pdf'`` appended.
        ``other_details`` always contains the keys ``'year'``, ``'authors'``
        and ``'comment'`` (each an empty string when it could not be
        extracted) and contains ``'abstract'`` only when it was found.

    Each metadata field is extracted independently and on a best-effort
    basis: a failure prints a message and does not affect the other fields.
    """
    from bs4 import BeautifulSoup

    print('正在获取文献名!')
    print(_url_)

    proxies, = get_conf('proxies')
    res = requests.get(_url_, proxies=proxies)
    bs = BeautifulSoup(res.text, 'html.parser')
    other_details = {}

    # Year: first four-digit number in the "dateline" element.
    try:
        dateline = bs.find_all(class_='dateline')[0].text
        other_details['year'] = re.search(r'(\d{4})', dateline, re.M | re.I).group(1)
    except Exception:
        other_details['year'] = ''
        print('年份获取失败')

    # Abstract: kept in its own try so that a missing abstract cannot erase
    # a year that was already extracted (and vice versa).
    try:
        other_details['abstract'] = bs.find_all(class_='abstract mathjax')[0].text
    except Exception:
        print('abstract获取失败')

    # Authors: text following the "Authors:" label.
    try:
        authors = bs.find_all(class_='authors')[0].text
        other_details['authors'] = authors.split('Authors:')[1]
    except Exception:
        other_details['authors'] = ''
        print('authors获取失败')

    # Comment: last piece of the metadata table text that mentions "Comments".
    # NOTE(review): the text is split on a single space, so a matching piece
    # can only be one space-free token — confirm the intended separator.
    try:
        comment = bs.find_all(class_='metatable')[0].text
        real_comment = None
        for item in comment.replace('\n', ' ').split(' '):
            if 'Comments' in item:
                real_comment = item
        other_details['comment'] = real_comment if real_comment is not None else ''
    except Exception:
        other_details['comment'] = ''
        print('comment获取失败')

    # Reuse the already-parsed document instead of parsing the HTML again.
    title_str = bs.find('title').contents[0]
    print('获取成功:', title_str)

    return title_str + '.pdf', other_details
def 下载arxiv论文并翻译摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """Plugin: download an arXiv paper and have the model translate its abstract.

    This is a generator; it yields whatever ``update_ui`` and
    ``request_gpt_model_in_new_thread_with_ui_alive`` yield.

    Args:
        txt: User input, passed to ``download_arxiv_`` as the paper URL or id.
        llm_kwargs: Forwarded unchanged to the model request helper.
        plugin_kwargs: Unused in this function.
        chatbot: Conversation display list; entries are appended and updated.
        history: Incoming history; discarded and rebuilt locally.
        system_prompt: Unused in this function (a fixed prompt is sent).
        web_port: Unused in this function.
    """
    # NOTE(review): CatchException is imported at file level but is not
    # applied to this function — confirm whether a decorator is expected here.
    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要,函数插件作者[binary-husky]。正在提取摘要并下载PDF文档……"
    import os
    import shutil

    # Basic information: what the plugin does and who wrote it.
    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # Probe for the optional dependency; if it is unusable, tell the user how
    # to install it. `except Exception` keeps this best-effort without
    # swallowing KeyboardInterrupt/SystemExit.
    try:
        import bs4  # noqa: F401  (imported only to check availability)
    except Exception:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade beautifulsoup4```。")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Clear the history so the model input does not overflow.
    history = []

    # Fetch the metadata and download the PDF. An unrecognised input makes
    # download_arxiv_ return None, which fails the unpacking below and is
    # reported through the same error path.
    try:
        pdf_path, info = download_arxiv_(txt)
    except Exception:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = "下载pdf文件未成功")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Ask the model to extract and translate the abstract.
    i_say = f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
    i_say_show_user = f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
    msg = '正常'

    # Single request carrying the paper's metadata.
    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
        inputs=i_say,
        inputs_show_user=i_say_show_user,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot, history=[],
        sys_prompt="Your job is to collect information from materials and translate to Chinese。",
    )

    chatbot[-1] = (i_say_show_user, gpt_say)
    history.append(i_say_show_user)
    history.append(gpt_say)
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)  # refresh the UI

    # Copy the PDF up into ./gpt_log/ and delete the original. This is a
    # deliberate copy + remove rather than a move: the copy is a freshly
    # created file, which resets its creation time.
    shutil.copyfile(pdf_path, f'./gpt_log/{os.path.basename(pdf_path)}')
    os.remove(pdf_path)

    # Write the conversation to a results file and report completion.
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res + "\n\nPDF文件也已经下载"))
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)  # refresh the UI