3v324v23 committed
Commit a51cfbc
1 Parent(s): d10fec8

New arxiv paper plugin

crazy_functions/下载arxiv论文翻译摘要.py ADDED
@@ -0,0 +1,187 @@
+from predict import predict_no_ui
+from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, get_conf
+import re, requests, unicodedata, os
+
+def download_arxiv_(url_pdf):
+    # Download an arXiv PDF given a bare paper ID or an abs/pdf URL;
+    # returns (local file path, metadata dict scraped from the abstract page).
+    if 'arxiv.org' not in url_pdf:
+        if ('.' in url_pdf) and ('/' not in url_pdf):
+            # Looks like a bare arXiv ID; build the abs URL and retry
+            new_url = 'https://arxiv.org/abs/'+url_pdf
+            print('下载编号:', url_pdf, '自动定位:', new_url)
+            # download_arxiv_(new_url)
+            return download_arxiv_(new_url)
+        else:
+            print('不能识别的URL!')
+            return None
+    if 'abs' in url_pdf:
+        url_pdf = url_pdf.replace('abs', 'pdf')
+        url_pdf = url_pdf + '.pdf'
+
+    url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')
+    title, other_info = get_name(_url_=url_abs)
+
+    paper_id = title.split()[0]  # e.g. '[1712.00559]'
+    if '2' in other_info['year']:
+        title = other_info['year'] + ' ' + title
+
+    known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
+    for k in known_conf:
+        if k in other_info['comment']:
+            title = k + ' ' + title
+
+    download_dir = './gpt_log/arxiv/'
+    os.makedirs(download_dir, exist_ok=True)
+
+    # Replace characters that are illegal or awkward in file names
+    title_str = title.replace('?', '？')\
+        .replace(':', '：')\
+        .replace('\"', '“')\
+        .replace('\n', '')\
+        .replace('  ', ' ')\
+        .replace('  ', ' ')
+
+    requests_pdf_url = url_pdf
+    file_path = download_dir+title_str
+    # if os.path.exists(file_path):
+    #     print('返回缓存文件')
+    #     return './gpt_log/arxiv/'+title_str
+
+    print('下载中')
+    proxies, = get_conf('proxies')
+    r = requests.get(requests_pdf_url, proxies=proxies)
+    with open(file_path, 'wb+') as f:
+        f.write(r.content)
+    print('下载完成')
+
+    # print('输出下载命令:','aria2c -o \"%s\" %s'%(title_str,url_pdf))
+    # subprocess.call('aria2c --all-proxy=\"172.18.116.150:11084\" -o \"%s\" %s'%(download_dir+title_str,url_pdf), shell=True)
+
+    # Build a bib-style filename string (currently unused)
+    x = "%s %s %s.bib" % (paper_id, other_info['year'], other_info['authors'])
+    x = x.replace('?', '？')\
+        .replace(':', '：')\
+        .replace('\"', '“')\
+        .replace('\n', '')\
+        .replace('  ', ' ')\
+        .replace('  ', ' ')
+    return './gpt_log/arxiv/'+title_str, other_info
+
+
+def get_name(_url_):
+    # Scrape the arXiv abstract page for the paper title and metadata
+    # (year, abstract, authors, comment).
+    import os
+    from bs4 import BeautifulSoup
+    print('正在获取文献名!')
+    print(_url_)
+
+    # arxiv_recall = {}
+    # if os.path.exists('./arxiv_recall.pkl'):
+    #     with open('./arxiv_recall.pkl', 'rb') as f:
+    #         arxiv_recall = pickle.load(f)
+
+    # if _url_ in arxiv_recall:
+    #     print('在缓存中')
+    #     return arxiv_recall[_url_]
+
+    proxies, = get_conf('proxies')
+    res = requests.get(_url_, proxies=proxies)
+
+    bs = BeautifulSoup(res.text, 'html.parser')
+    other_details = {}
+
+    # get year and abstract
+    try:
+        year = bs.find_all(class_='dateline')[0].text
+        year = re.search(r'(\d{4})', year, re.M | re.I).group(1)
+        other_details['year'] = year
+        abstract = bs.find_all(class_='abstract mathjax')[0].text
+        other_details['abstract'] = abstract
+    except:
+        other_details['year'] = ''
+        print('年份获取失败')
+
+    # get authors
+    try:
+        authors = bs.find_all(class_='authors')[0].text
+        authors = authors.split('Authors:')[1]
+        other_details['authors'] = authors
+    except:
+        other_details['authors'] = ''
+        print('authors获取失败')
+
+    # get comment
+    try:
+        comment = bs.find_all(class_='metatable')[0].text
+        real_comment = None
+        for item in comment.replace('\n', ' ').split(' '):
+            if 'Comments' in item:
+                real_comment = item
+        if real_comment is not None:
+            other_details['comment'] = real_comment
+        else:
+            other_details['comment'] = ''
+    except:
+        other_details['comment'] = ''
+        print('Comments获取失败')
+
+    title_str = BeautifulSoup(
+        res.text, 'html.parser').find('title').contents[0]
+    print('获取成功:', title_str)
+    # arxiv_recall[_url_] = (title_str+'.pdf', other_details)
+    # with open('./arxiv_recall.pkl', 'wb') as f:
+    #     pickle.dump(arxiv_recall, f)
+
+    return title_str+'.pdf', other_details
+
+
+
+@CatchException
+def 下载arxiv论文并翻译摘要(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
+
+    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要,作者 binary-husky。正在提取摘要并下载PDF文档……"
+    import glob
+    import os
+
+    # Basic info: what the plugin does and who contributed it
+    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
+    yield chatbot, history, '正常'
+
+    # Try to import the extra dependencies; if they are missing, suggest how to install them
+    try:
+        import pdfminer, bs4
+    except:
+        report_execption(chatbot, history,
+                         a=f"解析项目: {txt}",
+                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
+        yield chatbot, history, '正常'
+        return
+
+    # Clear the history to avoid overflowing the model input
+    history = []
+
+    # Extract the abstract and download the PDF
+    try:
+        pdf_path, info = download_arxiv_(txt)
+    except:
+        report_execption(chatbot, history,
+                         a=f"解析项目: {txt}",
+                         b=f"下载pdf文件未成功")
+        yield chatbot, history, '正常'
+        return
+
+    # Translate the abstract, etc.
+    i_say = f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
+    i_say_show_user = f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
+    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
+    yield chatbot, history, '正常'
+    msg = '正常'
+    # ** gpt request **
+    gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[])  # with a timeout countdown
+    chatbot[-1] = (i_say_show_user, gpt_say)
+    history.append(i_say_show_user); history.append(gpt_say)
+    yield chatbot, history, msg
+    # Write the results to a file
+    import shutil
+    # Reset the file's creation time by copying to *.autodownload.pdf and deleting the original
+    shutil.copyfile(pdf_path, pdf_path.replace('.pdf', '.autodownload.pdf')); os.remove(pdf_path)
+    res = write_results_to_file(history)
+    chatbot.append(("完成了吗?", res))
+    yield chatbot, history, msg
+
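
A minimal standalone sketch of how the downloader in this commit could be exercised outside the plugin framework (assumptions: the project's toolbox config provides a 'proxies' entry, arxiv.org is reachable, and the arXiv ID below is only an example):

    # Hypothetical usage, not part of the committed file
    pdf_path, info = download_arxiv_('1706.03762')   # bare ID; abs/pdf URLs are also accepted
    print('PDF saved to:', pdf_path)
    print('Year:', info['year'], 'Authors:', info['authors'])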