WhyLIM committed on
Commit
30f8152
·
1 Parent(s): 73b530b

Upload 2 files

Browse files
Files changed (2) hide show
  1. apikey.ini +8 -0
  2. app.py +655 -0
apikey.ini ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [OpenAI]
2
+ introduction = the api key does not need quotes ''
3
+ OPENAI_API_KEYS = [sk-your_openai_api_key]
4
+ [Gitee]
5
+ api = your_gitee_api
6
+ owner = your_gitee_name
7
+ repo = your_repo_name
8
+ path = files_name_in_your_repo
app.py ADDED
@@ -0,0 +1,655 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ import datetime
5
+ import arxiv
6
+ import openai, tenacity
7
+ import base64, requests
8
+ import argparse
9
+ import configparser
10
+ import fitz, io, os
11
+ from PIL import Image
12
+ import gradio
13
+ import markdown
14
+
15
class Paper:
    """A paper backed by a PDF file, with its text split into named sections.

    When ``title`` is empty the PDF at ``path`` is opened immediately: the
    title is guessed from the largest fonts and the document is parsed.
    When ``title`` is given, nothing is read until ``parse_pdf`` is called.
    """

    def __init__(self, path, title='', url='', abs='', authers=None, sl=None):
        """Create a paper and, if no title is supplied, parse the PDF right away.

        Args:
            path: PDF location handed to ``fitz.open``.
            title: Known title; an empty string triggers title detection
                and parsing.
            url: Link to the paper.
            abs: Abstract text.
            authers: List of author names (defaults to a new empty list).
            sl: Section names to look for in the PDF (defaults to a new
                empty list).
        """
        self.url = url            # link to the paper
        self.path = path          # PDF path
        # Fresh lists per instance: a literal ``[]`` default would be shared
        # between every Paper created without these arguments.
        self.sl = [] if sl is None else sl
        self.authers = [] if authers is None else authers
        self.section_names = []   # section titles
        self.section_texts = {}   # section contents
        self.abs = abs
        # NOTE(review): "IIX" is not a standard Roman numeral; it is kept so
        # heading detection in get_chapter_names behaves as before.
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''
        if title == '':
            self.pdf = fitz.open(self.path)
            try:
                self.title = self.get_title()
            finally:
                # parse_pdf opens its own handle, so release this one first.
                self.pdf.close()
            self.parse_pdf()
        else:
            self.title = title

    def parse_pdf(self):
        """Open the PDF and fill the text and section attributes.

        Sets ``text_list``, ``all_text``, ``section_page_dict`` and
        ``section_text_dict`` (which also gets a ``"title"`` entry). The
        document is closed afterwards, even if parsing fails.
        """
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
            self.all_text = ' '.join(self.text_list)
            # section name -> page index
            self.section_page_dict = self._get_all_page_index()
            print("section_page_dict", self.section_page_dict)
            # section name -> section text
            self.section_text_dict = self._get_all_page()
            self.section_text_dict.update({"title": self.title})
        finally:
            self.pdf.close()

    def get_image_path(self, image_path=''):
        """Save the largest image of the PDF (by pixel area) as ``image.<ext>``.

        The image is scaled so that its longer side is 480 pixels before it
        is written.

        Args:
            image_path: Directory the image file is written to.

        Returns:
            ``(file_path, ext)`` for the saved image, or ``(None, None)``
            when the PDF contains no image.
        """
        largest_image = None
        largest_ext = None
        largest_area = 0
        with fitz.Document(self.path) as pdf_file:
            for page in pdf_file:
                for image_info in page.get_images():
                    # The first entry of an image record is its xref.
                    extracted = pdf_file.extract_image(image_info[0])
                    candidate = Image.open(io.BytesIO(extracted["image"]))
                    area = candidate.size[0] * candidate.size[1]
                    if area > largest_area:
                        largest_area = area
                        largest_image = candidate
                        # Keep the extension that belongs to THIS image.
                        largest_ext = extracted["ext"]
        if largest_image is None:
            return None, None

        im_path = os.path.join(image_path, f"image.{largest_ext}")
        print("im_path:", im_path)

        max_pix = 480
        width, height = largest_image.size
        if width > height:
            newsize = (max_pix, max(1, int(height * (max_pix / width))))
        else:
            newsize = (max(1, int(width * (max_pix / height))), max_pix)
        # Saving by path lets PIL open and close the file itself.
        largest_image.resize(newsize).save(im_path)
        return im_path, largest_ext

    def get_chapter_names(self,):
        """Return lines of the PDF text that look like numbered headings.

        A line qualifies when it contains a dot, splits into 2-4 parts on
        spaces and on dots, and the text before the first dot is one of
        ``roman_num`` or ``digit_num``.
        """
        with fitz.open(self.path) as doc:
            all_text = ''.join(page.get_text() for page in doc)
        chapter_names = []
        for line in all_text.split('\n'):
            if '.' not in line:
                continue
            point_parts = line.split('.')
            space_parts = line.split(' ')
            if not 1 < len(space_parts) < 5:
                continue
            if 1 < len(point_parts) < 5 and (
                    point_parts[0] in self.roman_num or point_parts[0] in self.digit_num):
                print("line:", line)
                chapter_names.append(line)
        return chapter_names

    def get_title(self):
        """Guess the title from the text set in the two largest font sizes.

        Only the first span of the first line of each text block is looked
        at. Spans within 0.3 of the largest or second-largest size, longer
        than 4 characters and not containing "arXiv", are joined with
        spaces.

        Returns:
            The guessed title, or an empty string if nothing qualifies.
        """
        leading_spans = []  # (font size, text) per text block, in document order
        for page in self.pdf:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # 0 marks a text block
                    continue
                lines = block.get("lines")
                if not lines or not lines[0].get("spans"):
                    continue  # empty block: nothing to measure
                span = lines[0]["spans"][0]
                leading_spans.append((span["size"], span["text"]))

        # The leading 0 guarantees a "second largest" entry exists.
        font_sizes = sorted([0] + [size for size, _ in leading_spans])
        print("max_font_sizes", font_sizes[-10:])
        if not leading_spans:
            return ''
        largest, second_largest = font_sizes[-1], font_sizes[-2]
        title_parts = [
            text for size, text in leading_spans
            if (abs(size - largest) < 0.3 or abs(size - second_largest) < 0.3)
            and len(text) > 4 and "arXiv" not in text
        ]
        return ' '.join(title_parts).replace('\n', ' ')

    def _get_all_page_index(self):
        """Map each section name from ``self.sl`` to a page index.

        "Abstract" matches anywhere in the page text; other names must be
        followed by a newline, either as given or upper-cased. If a name
        matches on several pages, the last matching page is recorded.
        """
        section_page_dict = {}
        for page_index, page in enumerate(self.pdf):
            cur_text = page.get_text()
            for section_name in self.sl:
                if section_name == "Abstract" and section_name in cur_text:
                    section_page_dict[section_name] = page_index
                elif (section_name + '\n' in cur_text
                      or section_name.upper() + '\n' in cur_text):
                    section_page_dict[section_name] = page_index
        return section_page_dict

    @staticmethod
    def _find_heading(page_text, name, start=0):
        """Return the index of ``name`` (or its upper-case form), or -1."""
        index = page_text.find(name, start)
        if index == -1:
            index = page_text.find(name.upper(), start)
        return index

    def _get_all_page(self):
        """Collect the text of every section except the first one found.

        A section runs from its heading on its start page up to the heading
        of the next section in ``section_page_dict`` (or to the end of the
        document for the last section). Hyphenated line breaks are joined
        and remaining newlines become spaces.

        Returns:
            dict: section name -> section text.
        """
        section_dict = {}
        text_list = [page.get_text() for page in self.pdf]
        section_names = list(self.section_page_dict)
        last_index = len(section_names) - 1
        for sec_index, sec_name in enumerate(section_names):
            start_page = self.section_page_dict[sec_name]
            print(sec_index, sec_name, start_page)
            if sec_index <= 0:
                continue  # the first section found is deliberately skipped
            if sec_index < last_index:
                next_sec = section_names[sec_index + 1]
                end_page = self.section_page_dict[next_sec]
            else:
                next_sec = None
                end_page = len(text_list)
            print("start_page, end_page:", start_page, end_page)

            chunks = []
            # Include end_page itself: the section continues there until the
            # next heading. (Stopping before it would drop that text.)
            for page_i in range(start_page, min(end_page + 1, len(text_list))):
                page_text = text_list[page_i]
                begin = 0
                search_from = 0
                if page_i == start_page:
                    found = self._find_heading(page_text, sec_name)
                    if found != -1:
                        begin = found
                        # Look for the next heading after this one.
                        search_from = found + len(sec_name)
                stop = len(page_text)
                if next_sec is not None and page_i == end_page:
                    found = self._find_heading(page_text, next_sec, search_from)
                    if found != -1:
                        stop = found
                chunks.append(page_text[begin:stop])
            section_dict[sec_name] = ''.join(chunks).replace('-\n', '').replace('\n', ' ')
        return section_dict
263
+
264
+ # 定义Reader类
265
class Reader:
    """Searches arXiv, downloads papers and summarizes them with OpenAI chat.

    API keys are read from ``apikey.ini`` in the working directory and used
    in rotation, one per chat request.
    """

    def __init__(self, key_word='', query='', filter_keys='',
                 root_path='./',
                 gitee_key='',
                 sort=arxiv.SortCriterion.SubmittedDate, user_name='defualt', language='cn'):
        """Store the search settings and load the API keys.

        Args:
            key_word: Research field inserted into the system prompts.
            query: arXiv search query.
            filter_keys: Space-separated keywords that must all appear in an
                abstract for the paper to be kept.
            root_path: Prefix of the download directory.
            gitee_key: Accepted for compatibility; the Gitee token actually
                used comes from ``apikey.ini`` and only when ``save_image``
                is enabled.
            sort: arXiv sort criterion.
            user_name: Name of the reader.
            language: Language selected by the reader.
        """
        self.user_name = user_name
        self.key_word = key_word
        self.query = query
        self.sort = sort
        self.language = language
        self.filter_keys = filter_keys
        self.root_path = root_path
        self.config = configparser.ConfigParser()
        self.config.read('apikey.ini')
        # The value looks like "[key1, key2]": drop the brackets and any
        # single quotes, then split on commas.
        raw_keys = self.config.get('OpenAI', 'OPENAI_API_KEYS')[1:-1].replace('\'', '').split(',')
        self.chat_api_list = [api.strip() for api in raw_keys if len(api) > 5]
        self.cur_api = 0
        self.file_format = 'md'  # or 'txt'; must be 'md' when images are embedded
        self.save_image = False
        if self.save_image:
            self.gitee_key = self.config.get('Gitee', 'api')
        else:
            self.gitee_key = ''

    def _next_api_key(self):
        """Return the current API key and advance the rotation.

        Raises:
            ValueError: If no API key was loaded from ``apikey.ini``.
        """
        if not self.chat_api_list:
            raise ValueError("no OpenAI API key configured in apikey.ini")
        key_count = len(self.chat_api_list)
        key = self.chat_api_list[self.cur_api % key_count]
        # Wrap only after the LAST key has been used (the previous
        # ``>= len - 1`` test skipped it).
        self.cur_api = (self.cur_api + 1) % key_count
        return key

    def _ask_chat(self, messages):
        """Send ``messages`` to gpt-3.5-turbo and return the joined replies."""
        openai.api_key = self._next_api_key()
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        return ''.join(choice.message.content for choice in response.choices)

    def get_arxiv(self, max_results=30):
        """Build the arXiv search for ``self.query``, newest first."""
        search = arxiv.Search(query=self.query,
                              max_results=max_results,
                              sort_by=self.sort,
                              sort_order=arxiv.SortOrder.Descending,
                              )
        return search

    def filter_arxiv(self, max_results=30):
        """Return search results whose abstract contains every filter keyword.

        Matching is case-insensitive. Results are fetched once and reused
        for both the listing and the filtering.
        """
        search = self.get_arxiv(max_results=max_results)
        all_results = list(search.results())
        print("all search:")
        for index, result in enumerate(all_results):
            print(index, result.title, result.updated)

        print("filter_keys:", self.filter_keys)
        keys = [f_key.lower() for f_key in self.filter_keys.split(" ")]
        filter_results = []
        for result in all_results:
            abs_text = result.summary.replace('-\n', '-').replace('\n', ' ').lower()
            if all(f_key in abs_text for f_key in keys):
                filter_results.append(result)
        print("filter_results:", len(filter_results))
        print("filter_papers:")
        for index, result in enumerate(filter_results):
            print(index, result.title, result.updated)
        return filter_results

    def validateTitle(self, title):
        """Replace characters that are invalid in file names with ``_``."""
        rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
        new_title = re.sub(rstr, "_", title)
        return new_title

    def download_pdf(self, filter_results):
        """Download and parse every result, returning the parsed papers.

        Papers that fail to download or parse are reported and skipped.
        """
        date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
        query_part = self.query.replace('au: ', '').replace('title: ', '').replace('ti: ', '').replace(':', ' ')[:25]
        path = self.root_path + 'pdf_files/' + query_part + '-' + date_str
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as e:
            # Best effort: each download below reports its own failure.
            print("makedirs_error:", e)
        print("All_paper:", len(filter_results))
        paper_list = []
        for result in filter_results:
            try:
                title_str = self.validateTitle(result.title)
                pdf_name = title_str+'.pdf'
                self.try_download_pdf(result, path, pdf_name)
                paper_path = os.path.join(path, pdf_name)
                print("paper_path:", paper_path)
                paper = Paper(path=paper_path,
                              url=result.entry_id,
                              title=result.title,
                              abs=result.summary.replace('-\n', '-').replace('\n', ' '),
                              authers=[str(aut) for aut in result.authors],
                              )
                # Download finished; parse the PDF.
                paper.parse_pdf()
                paper_list.append(paper)
            except Exception as e:
                print("download_error:", e)
        return paper_list

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def try_download_pdf(self, result, path, pdf_name):
        """Download one result as ``pdf_name`` into ``path`` (up to 5 attempts)."""
        result.download_pdf(path, filename=pdf_name)

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def upload_gitee(self, image_path, image_name='', ext='png'):
        """Upload an image file to the Gitee repository from ``apikey.ini``.

        Returns:
            The ``download_url`` from the response when present, otherwise
            the API URL the file was posted to.
        """
        with open(image_path, 'rb') as f:
            base64_content = base64.b64encode(f.read()).decode()

        date_str = str(datetime.datetime.now())[:19].replace(':', '-').replace(' ', '-') + '.' + ext
        path = image_name+ '-' +date_str

        owner = self.config.get('Gitee', 'owner')
        repo = self.config.get('Gitee', 'repo')
        folder = self.config.get('Gitee', 'path')
        payload = {
            "access_token": self.gitee_key,
            "owner": owner,
            "repo": repo,
            "path": folder,
            "content": base64_content,
            "message": "upload image"
        }
        # Owner, repo and folder come from the [Gitee] section of apikey.ini.
        url = 'https://gitee.com/api/v5/repos/'+owner+'/'+repo+'/contents/'+folder+'/'+path
        rep = requests.post(url, json=payload).json()
        print("rep:", rep)
        if 'content' in rep.keys():
            image_url = rep['content']['download_url']
        else:
            image_url = url

        return image_url

    def summary_with_chat(self, paper_list):
        """Summarize each paper in three chat steps and return HTML.

        Step 1 summarizes title, URL, abstract and the first parsed
        section; step 2 summarizes the method section if one is found;
        step 3 draws a conclusion. The collected markdown is converted to
        HTML.

        NOTE(review): the original indentation was lost; the final return
        is placed after the loop so that every paper is summarized.
        """
        max_token = 2500 * 4  # character budget for the text sent per request
        htmls = []
        for paper in paper_list:
            # Step 1: summarize from title, abstract and introduction.
            text = ''
            text += 'Title:' + paper.title
            text += 'Url:' + paper.url
            text += 'Abstrat:' + paper.abs
            text += list(paper.section_text_dict.values())[0]
            text = text[:max_token]
            chat_summary_text = self.chat_summary(text=text)
            htmls.append(chat_summary_text)

            # Only extract the image when it will actually be uploaded, so a
            # problematic image cannot break a summary that never uses it.
            if self.gitee_key != '':
                first_image, ext = paper.get_image_path()
                if first_image is not None:
                    image_title = self.validateTitle(paper.title)
                    image_url = self.upload_gitee(image_path=first_image, image_name=image_title, ext=ext)
                    htmls.append("\n")
                    htmls.append("![Fig]("+image_url+")")
                    htmls.append("\n")

            # Step 2: summarize the method section.
            # TODO: method sections named after the algorithm are missed by
            # this keyword match.
            method_key = ''
            for parse_key in paper.section_text_dict.keys():
                if 'method' in parse_key.lower() or 'approach' in parse_key.lower():
                    method_key = parse_key
                    break

            if method_key != '':
                summary_text = "<summary>" + chat_summary_text
                method_text = paper.section_text_dict[method_key]
                text = summary_text + "\n <Methods>:\n" + method_text
                text = text[:max_token]
                chat_method_text = self.chat_method(text=text)
                htmls.append(chat_method_text)
            else:
                chat_method_text = ''
            htmls.append("\n")

            # Step 3: overall conclusion.
            conclusion_key = ''
            for parse_key in paper.section_text_dict.keys():
                if 'conclu' in parse_key.lower():
                    conclusion_key = parse_key
                    break

            summary_text = "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
            if conclusion_key != '':
                conclusion_text = paper.section_text_dict[conclusion_key]
                text = summary_text + "\n <Conclusion>:\n" + conclusion_text
            else:
                text = summary_text
            text = text[:max_token]
            chat_conclusion_text = self.chat_conclusion(text=text)
            htmls.append(chat_conclusion_text)
            htmls.append("\n")

        md_text = "\n".join(htmls)
        return markdown.markdown(md_text)

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_conclusion(self, text):
        """Ask the model for the significance and strengths/weaknesses summary."""
        # TODO: the prompt could be written in English to use fewer tokens.
        messages = [
            {"role": "system", "content": "你是一个["+self.key_word+"]领域的审稿人,你需要严格评审这篇文章"},  # model role
            {"role": "assistant", "content": "这是一篇英文文献的<summary>和<conclusion>部分内容,其中<summary>你已经总结好了,但是<conclusion>部分,我需要你帮忙归纳下面问题:"+text},  # background
            {"role": "user", "content": """
                 8. 做出如下总结:
                    - (1):这篇工作的意义如何?
                    - (2):从创新点、性能、工作量这三个维度,总结这篇文章的优点和缺点。
                    .......
                 按照后面的格式输出:
                 8. Conclusion:
                    - (1):xxx;
                    - (2):创新点: xxx; 性能: xxx; 工作量: xxx;

                 务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,.......代表按照实际需求填写,如果没有可以不用写.
                 """},
        ]
        result = self._ask_chat(messages)
        print("conclusion_result:\n", result)
        return result

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_method(self, text):
        """Ask the model for a step-by-step description of the method."""
        messages = [
            {"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"},  # model role
            {"role": "assistant", "content": "这是一篇英文文献的<summary>和<Method>部分内容,其中<summary>你已经总结好了,但是<Methods>部分,我需要你帮忙阅读并归纳下面问题:"+text},  # background
            {"role": "user", "content": """
                 7. 详细描述这篇文章的方法思路。比如说它的步骤是:
                    - (1):...
                    - (2):...
                    - (3):...
                    - .......
                 按照后面的格式输出:
                 7. Methods:
                    - (1):xxx;
                    - (2):xxx;
                    - (3):xxx;
                    .......

                 务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行,.......代表按照实际需求填写,如果没有可以不用写.
                 """},
        ]
        result = self._ask_chat(messages)
        print("method_result:\n", result)
        return result

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_summary(self, text):
        """Ask the model for title, authors, keywords, links and a summary."""
        messages = [
            {"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"},  # model role
            {"role": "assistant", "content": "这是一篇英文文献的标题,作者,链接,Abstract和Introduction部分内容,我需要你帮忙阅读并归纳下面问题:"+text},  # background
            {"role": "user", "content": """
                 1. 标记出这篇文献的标题(加上中文翻译)
                 2. 列举所有的作者姓名 (使用英文)
                 3. 标记第一作者的单位(只输出中文翻译)
                 4. 标记出这篇文章的关键词(使用英文)
                 5. 论文链接,Github代码链接(如果有的话,没有的话请填写Github:None)
                 6. 按照下面四个点进行总结:
                    - (1):这篇文章的研究背景是什么?
                    - (2):过去的方法有哪些?它们存在什么问题?本文和过去的研究有哪些本质的区别?Is the approach well motivated?
                    - (3):本文提出的研究方法是什么?
                    - (4):本文方法在什么任务上,取得了什么性能?性能能否支持他们的目标?
                 按照后面的格式输出:
                 1. Title: xxx
                 2. Authors: xxx
                 3. Affiliation: xxx
                 4. Keywords: xxx
                 5. Urls: xxx or xxx , xxx
                 6. Summary:
                    - (1):xxx;
                    - (2):xxx;
                    - (3):xxx;
                    - (4):xxx.

                 务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要有太多重复的信息,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行.
                 """},
        ]
        result = self._ask_chat(messages)
        print("summary_result:\n", result)
        return result

    def export_to_markdown(self, text, file_name, mode='w'):
        """Write ``text`` unchanged to ``file_name`` as UTF-8."""
        with open(file_name, mode, encoding="utf-8") as f:
            f.write(text)

    def show_info(self):
        """Print the reader's keyword, query and sort settings."""
        print(f"Key word: {self.key_word}")
        print(f"Query: {self.query}")
        print(f"Sort: {self.sort}")
627
def upload_pdf(text, file):
    """Summarize an uploaded PDF using a comma-separated list of section names.

    Args:
        text: Section names separated by commas. Surrounding whitespace and
            quote characters are removed from each name, and empty names
            are dropped.
        file: Uploaded file object; its ``name`` must end in ``.pdf``.

    Returns:
        The HTML summary, or an error message when an input is missing or
        the file is not a PDF.
    """
    # Both inputs are required.
    if not text or not file:
        return "两个输入都不能为空,请输入字符并上传 PDF 文件!"
    # Only PDF files are accepted.
    if file.name.split(".")[-1].lower() != "pdf":
        return '请勿上传非 PDF 文件!'

    # Names such as "'Abstract" or " Introduction" would never match the
    # headings in the PDF, so normalize each entry.
    section_list = []
    for raw_name in text.split(','):
        name = raw_name.strip().strip("'\"").strip()
        if name:
            section_list.append(name)

    paper_list = [Paper(path=file, sl=section_list)]
    reader = Reader()
    sum_info = reader.summary_with_chat(paper_list=paper_list)
    return sum_info
641
+
642
# Title shown at the top of the Gradio page.
title = "ChatPaper"
# HTML description shown below the title.
description = "<div align='center'>帮助您快速阅读论文</div>"
# Inputs: the comma-separated section names and the PDF upload.
# The default list must not be wrapped in quote characters: they would become
# part of the first and last section names.
ip = [
    gradio.inputs.Textbox(label="请输入论文大标题索引,(用【,】隔开)", default="Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References"),
    gradio.inputs.File(label="上传论文(必须为PDF)")
]

interface = gradio.Interface(fn=upload_pdf, inputs=ip, outputs="html", title=title, description=description)

# Start the Gradio application.
interface.launch()