ShiwenNi committed
Commit: 8ddfa3b
1 Parent(s): d3b3756

Upload 2 files

Files changed (2):
  1. chat_reviewer.py +183 -0
  2. get_paper_from_pdf.py +182 -0
chat_reviewer.py ADDED
@@ -0,0 +1,183 @@
+ import numpy as np
+ import os
+ import re
+ import datetime
+ import time
+ import openai, tenacity
+ import argparse
+ import configparser
+ import json
+ import tiktoken
+ from get_paper_from_pdf import Paper
+
+ # Define the Reviewer class
+ class Reviewer:
+     # Initialization: set up the attributes
+     def __init__(self, args=None):
+         if args.language == 'en':
+             self.language = 'English'
+         elif args.language == 'zh':
+             self.language = 'Chinese'
+         else:
+             self.language = 'Chinese'
+         # Create a ConfigParser object
+         self.config = configparser.ConfigParser()
+         # Read the configuration file
+         self.config.read('apikey.ini')
+         # Fetch the value stored under the given key
+         self.chat_api_list = self.config.get('OpenAI', 'OPENAI_API_KEYS')[1:-1].replace('\'', '').split(',')
+         self.chat_api_list = [api.strip() for api in self.chat_api_list if len(api) > 5]
+         self.cur_api = 0
+         self.file_format = args.file_format
+         self.max_token_num = 4096
+         self.encoding = tiktoken.get_encoding("gpt2")
+
+     def validateTitle(self, title):
+         # Sanitize the paper title so it can be used as a file name
+         rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
+         new_title = re.sub(rstr, "_", title)  # replace illegal characters with underscores
+         return new_title
+
+
+     def review_by_chatgpt(self, paper_list):
+         htmls = []
+         for paper_index, paper in enumerate(paper_list):
+             sections_of_interest = self.stage_1(paper)
+             # extract the essential parts of the paper
+             text = ''
+             text += 'Title:' + paper.title + '. '
+             text += 'Abstract: ' + paper.section_texts['Abstract']
+             intro_title = next((item for item in paper.section_names if 'ntroduction' in item), None)
+             if intro_title is not None:
+                 text += 'Introduction: ' + paper.section_texts[intro_title]
+             # Similar for conclusion section
+             conclusion_title = next((item for item in paper.section_names if 'onclusion' in item), None)
+             if conclusion_title is not None:
+                 text += 'Conclusion: ' + paper.section_texts[conclusion_title]
+             for heading in sections_of_interest:
+                 if heading in paper.section_names:
+                     text += heading + ': ' + paper.section_texts[heading]
+             chat_review_text = self.chat_review(text=text)
+             htmls.append('## Paper:' + str(paper_index+1))
+             htmls.append('\n\n\n')
+             htmls.append(chat_review_text)
+
+             # Save the review to a file
+             date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
+             try:
+                 export_path = os.path.join('./', 'output_file')
+                 os.makedirs(export_path)
+             except:
+                 pass
+             mode = 'w' if paper_index == 0 else 'a'
+             file_name = os.path.join(export_path, date_str+'-'+self.validateTitle(paper.title)+"."+self.file_format)
+             self.export_to_markdown("\n".join(htmls), file_name=file_name, mode=mode)
+             htmls = []
+
+
+     def stage_1(self, paper):
+         htmls = []
+         text = ''
+         text += 'Title: ' + paper.title + '. '
+         text += 'Abstract: ' + paper.section_texts['Abstract']
+         openai.api_key = self.chat_api_list[self.cur_api]
+         self.cur_api += 1
+         self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
+         messages = [
+             {"role": "system",
+              "content": f"You are a professional reviewer in the field of {args.research_fields}. "
+                         f"I will give you a paper. You need to review this paper and discuss the novelty and originality of ideas, correctness, clarity, the significance of results, potential impact and quality of the presentation. "
+                         f"Due to the length limitations, I am only allowed to provide you the abstract, introduction, conclusion and at most two sections of this paper."
+                         f"Now I will give you the title and abstract and the headings of potential sections. "
+                         f"You need to reply at most two headings. Then I will further provide you the full information, includes aforementioned sections and at most two sections you called for.\n\n"
+                         f"Title: {paper.title}\n\n"
+                         f"Abstract: {paper.section_texts['Abstract']}\n\n"
+                         f"Potential Sections: {paper.section_names[2:-1]}\n\n"
+                         f"Follow the following format to output your choice of sections:"
+                         f"{{chosen section 1}}, {{chosen section 2}}\n\n"},
+             {"role": "user", "content": text},
+         ]
+         response = openai.ChatCompletion.create(
+             model="gpt-3.5-turbo",
+             messages=messages,
+         )
+         result = ''
+         for choice in response.choices:
+             result += choice.message.content
+         print(result)
+         return result.split(',')
+
+     @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
+                     stop=tenacity.stop_after_attempt(5),
+                     reraise=True)
+     def chat_review(self, text):
+         openai.api_key = self.chat_api_list[self.cur_api]
+         self.cur_api += 1
+         self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
+         review_prompt_token = 1000
+         text_token = len(self.encoding.encode(text))
+         input_text_index = int(len(text)*(self.max_token_num-review_prompt_token)/text_token)
+         input_text = "This is the paper for your review:" + text[:input_text_index]
+         with open('ReviewFormat.txt', 'r') as file:  # read the required review format
+             review_format = file.read()
+         messages = [
+             {"role": "system", "content": "You are a professional reviewer in the field of "+args.research_fields+". Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:"+ review_format +" Please answer in {}.".format(self.language)},
+             {"role": "user", "content": input_text},
+         ]
+
+         response = openai.ChatCompletion.create(
+             model="gpt-3.5-turbo",
+             messages=messages,
+         )
+         result = ''
+         for choice in response.choices:
+             result += choice.message.content
+         print("********"*10)
+         print(result)
+         print("********"*10)
+         print("prompt_token_used:", response.usage.prompt_tokens)
+         print("completion_token_used:", response.usage.completion_tokens)
+         print("total_token_used:", response.usage.total_tokens)
+         print("response_time:", response.response_ms/1000.0, 's')
+         return result
+
+     def export_to_markdown(self, text, file_name, mode='w'):
+         # The markdown module could convert the text to HTML here:
+         # html = markdown.markdown(text)
+         # Open the output file and write the review text
+         with open(file_name, mode, encoding="utf-8") as f:
+             f.write(text)
+
+ def main(args):
+
+     reviewer1 = Reviewer(args=args)
+     # Determine whether the input is a single PDF file or a directory
+     paper_list = []
+     if args.paper_path.endswith(".pdf"):
+         paper_list.append(Paper(path=args.paper_path))
+     else:
+         for root, dirs, files in os.walk(args.paper_path):
+             print("root:", root, "dirs:", dirs, 'files:', files)  # current directory path
+             for filename in files:
+                 # Collect every PDF file found under the directory
+                 if filename.endswith(".pdf"):
+                     paper_list.append(Paper(path=os.path.join(root, filename)))
+     print("------------------paper_num: {}------------------".format(len(paper_list)))
+     [print(paper_index, paper_name.path.split('\\')[-1]) for paper_index, paper_name in enumerate(paper_list)]
+     reviewer1.review_by_chatgpt(paper_list=paper_list)
+
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--paper_path", type=str, default='', help="path of papers")
+     parser.add_argument("--file_format", type=str, default='txt', help="output file format")
+     parser.add_argument("--research_fields", type=str, default='computer science, artificial intelligence and reinforcement learning', help="the research fields of paper")
+     parser.add_argument("--language", type=str, default='en', help="output language, en or zh")
+
+     args = parser.parse_args()
+     start_time = time.time()
+     main(args=args)
+     print("review time:", time.time() - start_time)
+
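Note that chat_reviewer.py reads its API keys from an apikey.ini file and its review template from ReviewFormat.txt; neither file is part of this commit. The snippet below is a minimal sketch of what Reviewer.__init__ appears to expect from apikey.ini, reproducing its own parsing logic on a placeholder value (the sk-... keys are made up):

    # apikey.ini is assumed to hold, under an [OpenAI] section, a bracketed,
    # quoted, comma-separated list, e.g.:
    #
    #   [OpenAI]
    #   OPENAI_API_KEYS = ['sk-xxxxxxxx', 'sk-yyyyyyyy']
    #
    # Reviewer.__init__ strips the brackets and quotes and splits on commas:
    raw = "['sk-xxxxxxxx', 'sk-yyyyyyyy']"
    chat_api_list = raw[1:-1].replace('\'', '').split(',')
    chat_api_list = [api.strip() for api in chat_api_list if len(api) > 5]
    print(chat_api_list)  # ['sk-xxxxxxxx', 'sk-yyyyyyyy']

With those two files in place, a typical invocation (per the argparse options above) would look something like: python chat_reviewer.py --paper_path ./papers --file_format txt --language en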
get_paper_from_pdf.py ADDED
@@ -0,0 +1,182 @@
+ import fitz, io, os
+ from PIL import Image
+ from collections import Counter
+ import json
+ import re
+
+
+ class Paper:
+     def __init__(self, path, title='', url='', abs='', authors=[]):
+         # Initialize a Paper object from the given PDF path
+         self.url = url            # paper URL
+         self.path = path          # PDF path
+         self.section_names = []   # section titles
+         self.section_texts = {}   # section contents
+         self.abs = abs
+         self.title_page = 0
+         if title == '':
+             self.pdf = fitz.open(self.path)  # the PDF document
+             self.title = self.get_title()
+             self.parse_pdf()
+         else:
+             self.title = title
+         self.authors = authors
+         self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
+         self.digit_num = [str(d+1) for d in range(10)]
+         self.first_image = ''
+
+     def parse_pdf(self):
+         self.pdf = fitz.open(self.path)  # the PDF document
+         self.text_list = [page.get_text() for page in self.pdf]
+         self.all_text = ' '.join(self.text_list)
+         self.extract_section_infomation()
+         self.section_texts.update({"title": self.title})
+         self.section_texts.update({"paper_info": self.get_paper_info()})
+         self.pdf.close()
+
+     def get_paper_info(self):
+         first_page_text = self.pdf[self.title_page].get_text()
+         if "Abstract" in self.section_texts.keys():
+             abstract_text = self.section_texts['Abstract']
+         else:
+             abstract_text = self.abs
+         introduction_text = self.section_texts['Introduction']
+         first_page_text = first_page_text.replace(abstract_text, "").replace(introduction_text, "")
+         return first_page_text
+
+     # Identify chapter names by their font size and return them as a list
+     def get_chapter_names(self,):
+         # Open the PDF file
+         doc = fitz.open(self.path)  # the PDF document
+         text_list = [page.get_text() for page in doc]
+         all_text = ''
+         for text in text_list:
+             all_text += text
+         # Collect the chapter names in a list
+         chapter_names = []
+         for line in all_text.split('\n'):
+             line_list = line.split(' ')
+             if '.' in line:
+                 point_split_list = line.split('.')
+                 space_split_list = line.split(' ')
+                 if 1 < len(space_split_list) < 5:
+                     if 1 < len(point_split_list) < 5 and (point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
+                         # print("line:", line)
+                         chapter_names.append(line)
+
+         return chapter_names
+
+     def get_title(self):
+         doc = self.pdf          # the open PDF document
+         max_font_size = 0       # largest font size seen so far
+         max_string = ""         # text of the span with the largest font size
+         max_font_sizes = [0]
+         for page_index, page in enumerate(doc):  # iterate over pages
+             text = page.get_text("dict")  # text information on the page
+             blocks = text["blocks"]       # list of text blocks
+             for block in blocks:          # iterate over blocks
+                 if block["type"] == 0 and len(block['lines']):  # text block
+                     if len(block["lines"][0]["spans"]):
+                         font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span of the first line
+                         max_font_sizes.append(font_size)
+                         if font_size > max_font_size:  # larger than the current maximum
+                             max_font_size = font_size  # update the maximum
+                             max_string = block["lines"][0]["spans"][0]["text"]  # remember its text
+         max_font_sizes.sort()
+         # print("max_font_sizes", max_font_sizes[-10:])
+         cur_title = ''
+         for page_index, page in enumerate(doc):  # iterate over pages
+             text = page.get_text("dict")  # text information on the page
+             blocks = text["blocks"]       # list of text blocks
+             for block in blocks:          # iterate over blocks
+                 if block["type"] == 0 and len(block['lines']):  # text block
+                     if len(block["lines"][0]["spans"]):
+                         cur_string = block["lines"][0]["spans"][0]["text"]   # text of the first span
+                         font_flags = block["lines"][0]["spans"][0]["flags"]  # font flags of the first span
+                         font_size = block["lines"][0]["spans"][0]["size"]    # font size of the first span
+                         # print(font_size)
+                         if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
+                             # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
+                             if len(cur_string) > 4 and "arXiv" not in cur_string:
+                                 # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
+                                 if cur_title == '':
+                                     cur_title += cur_string
+                                 else:
+                                     cur_title += ' ' + cur_string
+                                 self.title_page = page_index
+                                 # break
+         title = cur_title.replace('\n', ' ')
+         return title
+
+     def extract_section_infomation(self):
+         doc = fitz.open(self.path)
+
+         # Collect every font size that appears in the document
+         font_sizes = []
+         for page in doc:
+             blocks = page.get_text("dict")["blocks"]
+             for block in blocks:
+                 if 'lines' not in block:
+                     continue
+                 lines = block["lines"]
+                 for line in lines:
+                     for span in line["spans"]:
+                         font_sizes.append(span["size"])
+         most_common_size, _ = Counter(font_sizes).most_common(1)[0]
+
+         # Use the most common font size as the threshold for heading fonts
+         threshold = most_common_size * 1
+
+         section_dict = {}
+         last_heading = None
+         subheadings = []
+         heading_font = -1
+         # Walk through every page looking for subheadings
+         found_abstract = False
+         for page in doc:
+             blocks = page.get_text("dict")["blocks"]
+             for block in blocks:
+                 if not found_abstract:
+                     text = json.dumps(block)
+                     if re.search(r"\bAbstract\b", text, re.IGNORECASE):
+                         found_abstract = True
+                 if found_abstract:
+                     if 'lines' not in block:
+                         continue
+                     lines = block["lines"]
+                     for line in lines:
+                         for span in line["spans"]:
+                             if span["size"] > threshold and re.match(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
+                                                                      span["text"].strip()):
+                                 if heading_font == -1:
+                                     heading_font = span["size"]
+                                 elif heading_font != span["size"]:
+                                     continue
+                                 heading = span["text"].strip()
+                                 if "References" in heading:  # everything after the References section is ignored
+                                     self.section_names = subheadings
+                                     self.section_texts = section_dict
+                                     return
+                                 subheadings.append(heading)
+                                 if last_heading is not None:
+                                     section_dict[last_heading] = section_dict[last_heading].strip()
+                                 section_dict[heading] = ""
+                                 last_heading = heading
+                             # otherwise append the current text to the previous subheading
+                             elif last_heading is not None:
+                                 section_dict[last_heading] += " " + span["text"].strip()
+         self.section_names = subheadings
+         self.section_texts = section_dict
+
+ def main():
+     path = r'demo.pdf'
+     paper = Paper(path=path)
+     paper.parse_pdf()
+     # for key, value in paper.section_text_dict.items():
+     #     print(key, value)
+     #     print("*"*40)
+
+
+ if __name__ == '__main__':
+     main()
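get_paper_from_pdf.py can also be used on its own. A minimal sketch follows, assuming demo.pdf is a local paper whose 'Abstract' and 'Introduction' headings are picked up by extract_section_infomation (get_paper_info indexes 'Introduction' directly, so parsing fails without it):

    from get_paper_from_pdf import Paper

    paper = Paper(path='demo.pdf')   # parse_pdf() runs in __init__ when no title is passed
    print(paper.title)                # title recovered from the largest font sizes
    print(paper.section_names)        # headings found before the References section
    print(paper.section_texts['Abstract'][:200])

The resulting section_texts dictionary is what chat_reviewer.py slices into the prompts it sends to gpt-3.5-turbo.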