ShiwenNi committed on
Commit
67a900e
1 Parent(s): 10290d5

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +136 -0
  2. get_paper_from_pdf.py +193 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ import datetime
5
+ import time
6
+ import openai, tenacity
7
+ import argparse
8
+ import configparser
9
+ import json
10
+ import tiktoken
11
+ from get_paper_from_pdf import Paper
12
+ import gradio
13
+
14
# Response: turns reviewer comments into a point-by-point author reply.
class Response:
    """Generate an author response to peer-review comments with the OpenAI chat API.

    Attributes:
        api: The API key string supplied by the user (may hold several
            comma-separated keys).
        comment: The review comments to answer.
        language: Language the model is asked to answer in.
        max_token_num: Token budget used when truncating the input text.
        encoding: tiktoken encoding used to count tokens.
        chat_api_list: API keys that ``chat_response`` rotates through.
        cur_api: Index into ``chat_api_list`` of the key used by the next call.
    """

    # System prompt template; the single ``{}`` is filled with the reply language.
    _SYSTEM_PROMPT = """You are the author, you submitted a paper, and the reviewers gave the review comments.
Please reply with what we have done, not what we will do.
You need to extract questions from the review comments one by one, and then respond point-to-point to the reviewers’ concerns.
Please answer in {}. Follow the format of the output later:
- Response to reviewers
#1 reviewer
Concern #1: xxxx
Author response: xxxxx
Concern #2: xxxx
Author response: xxxxx
...
#2 reviewer
Concern #1: xxxx
Author response: xxxxx
Concern #2: xxxx
Author response: xxxxx
...
#3 reviewer
Concern #1: xxxx
Author response: xxxxx
Concern #2: xxxx
Author response: xxxxx
...

"""

    def __init__(self, api, comment, language):
        """Store the request settings and prepare the tokenizer and key list.

        Args:
            api: OpenAI API key; several keys may be given separated by commas.
            comment: Review comments to respond to.
            language: Language the reply should be written in.
        """
        self.api = api
        self.comment = comment
        self.language = language
        self.max_token_num = 4096
        self.encoding = tiktoken.get_encoding("gpt2")
        # chat_response() reads these two attributes; they were previously
        # never initialised, so every call failed with AttributeError.
        keys = [key.strip() for key in str(api).split(',') if key.strip()]
        self.chat_api_list = keys or [api]
        self.cur_api = 0

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_response(self, text=None):
        """Ask the chat model for a point-by-point reply to the review comments.

        The call is retried up to five times with exponential back-off; the
        last exception is re-raised if every attempt fails.

        Args:
            text: Review comments to send. Defaults to ``self.comment``.

        Returns:
            A ``(reply_text, total_tokens_used)`` tuple.
        """
        if text is None:
            text = self.comment

        # Use the current key, then advance so the next call (or retry) uses
        # the following one; modulo keeps every key in rotation.
        openai.api_key = self.chat_api_list[self.cur_api]
        self.cur_api = (self.cur_api + 1) % len(self.chat_api_list)

        # Reserve room for the system prompt and the model's answer, then cut
        # the input proportionally when it exceeds the remaining budget.
        response_prompt_token = 1000
        token_budget = self.max_token_num - response_prompt_token
        text_token = len(self.encoding.encode(text))
        if text_token > token_budget:
            input_text_index = int(len(text) * token_budget / text_token)
            text = text[:input_text_index]
        input_text = "This is the review comments:" + text

        messages = [
            {"role": "system", "content": self._SYSTEM_PROMPT.format(self.language)},
            {"role": "user", "content": input_text},
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        result = ''.join(choice.message.content for choice in response.choices)
        print("********"*10)
        print(result)
        print("********"*10)
        print("prompt_token_used:", response.usage.prompt_tokens)
        print("completion_token_used:", response.usage.completion_tokens)
        print("total_token_used:", response.usage.total_tokens)
        print("response_time:", response.response_ms/1000.0, 's')
        return result, response.usage.total_tokens
82
+
83
+
84
+
85
def main(api, comment, language):
    """Gradio callback: build an author reply for the given review comments.

    Args:
        api: OpenAI API key entered by the user.
        comment: Review comments to respond to.
        language: Language the reply should be written in.

    Returns:
        A ``(reply, stats)`` pair of strings, one per output box of the
        interface. When the key or the comments are missing, the first string
        is a prompt asking for them and the second is empty.
    """
    start_time = time.time()
    if not api or not comment:
        # The interface declares two outputs, so always hand back a pair.
        return "请输入API-key以及审稿意见!", ""

    responder = Response(api, comment, language)
    # chat_response needs the text to answer; pass the comments explicitly.
    response, total_token_used = responder.chat_response(comment)
    time_used = time.time() - start_time
    output2 = "使用token数:" + str(total_token_used) + "\n花费时间:" + str(round(time_used, 2)) + "秒"
    return response, output2
96
+
97
+
98
########################################################################################################
# Page title
title = "🤖ChatResponse🤖"

# Page description (rendered as Markdown/HTML above the form)
description = '''<div align='left'>
<img align='right' src='http://i.imgtg.com/2023/03/22/94PLN.png' width="250">

<strong>ChatResponse是一款根据审稿人的评论自动生成作者回复的AI助手。用途如下:</strong>其用途为:

⭐️根据输入的审稿意见,ChatResponse会自动提取其中各个审稿人的问题和担忧,并生成点对点的回复。

如果觉得很卡,可以点击右上角的Duplicate this Space,把ChatResponse复制到你自己的Space中!

本项目的[Github](https://github.com/nishiwen1214/ChatResponse),欢迎Star和Fork,也欢迎大佬赞助让本项目快速成长!💗([获取Api Key](https://chatgpt.cn.obiscr.com/blog/posts/2023/How-to-get-api-key/))
</div>
'''

# Build the Gradio interface.
# Inputs use the same component API (gradio.Textbox / gradio.Radio with
# ``value=``) as the outputs below, instead of the deprecated
# ``gradio.inputs`` namespace with ``default=``.
inp = [
    gradio.Textbox(label="请输入你的API-key(sk开头的字符串)",
                   value="",
                   type='password'),
    gradio.Textbox(lines=5,
                   label="请输入要回复的审稿意见",
                   value=""),
    # Label restored from mis-encoded characters ("select output language").
    gradio.Radio(choices=["English", "Chinese"],
                 value="English",
                 label="选择输出语言"),
]

chat_Response_gui = gradio.Interface(
    fn=main,
    inputs=inp,
    outputs=[gradio.Textbox(lines=20, label="回复结果"),
             gradio.Textbox(lines=2, label="资源统计")],
    title=title,
    description=description,
)

# Start server
chat_Response_gui.launch(quiet=True, show_api=False)
get_paper_from_pdf.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz, io, os
2
+ from PIL import Image
3
+ from collections import Counter
4
+ import json
5
+ import re
6
+
7
class Paper:
    """A paper's metadata plus the section text extracted from its PDF.

    Attributes:
        url: Link to the paper.
        path: Path of the PDF file.
        section_names: Section headings found in the PDF, in reading order.
        section_texts: Mapping of heading -> section text (plus a ``"title"``
            entry added by ``parse_pdf``).
        abs: Abstract text passed to the constructor.
        title_page: Index of the last page that contributed to the title.
        title: Paper title (given, or detected from the PDF).
        authors: List of author names.
        roman_num / digit_num: Numbering prefixes accepted by
            ``get_chapter_names``.
        first_image: Initialised to an empty string by the constructor.
    """

    # Title-cased heading pattern used when headings are set in a larger font.
    _HEADING_PATTERN = re.compile(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*")

    def __init__(self, path, title='', url='', abs='', authors=None):
        """Create the paper; when no title is given, read it and the sections from the PDF.

        Args:
            path: Path of the PDF file.
            title: Known title. If empty, the title is detected from the PDF
                and the PDF is parsed immediately.
            url: Link to the paper.
            abs: Abstract text (parameter name kept for compatibility even
                though it shadows the builtin inside this method).
            authors: List of author names; a fresh empty list when omitted.
        """
        self.url = url
        self.path = path
        self.section_names = []
        self.section_texts = {}
        self.abs = abs
        self.title_page = 0
        # A fresh list per instance: a literal [] default would be shared.
        self.authors = [] if authors is None else authors
        # Set before any parsing so every method can rely on them.
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''
        if title == '':
            self.pdf = fitz.open(self.path)
            try:
                self.title = self.get_title()
            finally:
                # parse_pdf() opens its own handle; do not leak this one.
                self.pdf.close()
            self.parse_pdf()
        else:
            self.title = title

    def parse_pdf(self):
        """Read the page texts and section structure from the PDF at ``self.path``.

        Sets ``text_list``, ``all_text``, ``section_names`` and
        ``section_texts`` (including a ``"title"`` entry). ``self.pdf`` is
        left as a closed document.
        """
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
        finally:
            self.pdf.close()
        self.all_text = ' '.join(self.text_list)
        self.extract_section_infomation()
        self.section_texts.update({"title": self.title})

    def get_chapter_names(self, ):
        """Return lines that look like numbered chapter headings.

        A line qualifies when it has 2-4 space-separated words, 2-4
        dot-separated parts, and the text before the first dot is one of
        ``roman_num`` or ``digit_num``.

        Returns:
            List of matching lines in document order.
        """
        doc = fitz.open(self.path)
        try:
            all_text = ''.join(page.get_text() for page in doc)
        finally:
            doc.close()

        chapter_names = []
        for line in all_text.split('\n'):
            if '.' not in line:
                continue
            point_split_list = line.split('.')
            space_split_list = line.split(' ')
            numbered = (point_split_list[0] in self.roman_num
                        or point_split_list[0] in self.digit_num)
            if 1 < len(space_split_list) < 5 and 1 < len(point_split_list) < 5 and numbered:
                chapter_names.append(line)
        return chapter_names

    def get_title(self):
        """Detect the title from the open document ``self.pdf``.

        The first span of each text block is inspected; spans whose font size
        is within 0.3 of the largest or second-largest such size, are longer
        than 4 characters and do not contain "arXiv" are joined with spaces.
        ``self.title_page`` is set to the page index of the last such span.

        Returns:
            The detected title with newlines replaced by spaces ('' if none).
        """
        # (page index, text, font size) of the first span of every text block
        candidates = []
        for page_index, page in enumerate(self.pdf):
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0 or not block['lines']:
                    continue
                spans = block["lines"][0]["spans"]
                if not spans:
                    continue
                candidates.append((page_index, spans[0]["text"], spans[0]["size"]))
        if not candidates:
            return ''

        # The leading 0 guarantees a second entry even with a single candidate.
        sizes = sorted([0] + [size for _, _, size in candidates])
        largest, second_largest = sizes[-1], sizes[-2]

        parts = []
        for page_index, cur_string, font_size in candidates:
            near_top = (abs(font_size - largest) < 0.3
                        or abs(font_size - second_largest) < 0.3)
            if near_top and len(cur_string) > 4 and "arXiv" not in cur_string:
                parts.append(cur_string)
                self.title_page = page_index
        return ' '.join(parts).replace('\n', ' ')

    def extract_section_infomation(self):
        """Split the PDF into sections and store them on the instance.

        Sets ``self.section_names`` (headings in order) and
        ``self.section_texts`` (heading -> text). Both are empty when the PDF
        contains no text spans.
        """
        doc = fitz.open(self.path)
        try:
            subheadings, section_dict = self._scan_sections(doc)
        finally:
            doc.close()
        self.section_names = subheadings
        self.section_texts = section_dict

    def _scan_sections(self, doc):
        """Walk *doc* and return ``(headings, heading -> text)`` up to the references."""
        # Collect every span's font size; the most frequent one is body text.
        font_sizes = []
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", ()):
                    font_sizes.extend(span["size"] for span in line["spans"])
        if not font_sizes:
            return [], {}
        # Anything larger than the most common size is a heading candidate.
        threshold = Counter(font_sizes).most_common(1)[0][0]

        section_dict = {}
        subheadings = []
        last_heading = None
        heading_font = -1
        found_abstract = False
        # The first heading style seen (ALL-CAPS vs larger font) locks out the other.
        upper_heading = False
        font_heading = False

        def open_section(heading):
            """Start section *heading*; return False once the references begin."""
            nonlocal last_heading
            # Case-insensitive so an ALL-CAPS "REFERENCES" heading is caught too.
            if "references" in heading.lower():
                return False
            subheadings.append(heading)
            if last_heading is not None:
                section_dict[last_heading] = section_dict[last_heading].strip()
            section_dict[heading] = ""
            last_heading = heading
            return True

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if not found_abstract:
                    try:
                        text = json.dumps(block)
                    except (TypeError, ValueError):
                        # Blocks that cannot be serialised are skipped.
                        continue
                    if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                        found_abstract = True
                        last_heading = "Abstract"
                        section_dict["Abstract"] = ""
                if not found_abstract or 'lines' not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        span_text = span["text"]
                        # Papers whose headings share the body size but are ALL CAPS
                        if (not font_heading and span_text.isupper()
                                and sum(1 for c in span_text if 'A' <= c <= 'Z') > 4):
                            upper_heading = True
                            if not open_section(span_text.strip()):
                                return subheadings, section_dict
                        # Usual case: headings are recognised by a larger font
                        if (not upper_heading and span["size"] > threshold
                                and self._HEADING_PATTERN.match(span_text.strip())):
                            font_heading = True
                            if heading_font == -1:
                                heading_font = span["size"]
                            elif heading_font != span["size"]:
                                continue
                            if not open_section(span_text.strip()):
                                return subheadings, section_dict
                        # Otherwise the text belongs to the current section
                        elif last_heading is not None:
                            section_dict[last_heading] += " " + span_text.strip()
        return subheadings, section_dict
181
+
182
+
183
def main():
    """Parse ``demo.pdf`` and print every extracted section."""
    path = r'demo.pdf'
    # The constructor already parses the PDF when no title is supplied, so
    # there is no need to call parse_pdf() a second time.
    paper = Paper(path=path)
    for key, value in paper.section_texts.items():
        print(key, value)
        print("*"*40)


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ PyMuPDF==1.21.1
2
+ tiktoken==0.2.0
3
+ tenacity==8.2.2
4
+ pybase64==1.2.3
5
+ Pillow==9.4.0
6
+ openai==0.27.0
7
+ markdown
8
+ gradio==3.20.1