ShiwenNi committed on
Commit
78b3d2e
1 Parent(s): 8ddfa3b

Update get_paper_from_pdf.py

Browse files
Files changed (1) hide show
  1. get_paper_from_pdf.py +73 -61
get_paper_from_pdf.py CHANGED
@@ -5,50 +5,38 @@ import json
5
  import re
6
 
7
 
8
-
9
  class Paper:
10
  def __init__(self, path, title='', url='', abs='', authors=[]):
11
  # 初始化函数,根据pdf路径初始化Paper对象
12
- self.url = url # 文章链接
13
- self.path = path # pdf路径
14
- self.section_names = [] # 段落标题
15
- self.section_texts = {} # 段落内容
16
  self.abs = abs
17
  self.title_page = 0
18
  if title == '':
19
- self.pdf = fitz.open(self.path) # pdf文档
20
  self.title = self.get_title()
21
- self.parse_pdf()
22
  else:
23
  self.title = title
24
  self.authors = authors
25
  self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
26
- self.digit_num = [str(d+1) for d in range(10)]
27
  self.first_image = ''
28
-
29
  def parse_pdf(self):
30
- self.pdf = fitz.open(self.path) # pdf文档
31
  self.text_list = [page.get_text() for page in self.pdf]
32
  self.all_text = ' '.join(self.text_list)
33
  self.extract_section_infomation()
34
  self.section_texts.update({"title": self.title})
35
- self.section_texts.update({"paper_info": self.get_paper_info()})
36
- self.pdf.close()
37
-
38
- def get_paper_info(self):
39
- first_page_text = self.pdf[self.title_page].get_text()
40
- if "Abstract" in self.section_texts.keys():
41
- abstract_text = self.section_texts['Abstract']
42
- else:
43
- abstract_text = self.abs
44
- introduction_text = self.section_texts['Introduction']
45
- first_page_text = first_page_text.replace(abstract_text, "").replace(introduction_text, "")
46
- return first_page_text
47
-
48
  # Identify each chapter name based on font size and return them as a list
49
- def get_chapter_names(self,):
50
  # # 打开一个pdf文件
51
- doc = fitz.open(self.path) # pdf文档
52
  text_list = [page.get_text() for page in doc]
53
  all_text = ''
54
  for text in text_list:
@@ -61,52 +49,53 @@ class Paper:
61
  point_split_list = line.split('.')
62
  space_split_list = line.split(' ')
63
  if 1 < len(space_split_list) < 5:
64
- if 1 < len(point_split_list) < 5 and (point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
 
65
  # print("line:", line)
66
- chapter_names.append(line)
67
-
68
  return chapter_names
69
-
70
  def get_title(self):
71
- doc = self.pdf # 打开pdf文件
72
- max_font_size = 0 # 初始化最大字体大小为0
73
- max_string = "" # 初始化最大字体大小对应的字符串为空
74
  max_font_sizes = [0]
75
- for page_index, page in enumerate(doc): # 遍历每一页
76
- text = page.get_text("dict") # 获取页面上的文本信息
77
- blocks = text["blocks"] # 获取文本块列表
78
- for block in blocks: # 遍历每个文本块
79
- if block["type"] == 0 and len(block['lines']): # 如果是文字类型
80
  if len(block["lines"][0]["spans"]):
81
- font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
82
  max_font_sizes.append(font_size)
83
- if font_size > max_font_size: # 如果字体大小大于当前最大值
84
- max_font_size = font_size # 更新最大值
85
- max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
86
- max_font_sizes.sort()
87
  # print("max_font_sizes", max_font_sizes[-10:])
88
  cur_title = ''
89
- for page_index, page in enumerate(doc): # 遍历每一页
90
- text = page.get_text("dict") # 获取页面上的文本信息
91
- blocks = text["blocks"] # 获取文本块列表
92
- for block in blocks: # 遍历每个文本块
93
- if block["type"] == 0 and len(block['lines']): # 如果是文字类型
94
  if len(block["lines"][0]["spans"]):
95
- cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
96
- font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
97
- font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
98
  # print(font_size)
99
- if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
100
  # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
101
- if len(cur_string) > 4 and "arXiv" not in cur_string:
102
  # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
103
  if cur_title == '':
104
- cur_title += cur_string
105
  else:
106
- cur_title += ' ' + cur_string
107
  self.title_page = page_index
108
  # break
109
- title = cur_title.replace('\n', ' ')
110
  return title
111
 
112
  def extract_section_infomation(self):
@@ -134,21 +123,43 @@ class Paper:
134
  heading_font = -1
135
  # 遍历每一页并查找子标题
136
  found_abstract = False
 
 
137
  for page in doc:
138
  blocks = page.get_text("dict")["blocks"]
139
  for block in blocks:
140
  if not found_abstract:
141
- text = json.dumps(block)
 
 
 
142
  if re.search(r"\bAbstract\b", text, re.IGNORECASE):
143
  found_abstract = True
 
 
144
  if found_abstract:
145
  if 'lines' not in block:
146
  continue
147
  lines = block["lines"]
148
  for line in lines:
149
  for span in line["spans"]:
150
- if span["size"] > threshold and re.match(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
151
- span["text"].strip()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  if heading_font == -1:
153
  heading_font = span["size"]
154
  elif heading_font != span["size"]:
@@ -169,14 +180,15 @@ class Paper:
169
  self.section_names = subheadings
170
  self.section_texts = section_dict
171
 
 
172
  def main():
173
  path = r'demo.pdf'
174
  paper = Paper(path=path)
175
  paper.parse_pdf()
176
  # for key, value in paper.section_text_dict.items():
177
- # print(key, value)
178
- # print("*"*40)
179
-
180
 
181
  if __name__ == '__main__':
182
  main()
 
5
  import re
6
 
7
 
 
8
  class Paper:
9
  def __init__(self, path, title='', url='', abs='', authors=[]):
10
  # 初始化函数,根据pdf路径初始化Paper对象
11
+ self.url = url # 文章链接
12
+ self.path = path # pdf路径
13
+ self.section_names = [] # 段落标题
14
+ self.section_texts = {} # 段落内容
15
  self.abs = abs
16
  self.title_page = 0
17
  if title == '':
18
+ self.pdf = fitz.open(self.path) # pdf文档
19
  self.title = self.get_title()
20
+ self.parse_pdf()
21
  else:
22
  self.title = title
23
  self.authors = authors
24
  self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
25
+ self.digit_num = [str(d + 1) for d in range(10)]
26
  self.first_image = ''
27
+
28
  def parse_pdf(self):
29
+ self.pdf = fitz.open(self.path) # pdf文档
30
  self.text_list = [page.get_text() for page in self.pdf]
31
  self.all_text = ' '.join(self.text_list)
32
  self.extract_section_infomation()
33
  self.section_texts.update({"title": self.title})
34
+ self.pdf.close()
35
+
 
 
 
 
 
 
 
 
 
 
 
36
  # Identify each chapter name based on font size and return them as a list
37
+ def get_chapter_names(self, ):
38
  # # 打开一个pdf文件
39
+ doc = fitz.open(self.path) # pdf文档
40
  text_list = [page.get_text() for page in doc]
41
  all_text = ''
42
  for text in text_list:
 
49
  point_split_list = line.split('.')
50
  space_split_list = line.split(' ')
51
  if 1 < len(space_split_list) < 5:
52
+ if 1 < len(point_split_list) < 5 and (
53
+ point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
54
  # print("line:", line)
55
+ chapter_names.append(line)
56
+
57
  return chapter_names
58
+
59
  def get_title(self):
60
+ doc = self.pdf # 打开pdf文件
61
+ max_font_size = 0 # 初始化最大字体大小为0
62
+ max_string = "" # 初始化最大字体大小对应的字符串为空
63
  max_font_sizes = [0]
64
+ for page_index, page in enumerate(doc): # 遍历每一页
65
+ text = page.get_text("dict") # 获取页面上的文本信息
66
+ blocks = text["blocks"] # 获取文本块列表
67
+ for block in blocks: # 遍历每个文本块
68
+ if block["type"] == 0 and len(block['lines']): # 如果是文字类型
69
  if len(block["lines"][0]["spans"]):
70
+ font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
71
  max_font_sizes.append(font_size)
72
+ if font_size > max_font_size: # 如果字体大小大于当前最大值
73
+ max_font_size = font_size # 更新最大值
74
+ max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
75
+ max_font_sizes.sort()
76
  # print("max_font_sizes", max_font_sizes[-10:])
77
  cur_title = ''
78
+ for page_index, page in enumerate(doc): # 遍历每一页
79
+ text = page.get_text("dict") # 获取页面上的文本信息
80
+ blocks = text["blocks"] # 获取文本块列表
81
+ for block in blocks: # 遍历每个文本块
82
+ if block["type"] == 0 and len(block['lines']): # 如果是文字类型
83
  if len(block["lines"][0]["spans"]):
84
+ cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
85
+ font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
86
+ font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
87
  # print(font_size)
88
+ if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
89
  # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
90
+ if len(cur_string) > 4 and "arXiv" not in cur_string:
91
  # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
92
  if cur_title == '':
93
+ cur_title += cur_string
94
  else:
95
+ cur_title += ' ' + cur_string
96
  self.title_page = page_index
97
  # break
98
+ title = cur_title.replace('\n', ' ')
99
  return title
100
 
101
  def extract_section_infomation(self):
 
123
  heading_font = -1
124
  # 遍历每一页并查找子标题
125
  found_abstract = False
126
+ upper_heading = False
127
+ font_heading = False
128
  for page in doc:
129
  blocks = page.get_text("dict")["blocks"]
130
  for block in blocks:
131
  if not found_abstract:
132
+ try:
133
+ text = json.dumps(block)
134
+ except:
135
+ continue
136
  if re.search(r"\bAbstract\b", text, re.IGNORECASE):
137
  found_abstract = True
138
+ last_heading = "Abstract"
139
+ section_dict["Abstract"] = ""
140
  if found_abstract:
141
  if 'lines' not in block:
142
  continue
143
  lines = block["lines"]
144
  for line in lines:
145
  for span in line["spans"]:
146
+ # 如果当前文本是子标题
147
+ if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4: # 针对一些标题大小一样,但是全大写的论文
148
+ upper_heading = True
149
+ heading = span["text"].strip()
150
+ if "References" in heading: # reference 以后的内容不考虑
151
+ self.section_names = subheadings
152
+ self.section_texts = section_dict
153
+ return
154
+ subheadings.append(heading)
155
+ if last_heading is not None:
156
+ section_dict[last_heading] = section_dict[last_heading].strip()
157
+ section_dict[heading] = ""
158
+ last_heading = heading
159
+ if not upper_heading and span["size"] > threshold and re.match( # 正常情况下,通过字体大小判断
160
+ r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
161
+ span["text"].strip()):
162
+ font_heading = True
163
  if heading_font == -1:
164
  heading_font = span["size"]
165
  elif heading_font != span["size"]:
 
180
  self.section_names = subheadings
181
  self.section_texts = section_dict
182
 
183
+
184
  def main():
185
  path = r'demo.pdf'
186
  paper = Paper(path=path)
187
  paper.parse_pdf()
188
  # for key, value in paper.section_text_dict.items():
189
+ # print(key, value)
190
+ # print("*"*40)
191
+
192
 
193
  if __name__ == '__main__':
194
  main()