wangrongsheng committed on
Commit
f64978e
1 Parent(s): 3668f92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -41
app.py CHANGED
@@ -59,14 +59,14 @@ class Paper:
59
  self.sl = sl
60
  self.section_names = [] # 段落标题
61
  self.section_texts = {} # 段落内容
 
62
  if title == '':
63
  self.pdf = fitz.open(self.path) # pdf文档
64
  self.title = self.get_title()
65
  self.parse_pdf()
66
  else:
67
  self.title = title
68
- self.authers = authers
69
- self.abs = abs
70
  self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
71
  self.digit_num = [str(d+1) for d in range(10)]
72
  self.first_image = ''
@@ -167,12 +167,13 @@ class Paper:
167
  text = page.get_text("dict") # 获取页面上的文本信息
168
  blocks = text["blocks"] # 获取文本块列表
169
  for block in blocks: # 遍历每个文本块
170
- if block["type"] == 0: # 如果是文字类型
171
- font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
172
- max_font_sizes.append(font_size)
173
- if font_size > max_font_size: # 如果字体大小大于当前最大值
174
- max_font_size = font_size # 更新最大值
175
- max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
 
176
  max_font_sizes.sort()
177
  print("max_font_sizes", max_font_sizes[-10:])
178
  cur_title = ''
@@ -180,19 +181,20 @@ class Paper:
180
  text = page.get_text("dict") # 获取页面上的文本信息
181
  blocks = text["blocks"] # 获取文本块列表
182
  for block in blocks: # 遍历每个文本块
183
- if block["type"] == 0: # 如果是文字类型
184
- cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
185
- font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
186
- font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
187
- # print(font_size)
188
- if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
189
- # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
190
- if len(cur_string) > 4 and "arXiv" not in cur_string:
191
- # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
192
- if cur_title == '' :
193
- cur_title += cur_string
194
- else:
195
- cur_title += ' ' + cur_string
 
196
  # break
197
  title = cur_title.replace('\n', ' ')
198
  return title
@@ -232,30 +234,12 @@ class Paper:
232
  text = ''
233
  text_list = []
234
  section_dict = {}
235
-
236
- # # 先处理Abstract章节
237
- # for page_index, page in enumerate(self.pdf):
238
- # cur_text = page.get_text()
239
- # # 如果该页面是Abstract章节所在页面
240
- # if page_index == list(self.section_page_dict.values())[0]:
241
- # abs_str = "Abstract"
242
- # # 获取Abstract章节的起始位置
243
- # first_index = cur_text.find(abs_str)
244
- # # 查找下一个章节的关键词,这里是Introduction
245
- # intro_str = "Introduction"
246
- # if intro_str in cur_text:
247
- # second_index = cur_text.find(intro_str)
248
- # elif intro_str.upper() in cur_text:
249
- # second_index = cur_text.find(intro_str.upper())
250
- # # 将Abstract章节内容加入字典中
251
- # section_dict[abs_str] = cur_text[first_index+len(abs_str)+1:second_index].replace('-\n',
252
- # '').replace('\n', ' ').split('I.')[0].split("II.")[0]
253
-
254
  # 再处理其他章节:
255
  text_list = [page.get_text() for page in self.pdf]
256
  for sec_index, sec_name in enumerate(self.section_page_dict):
257
  print(sec_index, sec_name, self.section_page_dict[sec_name])
258
- if sec_index <= 0:
259
  continue
260
  else:
261
  # 直接考虑后面的内容:
 
59
  self.sl = sl
60
  self.section_names = [] # 段落标题
61
  self.section_texts = {} # 段落内容
62
+ self.abs = abs
63
  if title == '':
64
  self.pdf = fitz.open(self.path) # pdf文档
65
  self.title = self.get_title()
66
  self.parse_pdf()
67
  else:
68
  self.title = title
69
+ self.authers = authers
 
70
  self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
71
  self.digit_num = [str(d+1) for d in range(10)]
72
  self.first_image = ''
 
167
  text = page.get_text("dict") # 获取页面上的文本信息
168
  blocks = text["blocks"] # 获取文本块列表
169
  for block in blocks: # 遍历每个文本块
170
+ if block["type"] == 0 and len(block['lines']): # 如果是文字类型
171
+ if len(block["lines"][0]["spans"]):
172
+ font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
173
+ max_font_sizes.append(font_size)
174
+ if font_size > max_font_size: # 如果字体大小大于当前最大值
175
+ max_font_size = font_size # 更新最大值
176
+ max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
177
  max_font_sizes.sort()
178
  print("max_font_sizes", max_font_sizes[-10:])
179
  cur_title = ''
 
181
  text = page.get_text("dict") # 获取页面上的文本信息
182
  blocks = text["blocks"] # 获取文本块列表
183
  for block in blocks: # 遍历每个文本块
184
+ if block["type"] == 0 and len(block['lines']): # 如果是文字类型
185
+ if len(block["lines"][0]["spans"]):
186
+ cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
187
+ font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
188
+ font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
189
+ # print(font_size)
190
+ if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
191
+ # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
192
+ if len(cur_string) > 4 and "arXiv" not in cur_string:
193
+ # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
194
+ if cur_title == '' :
195
+ cur_title += cur_string
196
+ else:
197
+ cur_title += ' ' + cur_string
198
  # break
199
  title = cur_title.replace('\n', ' ')
200
  return title
 
234
  text = ''
235
  text_list = []
236
  section_dict = {}
237
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  # 再处理其他章节:
239
  text_list = [page.get_text() for page in self.pdf]
240
  for sec_index, sec_name in enumerate(self.section_page_dict):
241
  print(sec_index, sec_name, self.section_page_dict[sec_name])
242
+ if sec_index <= 0 and self.abs:
243
  continue
244
  else:
245
  # 直接考虑后面的内容: