# Page-scrape residue (not part of the module): "Spaces: Running Running"
import fitz, io, os | |
from PIL import Image | |
from collections import Counter | |
import json | |
import re | |
class Paper:
    """A research paper backed by a PDF file.

    When no ``title`` is supplied, the PDF at ``path`` is opened with PyMuPDF
    (``fitz``), the title is guessed from the largest fonts, and the body is
    split into sections.  When a ``title`` is supplied, nothing is read from
    disk until :meth:`parse_pdf` is called explicitly.

    Attributes populated by parsing:
        section_names: headings in the order they were found.
        section_texts: mapping heading -> section text, plus a ``"title"`` key
            added by :meth:`parse_pdf`.
        text_list / all_text: raw per-page text and its space-joined form.
        title_page: index of the last page that contributed to the title.
    """

    # Compiled once; both patterns are evaluated for every span/block.
    _ABSTRACT_RE = re.compile(r"\bAbstract\b", re.IGNORECASE)
    _HEADING_RE = re.compile(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*")

    def __init__(self, path, title='', url='', abs='', authors=[]):
        """Initialise the paper and, if ``title`` is empty, parse the PDF.

        Args:
            path: path of the PDF file.
            title: known title; an empty string triggers title detection and
                a full parse of the PDF.
            url: link to the paper.
            abs: abstract text (the name shadows the builtin only inside this
                method; it is kept because it is part of the call signature).
            authors: list of author names.  The default is never mutated or
                stored; omitting the argument gives each instance its own
                fresh list.
        """
        self.url = url  # link to the paper
        self.path = path  # path of the PDF file
        self.section_names = []  # section headings
        self.section_texts = {}  # section heading -> text
        self.abs = abs
        self.title_page = 0
        # The signature keeps the historical ``[]`` default, so a call that
        # omits ``authors`` is recognised by identity with that default object
        # and given a new list instead of the shared one.
        default_authors = Paper.__init__.__defaults__[-1]
        self.authors = [] if authors is default_authors else authors
        # "IIX" is not a standard Roman numeral; kept so existing matching
        # behaviour is unchanged.
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''
        if title == '':
            self.pdf = fitz.open(self.path)
            try:
                self.title = self.get_title()
            finally:
                # parse_pdf() opens its own handle; close this one so it does
                # not leak.
                self.pdf.close()
            self.parse_pdf()
        else:
            self.title = title

    def parse_pdf(self):
        """Read the page texts and section structure from the PDF.

        Sets ``text_list``, ``all_text``, ``section_names`` and
        ``section_texts`` (including a ``"title"`` entry).  The document
        handle stored in ``self.pdf`` is closed before returning, even if
        parsing raises.
        """
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
            self.all_text = ' '.join(self.text_list)
            self.extract_section_infomation()
            self.section_texts.update({"title": self.title})
        finally:
            self.pdf.close()

    def get_chapter_names(self):
        """Return the text lines that look like numbered chapter headings.

        A line qualifies when it contains a ``.``, splits into 2-4
        space-separated tokens and fewer than 5 dot-separated parts, and the
        part before the first dot is one of ``roman_num`` or ``digit_num``
        (e.g. ``"II. Method"`` or ``"3. Results"``).
        """
        doc = fitz.open(self.path)
        try:
            all_text = ''.join(page.get_text() for page in doc)
        finally:
            doc.close()

        chapter_names = []
        for line in all_text.split('\n'):
            if '.' not in line:
                continue
            point_split_list = line.split('.')
            space_split_list = line.split(' ')
            if not 1 < len(space_split_list) < 5:
                continue
            # '.' is in the line, so there are always at least two parts.
            if len(point_split_list) >= 5:
                continue
            prefix = point_split_list[0]
            if prefix in self.roman_num or prefix in self.digit_num:
                chapter_names.append(line)
        return chapter_names

    def get_title(self):
        """Guess the title from the largest fonts in ``self.pdf``.

        Only the first span of the first line of each text block is examined.
        Spans whose size is within 0.3 of the largest or second-largest
        recorded size, are longer than 4 characters and do not contain
        ``"arXiv"`` are joined with spaces.  ``self.title_page`` is set to the
        page index of the last span used.

        ``self.pdf`` must be an open document.
        """
        # One pass over the document: (page index, font size, text) of the
        # leading span of every text block.
        leading_spans = []
        for page_index, page in enumerate(self.pdf):
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0 or not block["lines"]:  # text blocks only
                    continue
                spans = block["lines"][0]["spans"]
                if not spans:
                    continue
                leading_spans.append((page_index, spans[0]["size"], spans[0]["text"]))

        # The seed 0 guarantees two entries whenever at least one span exists.
        sizes = sorted([0] + [size for _, size, _ in leading_spans])

        title_parts = []
        for page_index, size, text in leading_spans:
            near_top = (builtins_abs(size - sizes[-1]) < 0.3
                        or builtins_abs(size - sizes[-2]) < 0.3)
            if near_top and len(text) > 4 and "arXiv" not in text:
                title_parts.append(text)
                self.title_page = page_index
        return ' '.join(title_parts).replace('\n', ' ')

    def extract_section_infomation(self):
        """Split the PDF body into sections.

        Sets ``self.section_names`` (headings in order) and
        ``self.section_texts`` (heading -> text).  Everything before the first
        block mentioning "Abstract" is ignored, and parsing stops at the first
        heading containing "references" (any case).  A document without any
        text spans yields empty results.
        """
        doc = fitz.open(self.path)
        try:
            self.section_names, self.section_texts = self._collect_sections(doc)
        finally:
            doc.close()

    def _collect_sections(self, doc):
        """Return ``(headings, heading -> text)`` for an open document."""
        font_sizes = [
            span["size"]
            for page in doc
            for block in page.get_text("dict")["blocks"]
            for line in block.get("lines", ())
            for span in line["spans"]
        ]
        if not font_sizes:
            # No text at all (e.g. a scanned PDF): nothing to split.
            return [], {}
        # The most frequent font size is taken as the body size; anything
        # strictly larger is a heading candidate.
        threshold = Counter(font_sizes).most_common(1)[0][0]

        section_dict = {}
        subheadings = []
        last_heading = None
        heading_font = -1  # size of the first font-based heading; -1 = unset
        found_abstract = False
        # The two heading styles are mutually exclusive: whichever is seen
        # first disables the other for the rest of the document.
        upper_heading = False
        font_heading = False

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if not found_abstract:
                    try:
                        serialized = json.dumps(block)
                    except (TypeError, ValueError):
                        # Blocks that cannot be serialised (e.g. ones carrying
                        # raw bytes) are skipped.
                        continue
                    if self._ABSTRACT_RE.search(serialized):
                        found_abstract = True
                        last_heading = "Abstract"
                        section_dict["Abstract"] = ""
                if not found_abstract or 'lines' not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        raw = span["text"]
                        stripped = raw.strip()

                        # Papers whose headings use the body font size but are
                        # written in all capitals.
                        if (not font_heading and raw.isupper()
                                and sum('A' <= ch <= 'Z' for ch in raw) > 4):
                            upper_heading = True
                            if self._is_references(stripped):
                                return subheadings, section_dict
                            last_heading = self._open_section(
                                stripped, last_heading, subheadings, section_dict)
                            # The heading itself is not part of the section
                            # body (consistent with font-based headings).
                            continue

                        # Usual case: headings are set in a larger font.
                        if (not upper_heading and span["size"] > threshold
                                and self._HEADING_RE.match(stripped)):
                            font_heading = True
                            if heading_font == -1:
                                heading_font = span["size"]
                            elif heading_font != span["size"]:
                                # Larger text in a different size than the
                                # first heading is dropped entirely.
                                continue
                            if self._is_references(stripped):
                                return subheadings, section_dict
                            last_heading = self._open_section(
                                stripped, last_heading, subheadings, section_dict)
                        elif last_heading is not None:
                            # Ordinary text belongs to the current section.
                            section_dict[last_heading] += " " + stripped

        return subheadings, section_dict

    @staticmethod
    def _is_references(heading):
        """Return True if ``heading`` starts the references list (any case)."""
        return "references" in heading.lower()

    @staticmethod
    def _open_section(heading, last_heading, subheadings, section_dict):
        """Close the previous section, start ``heading`` and return it."""
        subheadings.append(heading)
        if last_heading is not None:
            section_dict[last_heading] = section_dict[last_heading].strip()
        section_dict[heading] = ""
        return heading


# Module-level alias for the builtin ``abs``: the constructor parameter named
# ``abs`` makes the bare name ambiguous to readers of the class.
builtins_abs = abs
def main():
    """Parse ``demo.pdf`` from the working directory and return the Paper.

    Constructing a ``Paper`` without a title already detects the title and
    runs ``parse_pdf()``, so no explicit second parse is needed here.
    """
    path = r'demo.pdf'
    paper = Paper(path=path)
    # To inspect the result, iterate over paper.section_texts.items().
    return paper


if __name__ == '__main__':
    main()