import fitz, io, os from PIL import Image from collections import Counter import json import re class Paper: def __init__(self, path, title='', url='', abs='', authors=[]): # 初始化函数,根据pdf路径初始化Paper对象 self.url = url # 文章链接 self.path = path # pdf路径 self.section_names = [] # 段落标题 self.section_texts = {} # 段落内容 self.abs = abs self.title_page = 0 if title == '': self.pdf = fitz.open(self.path) # pdf文档 self.title = self.get_title() self.parse_pdf() else: self.title = title self.authors = authors self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"] self.digit_num = [str(d + 1) for d in range(10)] self.first_image = '' def parse_pdf(self): self.pdf = fitz.open(self.path) # pdf文档 self.text_list = [page.get_text() for page in self.pdf] self.all_text = ' '.join(self.text_list) self.extract_section_infomation() self.section_texts.update({"title": self.title}) self.pdf.close() # 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表 def get_chapter_names(self, ): # # 打开一个pdf文件 doc = fitz.open(self.path) # pdf文档 text_list = [page.get_text() for page in doc] all_text = '' for text in text_list: all_text += text # # 创建一个空列表,用于存储章节名称 chapter_names = [] for line in all_text.split('\n'): line_list = line.split(' ') if '.' in line: point_split_list = line.split('.') space_split_list = line.split(' ') if 1 < len(space_split_list) < 5: if 1 < len(point_split_list) < 5 and ( point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num): # print("line:", line) chapter_names.append(line) return chapter_names def get_title(self): doc = self.pdf # 打开pdf文件 max_font_size = 0 # 初始化最大字体大小为0 max_string = "" # 初始化最大字体大小对应的字符串为空 max_font_sizes = [0] for page_index, page in enumerate(doc): # 遍历每一页 text = page.get_text("dict") # 获取页面上的文本信息 blocks = text["blocks"] # 获取文本块列表 for block in blocks: # 遍历每个文本块 if block["type"] == 0 and len(block['lines']): # 如果是文字类型 if len(block["lines"][0]["spans"]): font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小 max_font_sizes.append(font_size) if font_size > max_font_size: # 如果字体大小大于当前最大值 max_font_size = font_size # 更新最大值 max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串 max_font_sizes.sort() # print("max_font_sizes", max_font_sizes[-10:]) cur_title = '' for page_index, page in enumerate(doc): # 遍历每一页 text = page.get_text("dict") # 获取页面上的文本信息 blocks = text["blocks"] # 获取文本块列表 for block in blocks: # 遍历每个文本块 if block["type"] == 0 and len(block['lines']): # 如果是文字类型 if len(block["lines"][0]["spans"]): cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串 font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征 font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小 # print(font_size) if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3: # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags) if len(cur_string) > 4 and "arXiv" not in cur_string: # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags) if cur_title == '': cur_title += cur_string else: cur_title += ' ' + cur_string self.title_page = page_index # break title = cur_title.replace('\n', ' ') return title def extract_section_infomation(self): doc = fitz.open(self.path) # 获取文档中所有字体大小 font_sizes = [] for page in doc: blocks = page.get_text("dict")["blocks"] for block in blocks: if 'lines' not in block: continue lines = block["lines"] for line in lines: for span in line["spans"]: font_sizes.append(span["size"]) most_common_size, _ = Counter(font_sizes).most_common(1)[0] # 按照最频繁的字体大小确定标题字体大小的阈值 threshold = most_common_size * 1 section_dict = {} last_heading = None subheadings = [] heading_font = -1 # 遍历每一页并查找子标题 found_abstract = False upper_heading = False font_heading = False for page in doc: blocks = page.get_text("dict")["blocks"] for block in blocks: if not found_abstract: try: text = json.dumps(block) except: continue if re.search(r"\bAbstract\b", text, re.IGNORECASE): found_abstract = True last_heading = "Abstract" section_dict["Abstract"] = "" if found_abstract: if 'lines' not in block: continue lines = block["lines"] for line in lines: for span in line["spans"]: # 如果当前文本是子标题 if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4: # 针对一些标题大小一样,但是全大写的论文 upper_heading = True heading = span["text"].strip() if "References" in heading: # reference 以后的内容不考虑 self.section_names = subheadings self.section_texts = section_dict return subheadings.append(heading) if last_heading is not None: section_dict[last_heading] = section_dict[last_heading].strip() section_dict[heading] = "" last_heading = heading if not upper_heading and span["size"] > threshold and re.match( # 正常情况下,通过字体大小判断 r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*", span["text"].strip()): font_heading = True if heading_font == -1: heading_font = span["size"] elif heading_font != span["size"]: continue heading = span["text"].strip() if "References" in heading: # reference 以后的内容不考虑 self.section_names = subheadings self.section_texts = section_dict return subheadings.append(heading) if last_heading is not None: section_dict[last_heading] = section_dict[last_heading].strip() section_dict[heading] = "" last_heading = heading # 否则将当前文本添加到上一个子标题的文本中 elif last_heading is not None: section_dict[last_heading] += " " + span["text"].strip() self.section_names = subheadings self.section_texts = section_dict def main(): path = r'demo.pdf' paper = Paper(path=path) paper.parse_pdf() # for key, value in paper.section_text_dict.items(): # print(key, value) # print("*"*40) if __name__ == '__main__': main()