|
import fitz, io, os |
|
from PIL import Image |
|
from collections import Counter |
|
import json |
|
import re |
|
|
|
class Paper:
    """A research paper backed by a PDF file, with its title and sections.

    When no ``title`` is supplied the PDF at ``path`` is opened with PyMuPDF
    (``fitz``), the title is guessed from the largest fonts, and the text is
    split into sections keyed by heading.

    Attributes:
        url: Link to the paper, as passed in.
        path: Filesystem path of the PDF.
        title: Supplied title, or the one detected by ``get_title``.
        abs: Abstract text, as passed in.
        authors: List of authors, as passed in (empty list by default).
        title_page: Index of the last page holding title-sized text.
        section_names: Headings found by ``extract_section_infomation``.
        section_texts: Mapping heading -> text; ``parse_pdf`` also stores the
            title under the ``"title"`` key.
        roman_num / digit_num: Numeral prefixes accepted by
            ``get_chapter_names``.
        first_image: Initialised to ``''``; not used inside this class.
        pdf / text_list / all_text: Set by ``parse_pdf`` (document handle,
            per-page plain text, and the pages joined with spaces).
    """

    # Headings that stop section extraction.  A heading that passes
    # ``str.isupper()`` can never contain the mixed-case "References", so the
    # all-caps spelling has to be listed explicitly.
    _REFERENCE_MARKERS = ("References", "REFERENCES")
    _ABSTRACT_RE = re.compile(r"\bAbstract\b", re.IGNORECASE)
    _TITLE_CASE_RE = re.compile(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*")

    def __init__(self, path, title='', url='', abs='', authors=None):
        """Create a paper and, if ``title`` is empty, parse the PDF.

        Args:
            path: Path to the PDF file.
            title: Known title.  When empty, the PDF is opened, the title is
                detected and the sections are extracted immediately.
            url: Link to the paper.
            abs: Abstract text.  (The name shadows the builtin only inside
                this method; it is kept for caller compatibility.)
            authors: List of authors.  ``None`` (the default) yields a fresh
                empty list per instance instead of one shared list.
        """
        self.url = url
        self.path = path
        self.section_names = []
        self.section_texts = {}
        self.abs = abs
        self.title_page = 0
        self.authors = [] if authors is None else authors
        self.roman_num = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII",
                          "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''

        if title == '':
            # get_title() reads self.pdf; close that handle before
            # parse_pdf() opens its own so it is not leaked.
            self.pdf = fitz.open(self.path)
            try:
                self.title = self.get_title()
            finally:
                self.pdf.close()
            self.parse_pdf()
        else:
            self.title = title

    def parse_pdf(self):
        """Read the PDF text and populate the section attributes.

        Sets ``self.text_list``, ``self.all_text``, ``self.section_names``
        and ``self.section_texts`` (including a ``"title"`` entry).  The
        document stored in ``self.pdf`` is closed before returning, even if
        parsing raises.
        """
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
            self.all_text = ' '.join(self.text_list)
            self.extract_section_infomation()
            self.section_texts.update({"title": self.title})
        finally:
            self.pdf.close()

    def get_chapter_names(self, ):
        """Return text lines that look like numbered chapter headings.

        A line qualifies when it contains a ``.``, splits into 2-4
        space-separated words and fewer than 5 dot-separated parts, and the
        part before the first dot is one of ``self.roman_num`` or
        ``self.digit_num`` (for example ``"1. Introduction"``).

        Returns:
            list[str]: Matching lines in document order.
        """
        doc = fitz.open(self.path)
        try:
            all_text = ''.join(page.get_text() for page in doc)
        finally:
            doc.close()

        numerals = set(self.roman_num) | set(self.digit_num)
        chapter_names = []
        for line in all_text.split('\n'):
            if '.' not in line:
                continue
            dot_parts = line.split('.')
            words = line.split(' ')
            if (1 < len(words) < 5 and 1 < len(dot_parts) < 5
                    and dot_parts[0] in numerals):
                chapter_names.append(line)
        return chapter_names

    def get_title(self):
        """Guess the title from the largest fonts in ``self.pdf``.

        Only the first span of the first line of each text block (type 0) is
        inspected.  Spans whose size is within 0.3 of the largest or
        second-largest such size, are longer than 4 characters and do not
        contain ``"arXiv"`` are joined with spaces.  ``self.title_page`` is
        updated as a side effect.

        Returns:
            str: The assembled title, or ``''`` if no text span was found.
        """
        # One pass over the document: (page index, text, size) of the first
        # span in every non-empty text block.
        first_spans = []
        for page_index, page in enumerate(self.pdf):
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0 or not block["lines"]:
                    continue
                spans = block["lines"][0]["spans"]
                if not spans:
                    continue
                first_spans.append(
                    (page_index, spans[0]["text"], spans[0]["size"]))

        if not first_spans:
            return ''

        # The leading 0 guarantees a second entry when only one span exists.
        sizes = sorted([0] + [size for _, _, size in first_spans])
        largest, runner_up = sizes[-1], sizes[-2]

        title_parts = []
        for page_index, text, size in first_spans:
            if abs(size - largest) < 0.3 or abs(size - runner_up) < 0.3:
                if len(text) > 4 and "arXiv" not in text:
                    title_parts.append(text)
                # NOTE(review): the nesting of this assignment was lost in
                # the whitespace-mangled source; it is placed at the
                # font-size-match level -- confirm against the original.
                self.title_page = page_index

        return ' '.join(title_parts).replace('\n', ' ')

    def extract_section_infomation(self):
        """Split the PDF into sections and store them on the instance.

        Sets ``self.section_names`` (headings in order, not including
        "Abstract") and ``self.section_texts`` (heading -> text).  A PDF
        without any text span yields an empty list and dict instead of
        raising ``IndexError``.  The document opened here is always closed.
        """
        doc = fitz.open(self.path)
        try:
            body_size = self._most_common_font_size(doc)
            if body_size is None:
                names, texts = [], {}
            else:
                # Anything strictly larger than the body font may be a
                # heading.
                names, texts = self._collect_sections(doc, body_size)
        finally:
            doc.close()
        self.section_names = names
        self.section_texts = texts

    @staticmethod
    def _most_common_font_size(doc):
        """Return the most frequent span font size in ``doc``, or ``None``."""
        font_sizes = Counter(
            span["size"]
            for page in doc
            for block in page.get_text("dict")["blocks"]
            if 'lines' in block
            for line in block["lines"]
            for span in line["spans"]
        )
        if not font_sizes:
            return None
        return font_sizes.most_common(1)[0][0]

    @classmethod
    def _is_references(cls, heading):
        """Tell whether ``heading`` marks the start of the reference list."""
        return any(marker in heading for marker in cls._REFERENCE_MARKERS)

    @staticmethod
    def _start_section(heading, headings, sections, current):
        """Record ``heading`` as a new section and return it as current."""
        headings.append(heading)
        if current is not None:
            sections[current] = sections[current].strip()
        # A repeated heading resets the text collected under that name.
        sections[heading] = ""
        return heading

    @classmethod
    def _collect_sections(cls, doc, threshold):
        """Walk ``doc`` and return ``(headings, sections)``.

        Everything before the first block mentioning "Abstract" is ignored.
        Headings are detected in one of two mutually exclusive modes, fixed
        by whichever fires first: all-caps spans with more than four ASCII
        capitals, or title-case spans larger than ``threshold`` that share
        the font size of the first such span.  Extraction stops at a
        references heading.
        """
        sections = {}
        headings = []
        current = None
        heading_font = -1
        found_abstract = False
        upper_heading = False
        font_heading = False

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if not found_abstract:
                    try:
                        dumped = json.dumps(block)
                    except (TypeError, ValueError):
                        # Not JSON-serialisable (e.g. raw bytes in the
                        # block): cannot be searched, skip it.
                        continue
                    if cls._ABSTRACT_RE.search(dumped):
                        found_abstract = True
                        current = "Abstract"
                        sections["Abstract"] = ""
                if not found_abstract or 'lines' not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        raw = span["text"]
                        text = raw.strip()

                        if (not font_heading and raw.isupper()
                                and sum('A' <= ch <= 'Z' for ch in raw) > 4):
                            upper_heading = True
                            if cls._is_references(text):
                                return headings, sections
                            current = cls._start_section(
                                text, headings, sections, current)

                        if (not upper_heading and span["size"] > threshold
                                and cls._TITLE_CASE_RE.match(text)):
                            font_heading = True
                            if heading_font == -1:
                                heading_font = span["size"]
                            elif heading_font != span["size"]:
                                # Large text in a different size than the
                                # first heading is dropped entirely.
                                continue
                            if cls._is_references(text):
                                return headings, sections
                            current = cls._start_section(
                                text, headings, sections, current)
                        elif current is not None:
                            # Body text.  An all-caps heading also lands
                            # here, so its own text opens its section
                            # (kept from the original behaviour).
                            sections[current] += " " + text

        return headings, sections
|
|
|
|
|
def main():
    """Parse ``demo.pdf`` (relative to the working directory) as a demo.

    Constructing ``Paper`` without a title already opens the PDF, detects
    the title and extracts the sections, so no separate ``parse_pdf()`` call
    is made here; calling it again would only re-read the same file.
    """
    path = r'demo.pdf'
    Paper(path=path)
|
|
|
|
|
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|