import pickle

from base_class import AbstractPDFParser
from scipdf_utils import parse_pdf_to_dict


class GrobidSciPDFPaser(AbstractPDFParser):
    # Optional pysbd sentence segmenters, referenced by the commented-out
    # paragraph-splitting helper at the bottom of this class.
    # import pysbd
    # seg_en = pysbd.Segmenter(language="en", clean=False)
    # seg_chinese = pysbd.Segmenter(language="zh", clean=False)

    def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
        """Initialize the PDF parser.

        Args:
            pdf_link: link to the PDF file; can be a web URL or a local file path
            db_name: name of the cache database used to store parsed results
            short_thereshold: paragraphs at or below this length (in characters)
                are merged into the preceding paragraph

        Attributes set during parsing:
            metadata: metadata of the PDF file (authors, title, abstract, etc.)
            flattn_paragraphs: flat list of all paragraphs of the PDF file
            split_paragraphs: list of dicts mapping a section heading to its paragraphs
        """
        super().__init__(db_name=db_name)
        self.db_name = db_name
        self.pdf_link = pdf_link
        self.pdf = None
        self.metadata = {}
        self.flattn_paragraphs = None
        self.split_paragraphs = None
        self.short_thereshold = short_thereshold
        self.parse_pdf()

    def _contact_too_short_paragraphs(self):
        """Merge too-short paragraphs into the preceding paragraph and discard
        whitespace-only ones."""
        for i, section in enumerate(self.split_paragraphs):
            paragraphs = section['texts']
            new_paragraphs = []
            for paragraph in paragraphs:
                if len(paragraph.strip()) == 0:
                    # Whitespace-only paragraphs carry no content; drop them.
                    continue
                if len(paragraph) <= self.short_thereshold:
                    if len(new_paragraphs) != 0:
                        # Glue the short fragment onto the previous paragraph.
                        new_paragraphs[-1] += paragraph
                    else:
                        new_paragraphs.append(paragraph)
                else:
                    new_paragraphs.append(paragraph)
            self.split_paragraphs[i]['texts'] = new_paragraphs
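
    # Illustrative behavior (hypothetical input), with short_thereshold=30:
    #   ["A long opening paragraph about the method ...", " (cf. Fig. 1)"]
    # becomes
    #   ["A long opening paragraph about the method ... (cf. Fig. 1)"]
    # since the trailing fragment is at or below the threshold and gets merged
    # into the paragraph before it.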

    @staticmethod
    def _find_largest_font_string(file_name, search_string):
        """Return the 1-based page number on which `search_string` is rendered
        with the largest font (a heuristic for locating section headings), or -1
        if the string is not found or the PDF cannot be parsed."""
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LTTextContainer, LTChar

        search_string = search_string.strip()
        max_font_size = -1
        page_number = -1
        try:
            for index, page_layout in enumerate(extract_pages(file_name)):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        for text_line in element:
                            if search_string in text_line.get_text():
                                for character in text_line:
                                    if isinstance(character, LTChar):
                                        if character.size > max_font_size:
                                            max_font_size = character.size
                                            page_number = index
            return page_number + 1 if page_number != -1 else -1
        except Exception:
            return -1
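
    # Example (hypothetical file), locating a section heading:
    #   GrobidSciPDFPaser._find_largest_font_string("paper.pdf", "Introduction")
    # would return e.g. 2 if "Introduction" is set in its largest font on page 2,
    # and -1 if the heading cannot be located.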

    def _find_section_page(self, section_name) -> int:
        """Return the 1-based page number where `section_name` appears, or -1."""
        return GrobidSciPDFPaser._find_largest_font_string(self.pdf_link, section_name)

    def _retrive_or_parse(self):
        """Return the pdf dict from cache if present, otherwise parse the pdf."""
        db_name = self.db_name
        if (self.pdf_link, db_name) not in self.db_cache:
            self.db_cache[(self.pdf_link, db_name)] = parse_pdf_to_dict(self.pdf_link)
            # Persist the updated cache so later runs can skip re-parsing.
            with open(self.db_cache_path, "wb") as db_cache_file:
                pickle.dump(self.db_cache, db_cache_file)
        return self.db_cache[(self.pdf_link, db_name)]
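
    # The cache is keyed by (pdf_link, db_name), so the same PDF parsed under a
    # different db_name yields a separate entry. `db_cache` and `db_cache_path`
    # are presumed to be set up by AbstractPDFParser; a cache hit then looks like:
    #   self.db_cache[("paper.pdf", "grobid_scipdf")]  # -> parsed article dict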

    @staticmethod
    def _check_chinese(text) -> bool:
        """Return True if `text` contains at least one CJK unified ideograph."""
        return any(u'\u4e00' <= char <= u'\u9fff' for char in text)

    def parse_pdf(self) -> None:
        """Parse the PDF file and populate metadata and paragraph attributes."""
        article_dict = self._retrive_or_parse()
        self.article_dict = article_dict
        self._get_metadata()
        self.split_paragraphs = self.get_split_paragraphs()
        self._contact_too_short_paragraphs()
        self.flattn_paragraphs = self.get_paragraphs()

    def get_paragraphs(self) -> list:
        """Return a flat list of all paragraphs across sections and build the
        paragraph-to-section-heading map."""
        paragraphs = []
        self.content2section = {}
        for section in self.split_paragraphs:
            # paragraphs += [section["heading"]]
            paragraphs += section["texts"]
            for para in section["texts"]:
                self.content2section[para] = section["heading"]
        return paragraphs

    def _get_metadata(self) -> None:
        """Copy article-level metadata and resolve the page each section starts on."""
        for meta in ['authors', 'pub_date', 'abstract', 'references', 'doi', 'title']:
            self.metadata[meta] = self.article_dict[meta]
        self.section_names = [section["heading"]
                              for section in self.article_dict['sections']]
        self.section_names2page = {}
        for section_name in self.section_names:
            section_page_index = self._find_section_page(section_name)
            self.section_names2page[section_name] = section_page_index
        self.section_names_with_page_index = [
            "{} (Page {})".format(section_name, self.section_names2page[section_name])
            for section_name in self.section_names]
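
    # For a heading "Introduction" resolved to page 2, the code above yields
    #   section_names2page = {"Introduction": 2}
    #   section_names_with_page_index = ["Introduction (Page 2)"]
    # Headings whose page cannot be determined are reported as "(Page -1)".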

    def get_split_paragraphs(self) -> list:
        """Return one {"heading": ..., "texts": [...]} dict per section."""
        section_pair_list = []
        for section in self.article_dict['sections']:
            section_pair_list.append({
                "heading": section["heading"],
                "texts": section["all_paragraphs"],
            })
        return section_pair_list

    # @staticmethod
    # def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
    #     """
    #     Split section texts into sentences based on some magic rules.
    #     """
    #     for section_pair in section_pair_list:
    #         if GrobidSciPDFPaser._check_chinese(section_pair["texts"]):
    #             seg = GrobidSciPDFPaser.seg_chinese
    #         else:
    #             seg = GrobidSciPDFPaser.seg_en
    #         section_pair["texts"] = seg.segment(section_pair["texts"])
    #         section_pair["texts"] = [
    #             para for para in section_pair["texts"] if len(para) > 2]
    #     return section_pair_list
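

# Minimal usage sketch ("paper.pdf" is a hypothetical path; AbstractPDFParser is
# assumed to provide the `db_cache` / `db_cache_path` used above):
if __name__ == "__main__":
    parser = GrobidSciPDFPaser("paper.pdf")
    print(parser.metadata["title"])
    # Section headings annotated with their starting page, e.g. "Introduction (Page 2)"
    print(parser.section_names_with_page_index)
    for paragraph in parser.flattn_paragraphs[:3]:
        # Every flattened paragraph maps back to its section heading.
        print(parser.content2section[paragraph], "->", paragraph[:80])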