import pickle

import pysbd
from base_class import AbstractPDFParser
from scipdf_utils import parse_pdf_to_dict


class GrobidSciPDFParser(AbstractPDFParser):
    # Class-level sentence segmenters, shared across instances
    seg_en = pysbd.Segmenter(language="en", clean=False)
    seg_chinese = pysbd.Segmenter(language="zh", clean=False)

    def __init__(self, pdf_link, db_name="grobid_scipdf", short_threshold=30) -> None:
        """Initialize the PDF parser.

        Args:
            pdf_link: link to the PDF file; either a web link or a local file path
            db_name: name of the parse-result cache
            short_threshold: paragraphs at or below this length are merged into
                the preceding paragraph

        Attributes:
            metadata: metadata of the PDF file, like authors, title, abstract, etc.
            flattened_paragraphs: list of all paragraphs of the PDF file,
                concatenated across sections
            split_paragraphs: list of dicts, each holding a section heading and
                its list of paragraphs
        """
        super().__init__(db_name=db_name)
        self.db_name = db_name
        self.pdf_link = pdf_link
        self.pdf = None
        self.metadata = {}
        self.flattened_paragraphs = None
        self.split_paragraphs = None
        self.short_threshold = short_threshold
        self.parse_pdf()

    def _concat_too_short_paragraphs(self) -> None:
        """Merge too-short paragraphs into their predecessor, or discard them."""
        for i, section in enumerate(self.split_paragraphs):
            new_paragraphs = []
            for paragraph in section["texts"]:
                if len(paragraph.strip()) == 0:
                    continue  # discard whitespace-only paragraphs
                if len(paragraph) <= self.short_threshold and new_paragraphs:
                    new_paragraphs[-1] += paragraph
                else:
                    new_paragraphs.append(paragraph)
            self.split_paragraphs[i]["texts"] = new_paragraphs

    @staticmethod
    def _find_largest_font_string(file_name, search_string):
        """Return the 1-based page number where `search_string` is rendered with
        the largest font size, or -1 if it cannot be found."""
        # Imported lazily so the module does not require pdfminer unless needed
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LTChar, LTTextContainer

        search_string = search_string.strip()
        max_font_size = -1
        page_number = -1
        try:
            for index, page_layout in enumerate(extract_pages(file_name)):
                for element in page_layout:
                    if not isinstance(element, LTTextContainer):
                        continue
                    for text_line in element:
                        if search_string not in text_line.get_text():
                            continue
                        for character in text_line:
                            if isinstance(character, LTChar) and character.size > max_font_size:
                                max_font_size = character.size
                                page_number = index
            return page_number + 1 if page_number != -1 else -1
        except Exception:
            return -1

    def _find_section_page(self, section_name) -> int:
        return GrobidSciPDFParser._find_largest_font_string(self.pdf_link, section_name)

    def _retrieve_or_parse(self):
        """Return the parsed article dict from cache if present, otherwise parse the PDF."""
        key = (self.pdf_link, self.db_name)
        if key not in self.db_cache:
            self.db_cache[key] = parse_pdf_to_dict(self.pdf_link)
            with open(self.db_cache_path, "wb") as db_cache_file:
                pickle.dump(self.db_cache, db_cache_file)
        return self.db_cache[key]

    @staticmethod
    def _check_chinese(text) -> bool:
        """Return True if `text` contains any CJK unified ideograph."""
        return any("\u4e00" <= char <= "\u9fff" for char in text)

    def parse_pdf(self) -> None:
        """Parse the PDF file and populate metadata and paragraph attributes."""
        self.article_dict = self._retrieve_or_parse()
        self._get_metadata()
        self.split_paragraphs = self.get_split_paragraphs()
        self._concat_too_short_paragraphs()
        self.flattened_paragraphs = self.get_paragraphs()

    def get_paragraphs(self) -> list:
        """Return all paragraphs of the PDF file, recording each paragraph's
        section heading in `self.content2section`."""
        paragraphs = []
        self.content2section = {}
        for section in self.split_paragraphs:
            paragraphs += section["texts"]
            for para in section["texts"]:
                self.content2section[para] = section["heading"]
        return paragraphs

    def _get_metadata(self) -> None:
"abstract", "references", "doi", 'title',]: self.metadata[meta] = self.article_dict[meta] self.section_names = [section["heading"] for section in self.article_dict['sections']] self.section_names2page = {} for section_name in self.section_names: section_page_index = self._find_section_page(section_name) self.section_names2page.update({section_name: section_page_index}) self.section_names_with_page_index = [section_name + " (Page {})".format( self.section_names2page[section_name]) for section_name in self.section_names] def get_split_paragraphs(self, ) -> None: section_pair_list = [] for section in self.article_dict['sections']: section_pair_list.append({ "heading": section["heading"], "texts": section["all_paragraphs"], }) return section_pair_list @staticmethod def _determine_optimal_split_of_pargraphs(section_pair_list) -> None: """ split based on the some magic rules """ import pysbd for section_pair in section_pair_list: if GrobidSciPDFPaser._check_chinese(section_pair["text"]): seg = GrobidSciPDFPaser.seg_chinese else: seg = GrobidSciPDFPaser.seg_en section_pair["texts"] = seg.segment(section_pair["texts"]) section_pair["texts"] = [ para for para in section_pair["text"] if len(para) > 2] return section_pair_list