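"""Grobid-based scientific PDF parser.

Parses a PDF (from a web link or a local path) into article metadata,
per-section paragraph lists, and a flat paragraph list, caching parse
results on disk with pickle.
"""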
import pickle

from base_class import AbstractPDFParser
from scipdf_utils import parse_pdf_to_dict


class GrobidSciPDFPaser(AbstractPDFParser):
    # import pysbd
    # seg_en = pysbd.Segmenter(language="en", clean=False)
    # seg_chinese = pysbd.Segmenter(language="zh", clean=False)
    def __init__(self, pdf_link, db_name="grobid_scipdf", short_threshold=30) -> None:
        """Initialize the PDF parser.

        Args:
            pdf_link: link to the PDF file; either a web link or a local file path
            db_name: name of the on-disk cache used to store parse results
            short_threshold: paragraphs of at most this many characters are
                merged into the preceding paragraph

        Attributes:
            metadata: metadata of the PDF file, e.g. authors, title, abstract
            flatten_paragraphs: list of all paragraphs of the PDF file, with
                every section's paragraphs concatenated together
            split_paragraphs: list of dicts pairing each section heading with
                its list of paragraphs
        """
        super().__init__(db_name=db_name)
        self.db_name = db_name
        self.pdf_link = pdf_link
        self.pdf = None
        self.metadata = {}
        self.flatten_paragraphs = None
        self.split_paragraphs = None
        self.short_threshold = short_threshold
        self.parse_pdf()

    def _concat_too_short_paragraphs(self) -> None:
        """Merge too-short paragraphs into the preceding paragraph and discard
        whitespace-only ones."""
        for i, section in enumerate(self.split_paragraphs):
            paragraphs = section['texts']
            new_paragraphs = []
            for paragraph in paragraphs:
                if len(paragraph.strip()) == 0:
                    # Discard whitespace-only paragraphs.
                    continue
                if len(paragraph) <= self.short_threshold and new_paragraphs:
                    # Too short to stand alone: merge into the previous paragraph.
                    new_paragraphs[-1] += paragraph
                else:
                    new_paragraphs.append(paragraph)
            self.split_paragraphs[i]['texts'] = new_paragraphs

    @staticmethod
    def _find_largest_font_string(file_name, search_string):
        """Return the 1-based page number on which `search_string` appears in
        the largest font (section headings are usually the largest text), or
        -1 if the string is not found or the file cannot be read."""
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LTChar, LTTextContainer

        search_string = search_string.strip()
        max_font_size = -1
        page_number = -1
        try:
            for index, page_layout in enumerate(extract_pages(file_name)):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        for text_line in element:
                            if search_string in text_line.get_text():
                                for character in text_line:
                                    if isinstance(character, LTChar):
                                        if character.size > max_font_size:
                                            max_font_size = character.size
                                            page_number = index
            return page_number + 1 if page_number != -1 else -1
        except Exception:
            return -1

    def _find_section_page(self, section_name) -> int:
        # For web links pdfminer cannot open the path, so this falls back to -1.
        return GrobidSciPDFPaser._find_largest_font_string(self.pdf_link, section_name)

    def _retrieve_or_parse(self):
        """Return the pdf dict from the cache if present, otherwise parse the
        pdf and persist the updated cache."""
        key = (self.pdf_link, self.db_name)
        if key not in self.db_cache:
            self.db_cache[key] = parse_pdf_to_dict(self.pdf_link)
            with open(self.db_cache_path, "wb") as db_cache_file:
                pickle.dump(self.db_cache, db_cache_file)
        return self.db_cache[key]

    @staticmethod
    def _check_chinese(text) -> bool:
        """Return True if `text` contains any CJK unified ideograph."""
        return any(u'\u4e00' <= char <= u'\u9fff' for char in text)

    def parse_pdf(self) -> None:
        """Parse the PDF file into metadata, per-section paragraphs, and a
        flat paragraph list."""
        self.article_dict = self._retrieve_or_parse()
        self._get_metadata()
        self.split_paragraphs = self.get_split_paragraphs()
        self._concat_too_short_paragraphs()
        self.flatten_paragraphs = self.get_paragraphs()

    def get_paragraphs(self) -> list:
        """Return all paragraphs of the PDF file as a flat list and build a
        paragraph -> section-heading lookup in `content2section`."""
        paragraphs = []
        self.content2section = {}
        for section in self.split_paragraphs:
            # paragraphs += [section["heading"]]
            paragraphs += section["texts"]
            for para in section["texts"]:
                self.content2section[para] = section["heading"]
        return paragraphs

    def _get_metadata(self) -> None:
        for meta in ['authors', 'pub_date', 'abstract', 'references', 'doi', 'title']:
            self.metadata[meta] = self.article_dict[meta]
        self.section_names = [section["heading"]
                              for section in self.article_dict['sections']]
        self.section_names2page = {}
        for section_name in self.section_names:
            self.section_names2page[section_name] = self._find_section_page(
                section_name)
        self.section_names_with_page_index = [
            "{} (Page {})".format(section_name, self.section_names2page[section_name])
            for section_name in self.section_names]

    def get_split_paragraphs(self) -> list:
        """Return a list of {"heading": ..., "texts": [...]} dicts, one per
        section of the article."""
        section_pair_list = []
        for section in self.article_dict['sections']:
            section_pair_list.append({
                "heading": section["heading"],
                "texts": section["all_paragraphs"],
            })
        return section_pair_list

    # @staticmethod
    # def _determine_optimal_split_of_paragraphs(section_pair_list) -> None:
    #     """
    #     Split based on some magic rules.
    #     """
    #     import pysbd
    #     for section_pair in section_pair_list:
    #         if GrobidSciPDFPaser._check_chinese(section_pair["texts"]):
    #             seg = GrobidSciPDFPaser.seg_chinese
    #         else:
    #             seg = GrobidSciPDFPaser.seg_en
    #         section_pair["texts"] = seg.segment(section_pair["texts"])
    #         section_pair["texts"] = [
    #             para for para in section_pair["texts"] if len(para) > 2]
    #     return section_pair_list
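

# Minimal usage sketch, not part of the class above. It assumes a GROBID
# service is reachable for scipdf_utils.parse_pdf_to_dict and that
# AbstractPDFParser provides the `db_cache` / `db_cache_path` attributes used
# for caching; the arXiv URL is only illustrative.
if __name__ == "__main__":
    parser = GrobidSciPDFPaser("https://arxiv.org/pdf/1706.03762.pdf")
    print(parser.metadata["title"])
    print(parser.section_names_with_page_index)
    for paragraph in parser.flatten_paragraphs[:3]:
        # content2section maps each paragraph back to its section heading.
        print(paragraph[:80], "->", parser.content2section[paragraph])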