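"""Grobid-based scientific PDF parser.

Parses a PDF (from a web link or a local path) into article metadata,
per-section paragraph lists, and a flat paragraph list, caching parse
results on disk with pickle.
"""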
import pickle

from base_class import AbstractPDFParser
from scipdf_utils import parse_pdf_to_dict


class GrobidSciPDFPaser(AbstractPDFParser):
    # import pysbd
    # seg_en = pysbd.Segmenter(language="en", clean=False)
    # seg_chinese = pysbd.Segmenter(language="zh", clean=False)
    def __init__(self, pdf_link, db_name="grobid_scipdf", short_threshold=30) -> None:
        """Initialize the PDF parser.

        Args:
            pdf_link: link to the PDF file; either a web link or a local file path
            db_name: name of the on-disk cache used to store parse results
            short_threshold: paragraphs of at most this many characters are
                merged into the preceding paragraph

        Attributes:
            metadata: metadata of the PDF file, e.g. authors, title, abstract
            flatten_paragraphs: list of all paragraphs of the PDF file, with
                every section's paragraphs concatenated together
            split_paragraphs: list of dicts pairing each section heading with
                its list of paragraphs
        """
        super().__init__(db_name=db_name)
        self.db_name = db_name
        self.pdf_link = pdf_link
        self.pdf = None
        self.metadata = {}
        self.flatten_paragraphs = None
        self.split_paragraphs = None
        self.short_threshold = short_threshold
        self.parse_pdf()

    def _concat_too_short_paragraphs(self) -> None:
        """Merge too-short paragraphs into the preceding paragraph and discard
        whitespace-only ones."""
        for i, section in enumerate(self.split_paragraphs):
            paragraphs = section['texts']
            new_paragraphs = []
            for paragraph in paragraphs:
                if len(paragraph.strip()) == 0:
                    # Discard whitespace-only paragraphs.
                    continue
                if len(paragraph) <= self.short_threshold and new_paragraphs:
                    # Too short to stand alone: merge into the previous paragraph.
                    new_paragraphs[-1] += paragraph
                else:
                    new_paragraphs.append(paragraph)
            self.split_paragraphs[i]['texts'] = new_paragraphs

    @staticmethod
    def _find_largest_font_string(file_name, search_string):
        """Return the 1-based page number on which `search_string` appears in
        the largest font (section headings are usually the largest text), or
        -1 if the string is not found or the file cannot be read."""
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LTChar, LTTextContainer

        search_string = search_string.strip()
        max_font_size = -1
        page_number = -1
        try:
            for index, page_layout in enumerate(extract_pages(file_name)):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        for text_line in element:
                            if search_string in text_line.get_text():
                                for character in text_line:
                                    if isinstance(character, LTChar):
                                        if character.size > max_font_size:
                                            max_font_size = character.size
                                            page_number = index
            return page_number + 1 if page_number != -1 else -1
        except Exception:
            return -1

    def _find_section_page(self, section_name) -> int:
        # For web links pdfminer cannot open the path, so this falls back to -1.
        return GrobidSciPDFPaser._find_largest_font_string(self.pdf_link, section_name)

    def _retrieve_or_parse(self):
        """Return the pdf dict from the cache if present, otherwise parse the
        pdf and persist the updated cache."""
        key = (self.pdf_link, self.db_name)
        if key not in self.db_cache:
            self.db_cache[key] = parse_pdf_to_dict(self.pdf_link)
            with open(self.db_cache_path, "wb") as db_cache_file:
                pickle.dump(self.db_cache, db_cache_file)
        return self.db_cache[key]

    @staticmethod
    def _check_chinese(text) -> bool:
        """Return True if `text` contains any CJK unified ideograph."""
        return any(u'\u4e00' <= char <= u'\u9fff' for char in text)

    def parse_pdf(self) -> None:
        """Parse the PDF file into metadata, per-section paragraphs, and a
        flat paragraph list."""
        self.article_dict = self._retrieve_or_parse()
        self._get_metadata()
        self.split_paragraphs = self.get_split_paragraphs()
        self._concat_too_short_paragraphs()
        self.flatten_paragraphs = self.get_paragraphs()

    def get_paragraphs(self) -> list:
        """Return all paragraphs of the PDF file as a flat list and build a
        paragraph -> section-heading lookup in `content2section`."""
        paragraphs = []
        self.content2section = {}
        for section in self.split_paragraphs:
            # paragraphs += [section["heading"]]
            paragraphs += section["texts"]
            for para in section["texts"]:
                self.content2section[para] = section["heading"]
        return paragraphs

    def _get_metadata(self) -> None:
        for meta in ['authors', 'pub_date', 'abstract', 'references', 'doi', 'title']:
            self.metadata[meta] = self.article_dict[meta]
        self.section_names = [section["heading"]
                              for section in self.article_dict['sections']]
        self.section_names2page = {}
        for section_name in self.section_names:
            self.section_names2page[section_name] = self._find_section_page(
                section_name)
        self.section_names_with_page_index = [
            "{} (Page {})".format(section_name, self.section_names2page[section_name])
            for section_name in self.section_names]

    def get_split_paragraphs(self) -> list:
        """Return a list of {"heading": ..., "texts": [...]} dicts, one per
        section of the article."""
        section_pair_list = []
        for section in self.article_dict['sections']:
            section_pair_list.append({
                "heading": section["heading"],
                "texts": section["all_paragraphs"],
            })
        return section_pair_list

    # @staticmethod
    # def _determine_optimal_split_of_paragraphs(section_pair_list) -> None:
    #     """
    #     Split based on some magic rules.
    #     """
    #     import pysbd
    #     for section_pair in section_pair_list:
    #         if GrobidSciPDFPaser._check_chinese(section_pair["texts"]):
    #             seg = GrobidSciPDFPaser.seg_chinese
    #         else:
    #             seg = GrobidSciPDFPaser.seg_en
    #         section_pair["texts"] = seg.segment(section_pair["texts"])
    #         section_pair["texts"] = [
    #             para for para in section_pair["texts"] if len(para) > 2]
    #     return section_pair_list
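

# Minimal usage sketch, not part of the class above. It assumes a GROBID
# service is reachable for scipdf_utils.parse_pdf_to_dict and that
# AbstractPDFParser provides the `db_cache` / `db_cache_path` attributes used
# for caching; the arXiv URL is only illustrative.
if __name__ == "__main__":
    parser = GrobidSciPDFPaser("https://arxiv.org/pdf/1706.03762.pdf")
    print(parser.metadata["title"])
    print(parser.section_names_with_page_index)
    for paragraph in parser.flatten_paragraphs[:3]:
        # content2section maps each paragraph back to its section heading.
        print(paragraph[:80], "->", parser.content2section[paragraph])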