import re
import os
import os.path as op
from glob import glob
import urllib.request
import subprocess
import requests
from bs4 import BeautifulSoup, NavigableString


GROBID_URL = "http://localhost:8070"
DIR_PATH = op.dirname(op.abspath(__file__))
PDF_FIGURES_JAR_PATH = op.join(
    DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar"
)


def list_pdf_paths(pdf_folder: str):
    """
    Return a list of PDF paths found under ``pdf_folder``
    (assumes a ``<pdf_folder>/*/*/*.pdf`` directory layout).
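
    Example
    =======
    A sketch; the folder name below is hypothetical, and PDFs are assumed to sit
    two directory levels below it:

    >> pdf_paths = list_pdf_paths("raw_pdf_folder")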
    """
    return glob(op.join(pdf_folder, "*", "*", "*.pdf"))


def validate_url(path: str):
    """
    Check whether a given ``path`` is a URL or not.
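
    Example
    =======
    A sketch; strings without an http/ftp scheme do not count as URLs:

    >> validate_url("https://www.example.com/paper.pdf")  # True
    >> validate_url("local_folder/paper.pdf")  # False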
    """
    regex = re.compile(
        r"^(?:http|ftp)s?://"
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
        r"localhost|"
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
        r"(?::\d+)?"
        r"(?:/?|[/?]\S+)$",
        re.IGNORECASE,
    )
    return re.match(regex, path) is not None


def parse_pdf(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse a PDF into XML or BeautifulSoup using the GROBID tool

    See http://grobid.readthedocs.io/en/latest/Install-Grobid/ for how to install
    and run GROBID locally. After downloading the GROBID zip file, you can start
    the service with
    >> ./gradlew run

    Parameters
    ==========
    pdf_path: str or bytes, path or URL to a publication or article, or a bytes string of a PDF
    fulltext: bool, option for parsing; if True, parse the full text of the article,
        if False, parse only the header
    grobid_url: str, URL to the GROBID parser, default is 'http://localhost:8070'.
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service.
    soup: bool, if True, return a BeautifulSoup of the article

    Output
    ======
    parsed_article: if soup is False, return the parsed XML as text,
        else return a BeautifulSoup of the XML

    Example
    =======
    >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True)
    """

    if fulltext:
        url = "%s/api/processFulltextDocument" % grobid_url
    else:
        url = "%s/api/processHeaderDocument" % grobid_url

    if isinstance(pdf_path, str):
        if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf":
            print("The input URL has to end with ``.pdf``")
            parsed_article = None
        elif validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() == ".pdf":
            page = urllib.request.urlopen(pdf_path).read()
            parsed_article = requests.post(url, files={"input": page}).text
        elif op.exists(pdf_path):
            # close the file handle after the request instead of leaking it
            with open(pdf_path, "rb") as pdf_file:
                parsed_article = requests.post(url, files={"input": pdf_file}).text
        else:
            parsed_article = None
    elif isinstance(pdf_path, bytes):
        parsed_article = requests.post(url, files={"input": pdf_path}).text
    else:
        parsed_article = None

    if soup and parsed_article is not None:
        parsed_article = BeautifulSoup(parsed_article, "lxml")
    return parsed_article


def parse_authors(article):
    """
    Parse authors from a given BeautifulSoup of an article
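
    Example
    =======
    A sketch, assuming ``article`` is the soup returned by ``parse_pdf(pdf_path, soup=True)``:

    >> authors = parse_authors(article)  # "first [middle] last" names joined by "; "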
    """
    author_names = article.find("sourcedesc").find_all("persname")
    authors = []
    for author in author_names:
        firstname = author.find("forename", {"type": "first"})
        firstname = firstname.text.strip() if firstname is not None else ""
        middlename = author.find("forename", {"type": "middle"})
        middlename = middlename.text.strip() if middlename is not None else ""
        lastname = author.find("surname")
        lastname = lastname.text.strip() if lastname is not None else ""
        if middlename != "":
            authors.append(firstname + " " + middlename + " " + lastname)
        else:
            authors.append(firstname + " " + lastname)
    authors = "; ".join(authors)
    return authors


def parse_date(article):
    """
    Parse the publication date from a given BeautifulSoup of an article
    """
    pub_date = article.find("publicationstmt")
    year = pub_date.find("date")
    year = year.attrs.get("when", "") if year is not None else ""
    return year


def parse_abstract(article):
    """
    Parse the abstract from a given BeautifulSoup of an article
    """
    abstract = ""
    div = article.find("abstract")
    # some parsed articles may not contain an ``abstract`` tag at all
    if div is None:
        return abstract
    for p in list(div.children):
        if not isinstance(p, NavigableString) and len(list(p)) > 0:
            abstract += " ".join(
                [elem.text for elem in p if not isinstance(elem, NavigableString)]
            )
    return abstract


def calculate_number_of_references(div):
    """
    For a given section ``div``, calculate the number of references made in the section
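
    Example
    =======
    A sketch, assuming ``div`` is one of the TEI ``<div>`` tags from the article body:

    >> calculate_number_of_references(div)  # {'n_publication_ref': ..., 'n_figure_ref': ...}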
    """
    n_publication_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"]
    )
    n_figure_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"]
    )
    return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref}


def parse_sections(article, as_list: bool = False):
    """
    Parse a list of sections from a given BeautifulSoup of an article

    Parameters
    ==========
    as_list: bool, if True, output each section's text as a list of paragraphs
        instead of joining it together as one single string
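
    Output
    ======
    sections: list of dict, each with keys ``heading``, ``text``, ``all_paragraphs``,
        ``n_publication_ref`` and ``n_figure_ref``

    Example
    =======
    A sketch, assuming ``article`` is the soup returned by ``parse_pdf(pdf_path, soup=True)``:

    >> sections = parse_sections(article, as_list=False)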
    """
    article_text = article.find("text")
    divs = article_text.find_all(
        "div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"})
    sections = []
    for div in divs:
        div_list = list(div.children)
        if len(div_list) == 0:
            heading = ""
            text = ""
            all_paragraphs = []
        elif len(div_list) == 1:
            if isinstance(div_list[0], NavigableString):
                heading = str(div_list[0])
                text = ""
                all_paragraphs = []
            else:
                heading = ""
                text = div_list[0].text
                all_paragraphs = [text]
        else:
            text = []
            heading = div_list[0]
            all_paragraphs = []
            if isinstance(heading, NavigableString):
                heading = str(heading)
                p_all = list(div.children)[1:]
            else:
                heading = ""
                p_all = list(div.children)
            for p in p_all:
                if p is not None:
                    try:
                        text.append(p.text)
                        all_paragraphs.append(p.text)
                    except Exception:
                        # skip children that do not expose a ``text`` attribute
                        pass
            if not as_list:
                text = "\n".join(text)
        if heading != "" or text != "":
            ref_dict = calculate_number_of_references(div)
            sections.append(
                {
                    "heading": heading,
                    "text": text,
                    "all_paragraphs": all_paragraphs,
                    "n_publication_ref": ref_dict["n_publication_ref"],
                    "n_figure_ref": ref_dict["n_figure_ref"],
                }
            )
    return sections


def parse_references(article):
    """
    Parse the list of references from a given BeautifulSoup of an article
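
    Output
    ======
    reference_list: list of dict, each with keys ``title``, ``journal``, ``year`` and ``authors``

    Example
    =======
    A sketch, assuming ``article`` is the soup returned by ``parse_pdf(pdf_path, soup=True)``:

    >> references = parse_references(article)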
    """
    references = article.find("text").find("div", attrs={"type": "references"})
    references = references.find_all(
        "biblstruct") if references is not None else []
    reference_list = []
    for reference in references:
        title = reference.find("title", attrs={"level": "a"})
        if title is None:
            title = reference.find("title", attrs={"level": "m"})
        title = title.text if title is not None else ""
        journal = reference.find("title", attrs={"level": "j"})
        journal = journal.text if journal is not None else ""
        if journal == "":
            journal = reference.find("publisher")
            journal = journal.text if journal is not None else ""
        year = reference.find("date")
        year = year.attrs.get("when", "") if year is not None else ""
        authors = []
        for author in reference.find_all("author"):
            firstname = author.find("forename", {"type": "first"})
            firstname = firstname.text.strip() if firstname is not None else ""
            middlename = author.find("forename", {"type": "middle"})
            middlename = middlename.text.strip() if middlename is not None else ""
            lastname = author.find("surname")
            lastname = lastname.text.strip() if lastname is not None else ""
            if middlename != "":
                authors.append(firstname + " " + middlename + " " + lastname)
            else:
                authors.append(firstname + " " + lastname)
        authors = "; ".join(authors)
        reference_list.append(
            {"title": title, "journal": journal, "year": year, "authors": authors}
        )
    return reference_list


def parse_figure_caption(article):
    """
    Parse the list of figures/tables from a given BeautifulSoup of an article
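
    Output
    ======
    figures_list: list of dict, each with keys ``figure_label``, ``figure_type``,
        ``figure_id``, ``figure_caption`` and ``figure_data``

    Example
    =======
    A sketch, assuming ``article`` is the soup returned by ``parse_pdf(pdf_path, soup=True)``:

    >> figures = parse_figure_caption(article)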
    """
    figures_list = []
    figures = article.find_all("figure")
    for figure in figures:
        figure_type = figure.attrs.get("type") or ""
        figure_id = figure.attrs.get("xml:id") or ""
        label = figure.find("label")
        label = label.text if label is not None else ""
        if figure_type == "table":
            caption = figure.find("figdesc")
            caption = caption.text if caption is not None else ""
            data = figure.table.text if figure.table is not None else ""
        else:
            caption = figure.text
            data = ""
        figures_list.append(
            {
                "figure_label": label,
                "figure_type": figure_type,
                "figure_id": figure_id,
                "figure_caption": caption,
                "figure_data": data,
            }
        )
    return figures_list


def convert_article_soup_to_dict(article, as_list: bool = False):
    """
    Convert a BeautifulSoup of a parsed article to a JSON-like dictionary,
    similar to the output from https://github.com/allenai/science-parse/

    Parameters
    ==========
    article: BeautifulSoup
    as_list: bool, if True, return each section's text as a list of paragraphs

    Output
    ======
    article_json: dict, parsed dictionary of a given article in the following format
        {
            'title': ...,
            'authors': ...,
            'pub_date': ...,
            'abstract': ...,
            'sections': [
                {'heading': ..., 'text': ...},
                {'heading': ..., 'text': ...},
                ...
            ],
            'references': [
                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
                ...
            ],
            'figures': [
                {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...},
                ...
            ],
            'doi': ...
        }
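
    Example
    =======
    A sketch, assuming ``article`` is the soup returned by ``parse_pdf(pdf_path, soup=True)``:

    >> article_json = convert_article_soup_to_dict(article)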
    """
    article_dict = {}
    if article is not None:
        title = article.find("title", attrs={"type": "main"})
        title = title.text.strip() if title is not None else ""
        article_dict["authors"] = parse_authors(article)
        article_dict["pub_date"] = parse_date(article)
        article_dict["title"] = title
        article_dict["abstract"] = parse_abstract(article)
        article_dict["sections"] = parse_sections(article, as_list=as_list)
        article_dict["references"] = parse_references(article)
        article_dict["figures"] = parse_figure_caption(article)

        doi = article.find("idno", attrs={"type": "DOI"})
        doi = doi.text if doi is not None else ""
        article_dict["doi"] = doi

        return article_dict
    else:
        return None


def parse_pdf_to_dict(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = True,
    as_list: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse the given PDF and return a dictionary of the parsed article

    Parameters
    ==========
    pdf_path: str, path to a publication or article
    fulltext: bool, whether to extract the full text or only the header
    soup: bool, whether to parse via BeautifulSoup; leave as True so the result
        can be converted to a dictionary
    as_list: bool, whether to return each section's text as a list of paragraphs
    grobid_url: str, URL to the GROBID server, default is `GROBID_URL`.
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service.

    Output
    ======
    article_dict: dict, dictionary of an article
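
    Example
    =======
    A sketch, assuming a GROBID server is running at ``grobid_url``:

    >> article_dict = parse_pdf_to_dict("path/to/article.pdf")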
    """
    parsed_article = parse_pdf(
        pdf_path, fulltext=fulltext, soup=soup, grobid_url=grobid_url
    )
    article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
    return article_dict


def parse_figures(
    pdf_folder: str,
    jar_path: str = PDF_FIGURES_JAR_PATH,
    resolution: int = 300,
    output_folder: str = "figures",
):
    """
    Parse figures from the scientific PDFs in a given folder using pdffigures2

    Parameters
    ==========
    pdf_folder: str, path to a folder that contains PDF files. The folder must contain only PDF files.
    jar_path: str, default path to the pdffigures2-assembly-0.0.12-SNAPSHOT.jar file
    resolution: int, resolution of the output figures
    output_folder: str, path to the folder where the parsed data (related to figures) and the figures are saved

    Output
    ======
    folder: creates ``output_folder/data`` and ``output_folder/figures`` containing
        the parsed data and the rendered figures, respectively
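
    Example
    =======
    A sketch, assuming Java is available and the pdffigures2 assembly JAR exists at ``jar_path``:

    >> parse_figures("path/to/pdf_folder", output_folder="figures")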
    """
    if not op.isdir(output_folder):
        os.makedirs(output_folder)

    data_path = op.join(output_folder, "data")
    figure_path = op.join(output_folder, "figures")
    if not op.exists(data_path):
        os.makedirs(data_path)
    if not op.exists(figure_path):
        os.makedirs(figure_path)

    if op.isdir(data_path) and op.isdir(figure_path):
        args = [
            "java",
            "-jar",
            jar_path,
            pdf_folder,
            "-i",
            str(resolution),
            "-d",
            op.join(op.abspath(data_path), ""),
            "-m",
            op.join(op.abspath(figure_path), ""),
        ]
        _ = subprocess.run(
            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
        )
        print("Done parsing figures from PDFs!")
    else:
        print("You may have to check the ``data`` and ``figures`` subfolders in the output folder path.")