import os
import os.path as op
import re
import subprocess
import urllib
import urllib.request
from glob import glob

import requests
from bs4 import BeautifulSoup, NavigableString

# Base URL of the GROBID REST service used when a caller does not pass one.
GROBID_URL = "http://localhost:8070"
# Directory that contains this module; the pdffigures2 jar is resolved relative to it.
DIR_PATH = op.dirname(op.abspath(__file__))
# Default jar location used as the ``jar_path`` default of ``parse_figures``.
PDF_FIGURES_JAR_PATH = op.join(
    DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar"
)
def list_pdf_paths(pdf_folder: str):
    """
    Collect the paths of PDF files located exactly two directory levels
    below ``pdf_folder`` (i.e. ``pdf_folder/<dir>/<dir>/<name>.pdf``)
    """
    pattern = op.join(pdf_folder, "*", "*", "*.pdf")
    matches = glob(pattern)
    return matches
# Compiled once at import time. The character classes below only list
# upper-case letters, so ``re.IGNORECASE`` is what allows ordinary
# lower-case host names (e.g. ``arxiv.org``) to match.
_URL_PATTERN = re.compile(
    r"^(?:http|ftp)s?://"  # scheme: http, https, ftp or ftps
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain name
    r"localhost|"  # or the literal host "localhost"
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # or a dotted-quad IPv4 address
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",  # optional path / query string
    re.IGNORECASE,
)


def validate_url(path: str):
    """
    Validate a given ``path`` if it is URL or not

    Parameters
    ==========
    path: str, candidate string to test

    Output
    ======
    is_url: bool, True if ``path`` is an http(s)/ftp(s) URL whose host is a
        domain name, ``localhost`` or an IPv4 address; False otherwise
    """
    return _URL_PATTERN.match(path) is not None
def parse_pdf(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Function to parse PDF to XML or BeautifulSoup using GROBID tool

    You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally
    After loading GROBID zip file, you can run GROBID by using the following
    >> ./gradlew run

    Parameters
    ==========
    pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF
    fulltext: bool, option for parsing, if True, parse full text of the article
        if False, parse only header
    grobid_url: str, url to GROBID parser, default at 'http://localhost:8070'
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service
        (a trailing slash is tolerated)
    soup: bool, if True, return BeautifulSoup of the article

    Output
    ======
    parsed_article: if soup is False, return parsed XML in text format,
        else return BeautifulSoup of the XML.
        None is returned when ``pdf_path`` is a URL that does not end with
        ``.pdf``, a string that is neither a URL nor an existing path, or a
        value that is neither str nor bytes.

    Example
    =======
    >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True)
    """
    endpoint = "processFulltextDocument" if fulltext else "processHeaderDocument"
    # Strip a trailing slash so that a base URL such as ".../grobid/" does not
    # produce ".../grobid//api/...".
    url = "%s/api/%s" % (grobid_url.rstrip("/"), endpoint)

    parsed_article = None
    if isinstance(pdf_path, str):
        if validate_url(pdf_path):
            # The extension check is case-insensitive for both the rejection
            # and the download branch, so "….PDF" URLs are fetched as well.
            if op.splitext(pdf_path)[-1].lower() != ".pdf":
                print("The input URL has to end with ``.pdf``")
            else:
                with urllib.request.urlopen(pdf_path) as response:
                    page = response.read()
                parsed_article = requests.post(url, files={"input": page}).text
        elif op.exists(pdf_path):
            # Keep the file open only for the duration of the upload.
            with open(pdf_path, "rb") as pdf_file:
                parsed_article = requests.post(
                    url, files={"input": pdf_file}
                ).text
    elif isinstance(pdf_path, bytes):
        parsed_article = requests.post(url, files={"input": pdf_path}).text

    if soup and parsed_article is not None:
        parsed_article = BeautifulSoup(parsed_article, "lxml")
    return parsed_article
def parse_authors(article):
    """
    Parse authors from a given BeautifulSoup of an article

    Parameters
    ==========
    article: BeautifulSoup, parsed article

    Output
    ======
    authors: str, author names ("first middle last", missing parts omitted)
        joined by "; ". An empty string is returned when the article has no
        ``sourcedesc`` element.
    """
    source_desc = article.find("sourcedesc")
    if source_desc is None:
        # Without a source description there are no author names to read.
        return ""

    authors = []
    for author in source_desc.find_all("persname"):
        name_tags = (
            author.find("forename", {"type": "first"}),
            author.find("forename", {"type": "middle"}),
            author.find("surname"),
        )
        name_parts = [tag.text.strip() for tag in name_tags if tag is not None]
        # Joining only the non-empty parts avoids stray leading/trailing
        # spaces when e.g. the first name is missing.
        full_name = " ".join(part for part in name_parts if part)
        if full_name:
            authors.append(full_name)
    return "; ".join(authors)
def parse_date(article):
    """
    Parse date from a given BeautifulSoup of an article

    Parameters
    ==========
    article: BeautifulSoup, parsed article

    Output
    ======
    year: value of the ``when`` attribute of the first ``date`` element inside
        ``publicationstmt``. An empty string is returned when either element
        is missing; None is returned when the ``date`` element has no
        ``when`` attribute.
    """
    pub_date = article.find("publicationstmt")
    if pub_date is None:
        # No publication statement at all: treat like a missing date.
        return ""
    date_tag = pub_date.find("date")
    if date_tag is None:
        return ""
    return date_tag.attrs.get("when")
def parse_abstract(article):
    """
    Parse abstract from a given BeautifulSoup of an article

    Parameters
    ==========
    article: BeautifulSoup, parsed article

    Output
    ======
    abstract: str, text of the tag elements nested one level below each
        child tag of ``abstract``, joined by a single space within a child.
        An empty string is returned when there is no ``abstract`` element.
    """
    div = article.find("abstract")
    if div is None:
        return ""

    chunks = []
    for child in div.children:
        # Skip bare strings (e.g. whitespace between tags) and empty tags.
        if isinstance(child, NavigableString) or len(list(child)) == 0:
            continue
        chunks.append(
            " ".join(
                elem.text for elem in child if not isinstance(elem, NavigableString)
            )
        )
    # Chunks from different children are concatenated without a separator.
    return "".join(chunks)
def calculate_number_of_references(div):
    """
    Count the ``ref`` elements of a section by their ``type`` attribute

    Output
    ======
    counts: dict with ``n_publication_ref`` (refs of type "bibr") and
        ``n_figure_ref`` (refs of type "figure")
    """
    ref_types = [ref.attrs.get("type") for ref in div.find_all("ref")]
    return {
        "n_publication_ref": ref_types.count("bibr"),
        "n_figure_ref": ref_types.count("figure"),
    }
def parse_sections(article, as_list: bool = False):
    """
    Parse list of sections from a given BeautifulSoup of an article

    Parameters
    ==========
    article: BeautifulSoup, parsed article
    as_list: bool, if True, output text as a list of paragraph instead
        of joining it together as one single text

    Output
    ======
    sections: list of dict with keys ``heading``, ``text``, ``all_paragraphs``,
        ``n_publication_ref`` and ``n_figure_ref``. An empty list is returned
        when the article has no ``text`` element.
    """
    article_text = article.find("text")
    if article_text is None:
        return []

    divs = article_text.find_all(
        "div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}
    )
    sections = []
    for div in divs:
        div_list = list(div.children)
        if len(div_list) == 0:
            heading = ""
            text = ""
            all_paragraphs = []
        elif len(div_list) == 1:
            only_child = div_list[0]
            if isinstance(only_child, NavigableString):
                # A lone string child is treated as a heading without body.
                heading = str(only_child)
                text = ""
                all_paragraphs = []
            else:
                heading = ""
                text = only_child.text
                all_paragraphs = [text]
        else:
            text = []
            all_paragraphs = []
            first_child = div_list[0]
            if isinstance(first_child, NavigableString):
                # A leading string child is the heading; the rest is body.
                heading = str(first_child)
                p_all = div_list[1:]
            else:
                heading = ""
                p_all = div_list
            for p in p_all:
                if p is None:
                    continue
                try:
                    paragraph = p.text
                except AttributeError:
                    # Child without a ``text`` attribute: skip it.
                    continue
                text.append(paragraph)
                all_paragraphs.append(paragraph)
            if not as_list:
                text = "\n".join(text)

        if heading != "" or text != "":
            ref_dict = calculate_number_of_references(div)
            sections.append(
                {
                    "heading": heading,
                    "text": text,
                    "all_paragraphs": all_paragraphs,
                    "n_publication_ref": ref_dict["n_publication_ref"],
                    "n_figure_ref": ref_dict["n_figure_ref"],
                }
            )
    return sections
def parse_references(article):
    """
    Parse list of references from a given BeautifulSoup of an article

    Parameters
    ==========
    article: BeautifulSoup, parsed article

    Output
    ======
    reference_list: list of dict with keys ``title``, ``journal``, ``year``
        and ``authors`` (names joined by "; "). An empty list is returned when
        the article has no ``text`` element or no references ``div``.
    """
    article_text = article.find("text")
    if article_text is None:
        return []

    references = article_text.find("div", attrs={"type": "references"})
    references = references.find_all("biblstruct") if references is not None else []

    reference_list = []
    for reference in references:
        # Prefer the level "a" title and fall back to the level "m" title.
        title = reference.find("title", attrs={"level": "a"})
        if title is None:
            title = reference.find("title", attrs={"level": "m"})
        title = title.text if title is not None else ""

        # Use the level "j" title as journal, otherwise the publisher.
        journal = reference.find("title", attrs={"level": "j"})
        journal = journal.text if journal is not None else ""
        if journal == "":
            journal = reference.find("publisher")
            journal = journal.text if journal is not None else ""

        # ``year`` is None when the date element has no ``when`` attribute.
        year = reference.find("date")
        year = year.attrs.get("when") if year is not None else ""

        authors = []
        for author in reference.find_all("author"):
            name_tags = (
                author.find("forename", {"type": "first"}),
                author.find("forename", {"type": "middle"}),
                author.find("surname"),
            )
            name_parts = [tag.text.strip() for tag in name_tags if tag is not None]
            # Join only non-empty parts so a missing first name does not
            # leave a stray leading space; drop entirely empty names.
            full_name = " ".join(part for part in name_parts if part)
            if full_name:
                authors.append(full_name)
        authors = "; ".join(authors)

        reference_list.append(
            {"title": title, "journal": journal, "year": year, "authors": authors}
        )
    return reference_list
def parse_figure_caption(article):
    """
    Parse list of figures/tables from a given BeautifulSoup of an article

    Parameters
    ==========
    article: BeautifulSoup, parsed article

    Output
    ======
    figures_list: list of dict with keys ``figure_label``, ``figure_type``,
        ``figure_id``, ``figure_caption`` and ``figure_data``. Missing
        attributes or child elements yield empty strings.
    """
    figures_list = []
    for figure in article.find_all("figure"):
        figure_type = figure.attrs.get("type") or ""
        # ``.get`` instead of indexing: a figure without ``xml:id`` must not
        # raise KeyError.
        figure_id = figure.attrs.get("xml:id") or ""
        label_tag = figure.find("label")
        label = label_tag.text if label_tag is not None else ""
        if figure_type == "table":
            desc_tag = figure.find("figdesc")
            caption = desc_tag.text if desc_tag is not None else ""
            table_tag = figure.table
            data = table_tag.text if table_tag is not None else ""
        else:
            # For non-table figures the whole element text is the caption.
            caption = figure.text
            data = ""
        figures_list.append(
            {
                "figure_label": label,
                "figure_type": figure_type,
                "figure_id": figure_id,
                "figure_caption": caption,
                "figure_data": data,
            }
        )
    return figures_list
def convert_article_soup_to_dict(article, as_list: bool = False):
    """
    Function to convert BeautifulSoup to JSON format
    similar to the output from https://github.com/allenai/science-parse/

    Parameters
    ==========
    article: BeautifulSoup, or None
    as_list: bool, forwarded to ``parse_sections``

    Output
    ======
    article_json: dict, parsed dictionary of a given article in the following format
        {
            'authors': ...,
            'pub_date': ...,
            'title': ...,
            'abstract': ...,
            'sections': [
                {'heading': ..., 'text': ...},
                {'heading': ..., 'text': ...},
                ...
            ],
            'references': [
                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
                {'title': ..., 'journal': ..., 'year': ..., 'authors': ...},
                ...
            ],
            'figures': [
                {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...},
                ...
            ],
            'doi': ...
        }
        None is returned when ``article`` is None.
    """
    if article is None:
        return None

    title_tag = article.find("title", attrs={"type": "main"})
    main_title = "" if title_tag is None else title_tag.text.strip()

    article_dict = {
        "authors": parse_authors(article),
        "pub_date": parse_date(article),
        "title": main_title,
        "abstract": parse_abstract(article),
        "sections": parse_sections(article, as_list=as_list),
        "references": parse_references(article),
        "figures": parse_figure_caption(article),
    }

    doi_tag = article.find("idno", attrs={"type": "DOI"})
    article_dict["doi"] = "" if doi_tag is None else doi_tag.text
    return article_dict
def parse_pdf_to_dict(
    pdf_path: str,
    fulltext: bool = True,
    soup: bool = True,
    as_list: bool = False,
    grobid_url: str = GROBID_URL,
):
    """
    Parse the given PDF and return dictionary of the parsed article

    Parameters
    ==========
    pdf_path: str, path to publication or article
    fulltext: bool, whether to extract fulltext or not
    soup: bool, kept for backward compatibility only. The conversion to a
        dictionary always needs a BeautifulSoup object, so the PDF is parsed
        with ``soup=True`` regardless of this value.
    as_list: bool, whether to return list of sections or not
    grobid_url: str, url to grobid server, default is `GROBID_URL`
        This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service

    Output
    ======
    article_dict: dict, dictionary of an article, or None when the PDF could
        not be parsed
    """
    # Passing ``soup=False`` through would hand a plain XML string to
    # ``convert_article_soup_to_dict``, whose ``find(..., attrs=...)`` calls
    # fail on ``str``; always request the soup instead.
    parsed_article = parse_pdf(
        pdf_path, fulltext=fulltext, soup=True, grobid_url=grobid_url
    )
    article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list)
    return article_dict
def parse_figures(
    pdf_folder: str,
    jar_path: str = PDF_FIGURES_JAR_PATH,
    resolution: int = 300,
    output_folder: str = "figures",
    timeout: float = 20,
):
    """
    Parse figures from the given scientific PDF using pdffigures2

    Parameters
    ==========
    pdf_folder: str, path to a folder that contains PDF files. A folder must contains only PDF files
    jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file
    resolution: int, resolution of the output figures
    output_folder: str, path to folder that we want to save parsed data (related to figures) and figures
    timeout: float, seconds to wait for the java process before
        ``subprocess.TimeoutExpired`` is raised, default is 20

    Output
    ======
    folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures relatively
    """
    if not op.isdir(output_folder):
        os.makedirs(output_folder)

    data_path = op.join(output_folder, "data")
    figure_path = op.join(output_folder, "figures")
    for path in (data_path, figure_path):
        if not op.exists(path):
            os.makedirs(path)

    # Either path may exist as something other than a directory; in that case
    # only report the problem instead of running pdffigures2.
    if not (op.isdir(data_path) and op.isdir(figure_path)):
        print("You may have to check of ``data`` and ``figures`` in the the output folder path.")
        return

    args = [
        "java",
        "-jar",
        jar_path,
        pdf_folder,
        "-i",
        str(resolution),
        # Joining with "" appends a trailing path separator to each folder.
        # NOTE(review): presumably pdffigures2 treats -d / -m as filename
        # prefixes, hence the trailing separator - confirm against its CLI docs.
        "-d",
        op.join(op.abspath(data_path), ""),
        "-m",
        op.join(op.abspath(figure_path), ""),
    ]
    completed = subprocess.run(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout
    )
    if completed.returncode == 0:
        print("Done parsing figures from PDFs!")
    else:
        # Do not claim success when the java process reported a failure.
        print(
            "pdffigures2 exited with code %d: %s"
            % (
                completed.returncode,
                completed.stderr.decode(errors="replace").strip(),
            )
        )