import re import os import os.path as op from glob import glob import urllib import subprocess import requests from bs4 import BeautifulSoup, NavigableString # or https://cloud.science-miner.com/grobid/ for cloud service GROBID_URL = "http://localhost:8070" DIR_PATH = op.dirname(op.abspath(__file__)) PDF_FIGURES_JAR_PATH = op.join( DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar" ) def list_pdf_paths(pdf_folder: str): """ list of pdf paths in pdf folder """ return glob(op.join(pdf_folder, "*", "*", "*.pdf")) def validate_url(path: str): """ Validate a given ``path`` if it is URL or not """ regex = re.compile( r"^(?:http|ftp)s?://" # http:// or https:// # domain... r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" r"localhost|" # localhost... r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip r"(?::\d+)?" # optional port r"(?:/?|[/?]\S+)$", re.IGNORECASE, ) return re.match(regex, path) is not None def parse_pdf( pdf_path: str, fulltext: bool = True, soup: bool = False, grobid_url: str = GROBID_URL, ): """ Function to parse PDF to XML or BeautifulSoup using GROBID tool You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally After loading GROBID zip file, you can run GROBID by using the following >> ./gradlew run Parameters ========== pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF fulltext: bool, option for parsing, if True, parse full text of the article if False, parse only header grobid_url: str, url to GROBID parser, default at 'http://localhost:8070' This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service soup: bool, if True, return BeautifulSoup of the article Output ====== parsed_article: if soup is False, return parsed XML in text format, else return BeautifulSoup of the XML Example ======= >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True) """ # GROBID URL if fulltext: url = "%s/api/processFulltextDocument" % grobid_url else: url = "%s/api/processHeaderDocument" % grobid_url if isinstance(pdf_path, str): if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf": print("The input URL has to end with ``.pdf``") parsed_article = None elif validate_url(pdf_path) and op.splitext(pdf_path)[-1] == ".pdf": page = urllib.request.urlopen(pdf_path).read() parsed_article = requests.post(url, files={"input": page}).text elif op.exists(pdf_path): parsed_article = requests.post( url, files={"input": open(pdf_path, "rb")} ).text else: parsed_article = None elif isinstance(pdf_path, bytes): # assume that incoming is byte string parsed_article = requests.post(url, files={"input": pdf_path}).text else: parsed_article = None if soup and parsed_article is not None: parsed_article = BeautifulSoup(parsed_article, "lxml") return parsed_article def parse_authors(article): """ Parse authors from a given BeautifulSoup of an article """ author_names = article.find("sourcedesc").findAll("persname") authors = [] for author in author_names: firstname = author.find("forename", {"type": "first"}) firstname = firstname.text.strip() if firstname is not None else "" middlename = author.find("forename", {"type": "middle"}) middlename = middlename.text.strip() if middlename is not None else "" lastname = author.find("surname") lastname = lastname.text.strip() if lastname is not None else "" if middlename != "": authors.append(firstname + " " + middlename + " " + lastname) else: authors.append(firstname + " " + lastname) authors = "; ".join(authors) return authors def parse_date(article): """ Parse date from a given BeautifulSoup of an article """ pub_date = article.find("publicationstmt") year = pub_date.find("date") year = year.attrs.get("when") if year is not None else "" return year def parse_abstract(article): """ Parse abstract from a given BeautifulSoup of an article """ div = article.find("abstract") abstract = "" for p in list(div.children): if not isinstance(p, NavigableString) and len(list(p)) > 0: abstract += " ".join( [elem.text for elem in p if not isinstance( elem, NavigableString)] ) return abstract def calculate_number_of_references(div): """ For a given section, calculate number of references made in the section """ n_publication_ref = len( [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"] ) n_figure_ref = len( [ref for ref in div.find_all( "ref") if ref.attrs.get("type") == "figure"] ) return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref} def parse_sections(article, as_list: bool = False): """ Parse list of sections from a given BeautifulSoup of an article Parameters ========== as_list: bool, if True, output text as a list of paragraph instead of joining it together as one single text """ article_text = article.find("text") divs = article_text.find_all( "div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}) sections = [] for div in divs: div_list = list(div.children) if len(div_list) == 0: heading = "" text = "" all_paragraphs = [] elif len(div_list) == 1: if isinstance(div_list[0], NavigableString): heading = str(div_list[0]) text = "" all_paragraphs = [] else: heading = "" text = div_list[0].text all_paragraphs = [text] else: text = [] heading = div_list[0] all_paragraphs = [] if isinstance(heading, NavigableString): heading = str(heading) p_all = list(div.children)[1:] else: heading = "" p_all = list(div.children) for p in p_all: if p is not None: try: text.append(p.text) all_paragraphs.append(p.text) except: pass if not as_list: text = "\n".join(text) if heading != "" or text != "": ref_dict = calculate_number_of_references(div) sections.append( { "heading": heading, "text": text, "all_paragraphs": all_paragraphs, "n_publication_ref": ref_dict["n_publication_ref"], "n_figure_ref": ref_dict["n_figure_ref"], } ) return sections def parse_references(article): """ Parse list of references from a given BeautifulSoup of an article """ reference_list = [] references = article.find("text").find("div", attrs={"type": "references"}) references = references.find_all( "biblstruct") if references is not None else [] reference_list = [] for reference in references: title = reference.find("title", attrs={"level": "a"}) if title is None: title = reference.find("title", attrs={"level": "m"}) title = title.text if title is not None else "" journal = reference.find("title", attrs={"level": "j"}) journal = journal.text if journal is not None else "" if journal == "": journal = reference.find("publisher") journal = journal.text if journal is not None else "" year = reference.find("date") year = year.attrs.get("when") if year is not None else "" authors = [] for author in reference.find_all("author"): firstname = author.find("forename", {"type": "first"}) firstname = firstname.text.strip() if firstname is not None else "" middlename = author.find("forename", {"type": "middle"}) middlename = middlename.text.strip() if middlename is not None else "" lastname = author.find("surname") lastname = lastname.text.strip() if lastname is not None else "" if middlename != "": authors.append(firstname + " " + middlename + " " + lastname) else: authors.append(firstname + " " + lastname) authors = "; ".join(authors) reference_list.append( {"title": title, "journal": journal, "year": year, "authors": authors} ) return reference_list def parse_figure_caption(article): """ Parse list of figures/tables from a given BeautifulSoup of an article """ figures_list = [] figures = article.find_all("figure") for figure in figures: figure_type = figure.attrs.get("type") or "" figure_id = figure.attrs["xml:id"] or "" label = figure.find("label").text if figure_type == "table": caption = figure.find("figdesc").text data = figure.table.text else: caption = figure.text data = "" figures_list.append( { "figure_label": label, "figure_type": figure_type, "figure_id": figure_id, "figure_caption": caption, "figure_data": data, } ) return figures_list def convert_article_soup_to_dict(article, as_list: bool = False): """ Function to convert BeautifulSoup to JSON format similar to the output from https://github.com/allenai/science-parse/ Parameters ========== article: BeautifulSoup Output ====== article_json: dict, parsed dictionary of a given article in the following format { 'title': ..., 'abstract': ..., 'sections': [ {'heading': ..., 'text': ...}, {'heading': ..., 'text': ...}, ... ], 'references': [ {'title': ..., 'journal': ..., 'year': ..., 'authors': ...}, {'title': ..., 'journal': ..., 'year': ..., 'authors': ...}, ... ], 'figures': [ {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...}, ... ] } """ article_dict = {} if article is not None: title = article.find("title", attrs={"type": "main"}) title = title.text.strip() if title is not None else "" article_dict["authors"] = parse_authors(article) article_dict["pub_date"] = parse_date(article) article_dict["title"] = title article_dict["abstract"] = parse_abstract(article) article_dict["sections"] = parse_sections(article, as_list=as_list) article_dict["references"] = parse_references(article) article_dict["figures"] = parse_figure_caption(article) doi = article.find("idno", attrs={"type": "DOI"}) doi = doi.text if doi is not None else "" article_dict["doi"] = doi return article_dict else: return None def parse_pdf_to_dict( pdf_path: str, fulltext: bool = True, soup: bool = True, as_list: bool = False, grobid_url: str = GROBID_URL, ): """ Parse the given PDF and return dictionary of the parsed article Parameters ========== pdf_path: str, path to publication or article fulltext: bool, whether to extract fulltext or not soup: bool, whether to return BeautifulSoup or not as_list: bool, whether to return list of sections or not grobid_url: str, url to grobid server, default is `GROBID_URL` This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service Ouput ===== article_dict: dict, dictionary of an article """ parsed_article = parse_pdf( pdf_path, fulltext=fulltext, soup=soup, grobid_url=grobid_url ) article_dict = convert_article_soup_to_dict( parsed_article, as_list=as_list) return article_dict def parse_figures( pdf_folder: str, jar_path: str = PDF_FIGURES_JAR_PATH, resolution: int = 300, output_folder: str = "figures", ): """ Parse figures from the given scientific PDF using pdffigures2 Parameters ========== pdf_folder: str, path to a folder that contains PDF files. A folder must contains only PDF files jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file resolution: int, resolution of the output figures output_folder: str, path to folder that we want to save parsed data (related to figures) and figures Output ====== folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures relatively """ if not op.isdir(output_folder): os.makedirs(output_folder) # create ``data`` and ``figures`` subfolder within ``output_folder`` data_path = op.join(output_folder, "data") figure_path = op.join(output_folder, "figures") if not op.exists(data_path): os.makedirs(data_path) if not op.exists(figure_path): os.makedirs(figure_path) if op.isdir(data_path) and op.isdir(figure_path): args = [ "java", "-jar", jar_path, pdf_folder, "-i", str(resolution), "-d", os.path.join(os.path.abspath(data_path), ""), "-m", op.join(os.path.abspath(figure_path), ""), # end path with "/" ] _ = subprocess.run( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20 ) print("Done parsing figures from PDFs!") else: print("You may have to check of ``data`` and ``figures`` in the the output folder path.")