# nianlonggu
# init
# 02ae0bf
from typing import List, Dict, Optional
import bs4
from bs4 import BeautifulSoup
import re
from collections import defaultdict
# Mixed-case tag names that clean_tags() renames to their lowercase form.
SUBSTITUTE_TAGS = {
    "biblScope",
    "orgName",
    "persName",
    "publicationStmt",
    "titleStmt",
}
def clean_tags(el: bs4.element.Tag):
    """
    Rename, in place, every tag found under ``el`` whose name is listed in
    SUBSTITUTE_TAGS to the lowercase form of that name
    :param el: element searched with ``find_all`` for the listed tag names
    :return: None; matching tags are modified in place
    """
    for tag_name in SUBSTITUTE_TAGS:
        lowered_name = tag_name.lower()
        for match in el.find_all(tag_name):
            match.name = lowered_name
def soup_from_path(file_path: str):
    """
    Read an XML file and parse it with BeautifulSoup's "xml" parser
    :param file_path: path of the XML file to read (opened in binary mode)
    :return: the parsed BeautifulSoup document
    """
    # A context manager guarantees the file handle is closed even if
    # reading or parsing raises.
    with open(file_path, "rb") as xml_file:
        return BeautifulSoup(xml_file.read(), "xml")
def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the text of the first <title> whose ``level`` attribute is "a";
    otherwise the text of the first <title> of any kind, or "" if there is none
    :param raw_xml: parsed XML to search
    :return: title text, possibly empty
    """
    level_a_title = next(
        (
            candidate
            for candidate in raw_xml.find_all("title")
            if candidate.has_attr("level") and candidate["level"] == "a"
        ),
        None,
    )
    if level_a_title is not None:
        return level_a_title.text

    # No level="a" title: fall back to the first <title>, if any.
    try:
        fallback = raw_xml.title.text
    except AttributeError:
        # raw_xml.title was None, i.e. no <title> element at all
        fallback = ""
    return fallback
def get_author_names_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Returns a list of dictionaries, one for each <author> that contains a
    <persname>, holding the parsed name parts.
    e.g.
        {
            "first": first,
            "middle": [middle, ...],
            "last": last,
            "suffix": suffix
        }
    "middle" is a list; "first", "last" and "suffix" are strings ("" when absent).
    Authors without a <persname> element are skipped.
    :param raw_xml: parsed XML to search for <author> elements
    :return: list of name dictionaries, in document order
    """
    names = []
    for author in raw_xml.find_all("author"):
        pers_name = author.persname
        if not pers_name:
            continue

        first = ""
        middle = []
        last = ""

        # forenames include first and middle names
        for forename in pers_name.find_all("forename"):
            # Skip forenames without a type instead of raising KeyError
            # (matches get_author_data_from_grobid_xml).
            if not forename.has_attr("type"):
                continue
            if forename["type"] == "first":
                # only the first "first" forename is kept as first name;
                # any further ones are treated as middle names
                if not first:
                    first = forename.text
                else:
                    middle.append(forename.text)
            elif forename["type"] == "middle":
                middle.append(forename.text)

        # the final surname is the last name; earlier ones count as middle names
        surnames = pers_name.find_all("surname")
        if surnames:
            middle.extend(surname.text for surname in surnames[:-1])
            last = surnames[-1].text

        # Join all <suffix> elements. The previous code tested len(suffix),
        # the always-empty string, so the suffix was never populated.
        suffixes = pers_name.find_all("suffix")
        suffix = " ".join(suffix_el.text for suffix_el in suffixes)

        names.append({
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        })
    return names
def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
    """
    Extract laboratory, institution and address parts from the first
    <affiliation> element found under ``raw_xml``
    :param raw_xml: element to search; may be falsy, in which case {} is returned
    :return: {"laboratory": str, "institution": str, "location": dict}, or {}
        when neither a laboratory nor an institution name was found
    """
    if not (raw_xml and raw_xml.affiliation):
        return {}

    org_types = ("laboratory", "institution")
    org_names = {"laboratory": "", "institution": ""}
    location = {}

    for node in raw_xml.affiliation:
        if node.name == "orgname":
            # only typed <orgname> children of the two known kinds are recorded
            if node.has_attr("type") and node["type"] in org_types:
                org_names[node["type"]] = node.text
        elif node.name == "address":
            # each named, non-empty child of <address> becomes a location entry
            for part in node:
                if part.name and part.text:
                    location[part.name] = part.text

    if not any(org_names.values()):
        return {}
    return {
        "laboratory": org_names["laboratory"],
        "institution": org_names["institution"],
        "location": location
    }
def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
    """
    Returns a list of dictionaries, one for each <author>, containing the
    parsed name parts, affiliation and email.
    e.g.
        {
            "first": first,
            "middle": [middle, ...],
            "last": last,
            "suffix": suffix,
            "affiliation": {
                "laboratory": "",
                "institution": "",
                "location": {},
            },
            "email": ""
        }
    "affiliation" is whatever get_affiliation_from_grobid_xml returns for the
    author, which is {} when no laboratory or institution name is found.
    Authors without a <persname> are still included, with empty name parts.
    :param raw_xml: parsed XML to search for <author> elements
    :return: list of author dictionaries, in document order
    """
    authors = []
    for author in raw_xml.find_all("author"):
        first = ""
        middle = []
        last = ""
        suffix = ""

        pers_name = author.persname
        if pers_name:
            # forenames include first and middle names
            for forename in pers_name.find_all("forename"):
                if not forename.has_attr("type"):
                    continue
                if forename["type"] == "first":
                    # only the first "first" forename is kept as first name;
                    # any further ones are treated as middle names
                    if not first:
                        first = forename.text
                    else:
                        middle.append(forename.text)
                elif forename["type"] == "middle":
                    middle.append(forename.text)

            # the final surname is the last name; earlier ones count as middle names
            surnames = pers_name.find_all("surname")
            if surnames:
                middle.extend(surname.text for surname in surnames[:-1])
                last = surnames[-1].text

            # Join all <suffix> elements. The previous code tested len(suffix),
            # the always-empty string, so the suffix was never populated.
            suffixes = pers_name.find_all("suffix")
            suffix = " ".join(suffix_el.text for suffix_el in suffixes)

        affiliation = get_affiliation_from_grobid_xml(author)

        email = ""
        if author.email:
            email = author.email.text

        authors.append({
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": affiliation,
            "email": email
        })
    return authors
def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
    """
    Returns the four-digit year (19xx or 20xx) that the ``when`` attribute of
    the first <date> element starts with
    :param raw_xml: parsed XML to search
    :return: the year as an int, or None if there is no <date>, no ``when``
        attribute, or the attribute does not start with such a year
    """
    date_tag = raw_xml.date
    if not date_tag or not date_tag.has_attr("when"):
        return None

    # re.match anchors at the start of the string, so only a leading year counts
    leading_year = re.match(r"((19|20)\d{2})", date_tag["when"])
    if not leading_year:
        return None

    year_text = leading_year.group(0)
    if year_text and year_text.isnumeric() and len(year_text) == 4:
        return int(year_text)
    return None
def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
    """
    Returns venue/journal/publisher of bib entry
    Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
    level="j": journal title
    level="m": "non journal bibliographical item holding the cited article"
    level="s": series title
    When several titles qualify, "j" is preferred over "m", and "m" over "s";
    among titles of the same level the first one in the document wins.
    :param raw_xml: parsed bib entry to search for <title> elements
    :param title_text: the entry's own title; titles with this exact text are ignored
    :return: the chosen title text, or "" if no title qualifies
    """
    # position in this list doubles as the priority (lower index = preferred)
    keep_types = ["j", "m", "s"]

    # collect all titles of the above types that differ from the entry's own title
    candidates = [
        entry
        for entry in raw_xml.find_all("title")
        if entry.has_attr("level")
        and entry["level"] in keep_types
        and entry.text != title_text
    ]
    if not candidates:
        return ""

    # min() returns the first candidate with the best level, so document
    # order breaks ties
    best = min(candidates, key=lambda entry: keep_types.index(entry["level"]))
    return best.text
def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the volume number of grobid bib entry
    Grobid <biblscope unit="volume">
    :param raw_xml: parsed bib entry to search
    :return: text of the first matching <biblscope>, or "" if there is none
    """
    volume_texts = (
        scope.text
        for scope in raw_xml.find_all("biblscope")
        if scope.has_attr("unit") and scope["unit"] == "volume"
    )
    return next(volume_texts, "")
def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the issue number of grobid bib entry
    Grobid <biblscope unit="issue">
    :param raw_xml: parsed bib entry to search
    :return: text of the first matching <biblscope>, or "" if there is none
    """
    issue_texts = (
        scope.text
        for scope in raw_xml.find_all("biblscope")
        if scope.has_attr("unit") and scope["unit"] == "issue"
    )
    return next(issue_texts, "")
def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the page numbers of grobid bib entry
    Grobid <biblscope unit="page">
    Only the first <biblscope unit="page"> that has a ``from`` attribute is used.
    :param raw_xml: parsed bib entry to search
    :return: "from--to" when a ``to`` attribute is present, just the ``from``
        value when it is not, or "" if no such element exists
    """
    for scope in raw_xml.find_all("biblscope"):
        has_page_start = (
            scope.has_attr("unit")
            and scope["unit"] == "page"
            and scope.has_attr("from")
        )
        if not has_page_start:
            continue

        first_page = scope["from"]
        if not scope.has_attr("to"):
            return first_page
        last_page = scope["to"]
        return f'{first_page}--{last_page}'
    return ""
def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
    """
    Returns a dictionary of other identifiers from grobid bib entry (arxiv, pubmed, doi),
    grouping the text of every typed, non-empty <idno> by its ``type`` attribute
    :param raw_xml: parsed bib entry to search
    :return: defaultdict(list) mapping idno type -> list of idno texts
    """
    ids_by_type = defaultdict(list)
    typed_idnos = (
        idno
        for idno in raw_xml.find_all("idno")
        if idno.has_attr("type") and idno.text
    )
    for idno in typed_idnos:
        ids_by_type[idno["type"]].append(idno.text)
    return ids_by_type
def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the raw bibliography string, taken from the first
    <note type="raw_reference"> element
    :param raw_xml: parsed bib entry to search
    :return: the note's text, or "" if there is no such note
    """
    raw_reference_texts = (
        candidate.text
        for candidate in raw_xml.find_all("note")
        if candidate.has_attr("type") and candidate["type"] == "raw_reference"
    )
    return next(raw_reference_texts, "")
def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Finds and returns the publication datetime if it exists, i.e. the ``when``
    attribute of the first <date type="published"> that is a direct child of
    the publication statement
    :param raw_xml: parsed XML to search
    :return: the ``when`` value, or "" if there is no publication statement or
        no matching <date>
    """
    # clean_tags() renames <publicationStmt> to <publicationstmt> (it is listed
    # in SUBSTITUTE_TAGS), and extract_paper_metadata_from_grobid_xml calls
    # clean_tags() before this function. The previous code tested the
    # mixed-case name but iterated the lowercase one, so it returned "" on
    # cleaned input and tried to iterate None on uncleaned input.
    # Look up the lowercase name first and fall back to the mixed-case one.
    publication_stmt = raw_xml.publicationstmt or raw_xml.publicationStmt
    if not publication_stmt:
        return ""

    for child in publication_stmt:
        if child.name == "date" \
                and child.has_attr("type") \
                and child["type"] == "published" \
                and child.has_attr("when"):
            return child["when"]
    return ""
def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
    """
    Parse one bib entry into a flat dictionary of its fields
    :param bib_entry: the bib entry element; its tags are lowercased in place
        by clean_tags before any field is read
    :return: dict with keys ref_id, title, authors, year, venue, volume, issue,
        pages, other_ids, raw_text and urls (urls is always an empty list)
    """
    clean_tags(bib_entry)

    # the title is computed first because the venue lookup excludes it
    title = get_title_from_grobid_xml(bib_entry)

    parsed = {
        'ref_id': bib_entry.attrs.get("xml:id"),
        'title': title,
    }
    parsed['authors'] = get_author_names_from_grobid_xml(bib_entry)
    parsed['year'] = get_year_from_grobid_xml(bib_entry)
    parsed['venue'] = get_venue_from_grobid_xml(bib_entry, title)
    parsed['volume'] = get_volume_from_grobid_xml(bib_entry)
    parsed['issue'] = get_issue_from_grobid_xml(bib_entry)
    parsed['pages'] = get_pages_from_grobid_xml(bib_entry)
    parsed['other_ids'] = get_other_ids_from_grobid_xml(bib_entry)
    parsed['raw_text'] = get_raw_bib_text_from_grobid_xml(bib_entry)
    parsed['urls'] = []
    return parsed
def is_reference_tag(tag: bs4.element.Tag) -> bool:
    """
    Tell whether ``tag`` is a <ref> element whose ``type`` attribute is "bibr"
    :param tag: element to inspect
    :return: True for <ref type="bibr">, False otherwise
    """
    if tag.name != "ref":
        return False
    return tag.attrs.get("type", "") == "bibr"
def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
    """
    Extract paper metadata (title, authors with affiliation, publication date) from grobid xml
    :param tag: element to read; its tags are lowercased in place by clean_tags first
    :return: dict with keys "title", "authors" and "year"; "year" holds the
        value returned by get_publication_datetime_from_grobid_xml (a string)
    """
    clean_tags(tag)

    # the title is read from the <title> inside the (lowercased) <titlestmt>
    title = tag.titlestmt.title.text
    authors = get_author_data_from_grobid_xml(tag)
    published = get_publication_datetime_from_grobid_xml(tag)

    return {
        "title": title,
        "authors": authors,
        "year": published
    }