nianlonggu
init
02ae0bf
"""
S2ORC classes
"""
from datetime import datetime
from typing import Dict, List, Optional
from doc2json.config import *
# Serialized key -> constructor argument name. Applied to bibliography and
# reference entry dicts before they are passed to their classes.
CORRECT_KEYS = dict(
    issn="issue",
    type="type_str",
)

# Keys removed from a serialized bibliography entry before it is handed to
# BibliographyEntry (see Paper.__init__).
SKIP_KEYS = {"link", "bib_id"}

# Attribute names that ReferenceEntry.as_json emits for each known type_str.
REFERENCE_OUTPUT_KEYS = {
    "figure": {"text", "type_str", "uris", "num"},
    "table": {"text", "type_str", "content", "num", "html"},
    "footnote": {"text", "type_str", "num"},
    "section": {"text", "type_str", "num", "parent"},
    "equation": {"text", "type_str", "latex", "mathml", "num"},
}

# Metadata keys kept by load_s2orc when it filters a paper's metadata dict.
METADATA_KEYS = {"title", "authors", "year", "venue", "identifiers"}
class ReferenceEntry:
    """
    A non-bibliographic reference target of an S2ORC paper (the module's
    output table lists figure, table, footnote, section and equation types).

    Example of the serialized form (values are illustrative, not accurate):
    {
        "FIGREF0": {
            "text": "FIG. 2. Depth profiles of...",
            "latex": null,
            "type": "figure"
        },
        "TABREF2": {
            "text": "Diversity indices of...",
            "latex": null,
            "type": "table",
            "content": "",
            "html": ""
        }
    }
    """

    def __init__(
            self,
            ref_id: str,
            text: str,
            type_str: str,
            latex: Optional[str] = None,
            mathml: Optional[str] = None,
            content: Optional[str] = None,
            html: Optional[str] = None,
            uris: Optional[List[str]] = None,
            num: Optional[str] = None,
            parent: Optional[str] = None
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Identity of the entry.
        self.ref_id, self.type_str = ref_id, type_str
        # Textual / markup payloads.
        self.text, self.content, self.html = text, content, html
        self.latex, self.mathml = latex, mathml
        # Remaining optional fields.
        self.uris, self.num, self.parent = uris, num, parent

    def as_json(self):
        """
        Return the entry as a plain dict (``ref_id`` is never included).

        When ``type_str`` has a non-empty entry in REFERENCE_OUTPUT_KEYS, only
        the attributes listed there are emitted, keyed by attribute name (so
        the type appears under "type_str"). Otherwise every field is emitted
        and the type appears under "type".
        """
        selected = REFERENCE_OUTPUT_KEYS.get(self.type_str, None)
        if not selected:
            return {
                "text": self.text,
                "type": self.type_str,
                "latex": self.latex,
                "mathml": self.mathml,
                "content": self.content,
                "html": self.html,
                "uris": self.uris,
                "num": self.num,
                "parent": self.parent
            }
        return {name: getattr(self, name) for name in selected}
class BibliographyEntry:
    """
    One parsed bibliography entry of an S2ORC paper.

    Example of the serialized form (values are illustrative, not accurate):
    {
        "title": "Mobility Reports...",
        "authors": [
            {
                "first": "A",
                "middle": ["A"],
                "last": "Haija",
                "suffix": ""
            }
        ],
        "year": 2015,
        "venue": "IEEE Wireless Commun. Mag",
        "volume": "42",
        "issn": "9",
        "pages": "80--92",
        "other_ids": {
            "doi": [
                "10.1109/TWC.2014.2360196"
            ]
        }
    }
    """

    def __init__(
            self,
            bib_id: str,
            title: str,
            authors: List[Dict[str, str]],
            ref_id: Optional[str] = None,
            year: Optional[int] = None,
            venue: Optional[str] = None,
            volume: Optional[str] = None,
            issue: Optional[str] = None,
            pages: Optional[str] = None,
            other_ids: Optional[Dict[str, List]] = None,
            num: Optional[int] = None,
            urls: Optional[List] = None,
            raw_text: Optional[str] = None,
            links: Optional[List] = None
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Identifiers.
        self.bib_id, self.ref_id = bib_id, ref_id
        # Core citation data.
        self.title, self.authors = title, authors
        self.year, self.venue = year, venue
        self.volume, self.issue, self.pages = volume, issue, pages
        # Auxiliary data.
        self.other_ids, self.num = other_ids, num
        self.urls, self.raw_text, self.links = urls, raw_text, links

    def as_json(self):
        """
        Return the entry as a plain dict.

        ``bib_id`` is not part of the result; every other attribute is
        emitted under its own name, in the order listed below.
        """
        exported = (
            "ref_id", "title", "authors", "year", "venue", "volume", "issue",
            "pages", "other_ids", "num", "urls", "raw_text", "links",
        )
        return {field: getattr(self, field) for field in exported}
class Affiliation:
    """
    Affiliation details of an author.

    Example:
    {
        "laboratory": "Key Laboratory of Urban Environment and Health",
        "institution": "Chinese Academy of Sciences",
        "location": {
            "postCode": "361021",
            "settlement": "Xiamen",
            "country": "People's Republic of China"
        }
    }
    """

    def __init__(
            self,
            laboratory: str,
            institution: str,
            location: Dict
    ):
        """Store the three fields unchanged."""
        self.laboratory, self.institution, self.location = laboratory, institution, location

    def as_json(self):
        """Return the affiliation as a dict with laboratory, institution and location."""
        return dict(
            laboratory=self.laboratory,
            institution=self.institution,
            location=self.location,
        )
class Author:
    """
    An author of a paper.

    Example:
    {
        "first": "Anyi",
        "middle": [],
        "last": "Hu",
        "suffix": "",
        "affiliation": {
            "laboratory": "Key Laboratory of Urban Environment and Health",
            "institution": "Chinese Academy of Sciences",
            "location": {
                "postCode": "361021",
                "settlement": "Xiamen",
                "country": "People's Republic of China"
            }
        },
        "email": ""
    }
    """

    def __init__(
            self,
            first: str,
            middle: List[str],
            last: str,
            suffix: str,
            affiliation: Optional[Dict] = None,
            email: Optional[str] = None
    ):
        """
        Store the name parts and email unchanged.

        A non-empty ``affiliation`` dict is expanded into an Affiliation
        (its keys are used as keyword arguments); a missing or empty one is
        stored as an empty dict.
        """
        self.first, self.middle = first, middle
        self.last, self.suffix = last, suffix
        if affiliation:
            self.affiliation = Affiliation(**affiliation)
        else:
            self.affiliation = {}
        self.email = email

    def as_json(self):
        """Return the author as a dict; an absent affiliation is emitted as ``{}``."""
        if self.affiliation:
            affiliation_json = self.affiliation.as_json()
        else:
            affiliation_json = {}
        return {
            "first": self.first,
            "middle": self.middle,
            "last": self.last,
            "suffix": self.suffix,
            "affiliation": affiliation_json,
            "email": self.email
        }
class Metadata:
    """
    Paper-level metadata: title, authors, year, venue and identifiers.

    Example:
    {
        "title": "Niche Partitioning...",
        "authors": [
            {
                "first": "Anyi",
                "middle": [],
                "last": "Hu",
                "suffix": "",
                "affiliation": {
                    "laboratory": "Key Laboratory of Urban Environment and Health",
                    "institution": "Chinese Academy of Sciences",
                    "location": {
                        "postCode": "361021",
                        "settlement": "Xiamen",
                        "country": "People's Republic of China"
                    }
                },
                "email": ""
            }
        ],
        "year": "2011-11"
    }
    """

    # Marker for "identifiers was not supplied". Using a marker instead of a
    # literal ``{}`` default gives every instance its own dict (a literal
    # default would be one dict shared by all instances), while an explicitly
    # passed ``None`` is still stored as ``None``.
    _UNSET = object()

    def __init__(
            self,
            title: str,
            authors: List[Dict],
            year: Optional[str] = None,
            venue: Optional[str] = None,
            identifiers: Optional[Dict] = _UNSET
    ):
        """
        :param title: paper title
        :param authors: list of dicts, each expanded into an Author via keyword arguments
        :param year: stored unchanged
        :param venue: stored unchanged
        :param identifiers: stored unchanged when given (including ``None``);
            a fresh empty dict when omitted
        """
        self.title = title
        self.authors = [Author(**author) for author in authors]
        self.year = year
        self.venue = venue
        self.identifiers = {} if identifiers is self._UNSET else identifiers

    def as_json(self):
        """Return the metadata as a dict, with each author serialized via ``Author.as_json``."""
        return {
            "title": self.title,
            "authors": [author.as_json() for author in self.authors],
            "year": self.year,
            "venue": self.venue,
            "identifiers": self.identifiers
        }
class Paragraph:
    """
    A parsed paragraph from Grobid xml.

    All xml tags are removed from the paragraph text; figures, equations and
    tables are replaced with a special token that maps to a reference
    identifier. Citation spans and the section header are extracted.

    Example of the serialized form (values are illustrative, not accurate):
    {
        "text": "Formal language techniques BID1 may be used to study FORMULA0 (see REF0)...",
        "cite_spans": [
            {
                "start": 27,
                "end": 31,
                "text": "[1]"
            }
        ],
        "ref_spans": [
            {
                "start": 67,
                "end": 71,
                "text": "Fig. 1"
            }
        ],
        "eq_spans": [
            {
                "start": 53,
                "end": 61,
                "text": "α = 1",
                "latex": "\\\\alpha = 1",
                "ref_id": null
            }
        ],
        "section": "Abstract",
        "sec_num": null
    }
    """

    # Marker for "eq_spans was not supplied". Using a marker instead of a
    # literal ``[]`` default gives every paragraph its own list (a literal
    # default would be one list shared by all paragraphs), while an explicitly
    # passed ``None`` is still stored as ``None``.
    _UNSET = object()

    def __init__(
            self,
            text: str,
            cite_spans: List[Dict],
            ref_spans: List[Dict],
            eq_spans: Optional[List[Dict]] = _UNSET,
            section: Optional = None,
            sec_num: Optional = None
    ):
        """
        :param text: paragraph text
        :param cite_spans: stored unchanged
        :param ref_spans: stored unchanged
        :param eq_spans: stored unchanged when given; a fresh empty list when omitted
        :param section: either a string of section names joined by '::'
            (outermost first), or an already-built list of [number, name]
            pairs, which is stored as-is
        :param sec_num: number attached to the innermost section; only used
            when ``section`` is a non-empty string
        """
        self.text = text
        self.cite_spans = cite_spans
        self.ref_spans = ref_spans
        self.eq_spans = [] if eq_spans is self._UNSET else eq_spans

        # isinstance (not an exact type comparison) so that str subclasses are
        # split into sections too instead of being stored as a raw string.
        if isinstance(section, str):
            if section:
                section_list = [[None, sec_name] for sec_name in section.split('::')]
                # Only the innermost (last) section carries the number.
                if sec_num:
                    section_list[-1][0] = sec_num
            else:
                section_list = None
        else:
            section_list = section
        self.section = section_list

    def as_json(self):
        """
        Return the paragraph as a dict.

        "section" is the section names re-joined with '::' ("" when there is
        no section) and "sec_num" is the number of the innermost section
        (None when there is no section).
        """
        return {
            "text": self.text,
            "cite_spans": self.cite_spans,
            "ref_spans": self.ref_spans,
            "eq_spans": self.eq_spans,
            "section": '::'.join([sec[1] for sec in self.section]) if self.section else "",
            "sec_num": self.section[-1][0] if self.section else None
        }
class Paper:
    """
    A parsed S2ORC paper: ids, metadata, paragraphs, bibliography entries and
    reference entries.
    """

    def __init__(
            self,
            paper_id: str,
            pdf_hash: str,
            metadata: Dict,
            abstract: List[Dict],
            body_text: List[Dict],
            back_matter: List[Dict],
            bib_entries: Dict,
            ref_entries: Dict
    ):
        """
        Build the object tree from plain dicts.

        :param paper_id: stored unchanged
        :param pdf_hash: stored unchanged
        :param metadata: keyword arguments for Metadata
        :param abstract: list of dicts, each keyword arguments for Paragraph
        :param body_text: list of dicts, each keyword arguments for Paragraph
        :param back_matter: list of dicts, each keyword arguments for Paragraph
        :param bib_entries: bib id -> entry dict; keys in SKIP_KEYS are
            dropped and keys in CORRECT_KEYS are renamed before the entry is
            passed to BibliographyEntry
        :param ref_entries: ref id -> entry dict; a 'ref_id' key is dropped
            and keys in CORRECT_KEYS are renamed before the entry is passed
            to ReferenceEntry
        """
        self.paper_id = paper_id
        self.pdf_hash = pdf_hash
        self.metadata = Metadata(**metadata)
        self.abstract = [Paragraph(**para) for para in abstract]
        self.body_text = [Paragraph(**para) for para in body_text]
        self.back_matter = [Paragraph(**para) for para in back_matter]
        self.bib_entries = [
            BibliographyEntry(
                bib_id=key,
                **{CORRECT_KEYS.get(k, k): v for k, v in bib.items() if k not in SKIP_KEYS}
            ) for key, bib in bib_entries.items()
        ]
        self.ref_entries = [
            ReferenceEntry(
                ref_id=key,
                **{CORRECT_KEYS.get(k, k): v for k, v in ref.items() if k != 'ref_id'}
            ) for key, ref in ref_entries.items()
        ]

    def _content_json(self):
        """Serialize the paragraph lists and the bibliography / reference entry tables."""
        return {
            "abstract": [para.as_json() for para in self.abstract],
            "body_text": [para.as_json() for para in self.body_text],
            "back_matter": [para.as_json() for para in self.back_matter],
            "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
            "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries}
        }

    def as_json(self):
        """Return the whole paper as a dict of plain JSON-style values."""
        return {
            "paper_id": self.paper_id,
            "pdf_hash": self.pdf_hash,
            "metadata": self.metadata.as_json(),
            **self._content_json()
        }

    @property
    def raw_abstract_text(self) -> str:
        """
        Get all the abstract text joined by a newline
        :return:
        """
        return '\n'.join([para.text for para in self.abstract])

    @property
    def raw_body_text(self) -> str:
        """
        Get all the body text joined by a newline
        :return:
        """
        return '\n'.join([para.text for para in self.body_text])

    def release_json(self, doc_type: str = "pdf"):
        """
        Return in release JSON format

        The result holds "paper_id", a "header" with generator name/version
        and a generation timestamp, the metadata fields merged in at top
        level, "abstract" as raw text, and the parse itself under the key
        f"{doc_type}_parse".

        :param doc_type: prefix of the parse key (default "pdf" -> "pdf_parse")
        :return: the release dict
        """
        # Imported here because the module only imports ``datetime`` from the
        # datetime package.
        from datetime import timezone

        # TODO: not fully implemented; metadata format is not right; extra keys in some places
        release_dict = {"paper_id": self.paper_id}
        release_dict.update({"header": {
            # S2ORC_NAME_STRING / S2ORC_VERSION_STRING are expected to come
            # from the module's star import of doc2json.config.
            "generated_with": f'{S2ORC_NAME_STRING} {S2ORC_VERSION_STRING}',
            # The format ends in a literal 'Z' (UTC designator), so the time
            # must actually be UTC rather than naive local time.
            "date_generated": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }})
        release_dict.update(self.metadata.as_json())
        release_dict.update({"abstract": self.raw_abstract_text})
        release_dict.update({
            f"{doc_type}_parse": {
                "paper_id": self.paper_id,
                "_pdf_hash": self.pdf_hash,
                **self._content_json()
            }
        })
        return release_dict
def load_s2orc(paper_dict: Dict) -> Paper:
"""
Load release S2ORC into Paper class
:param paper_dict:
:return:
"""
paper_id = paper_dict['paper_id']
pdf_hash = paper_dict.get('_pdf_hash', paper_dict.get('s2_pdf_hash', None))
# 2019 gorc parses
if "grobid_parse" in paper_dict and paper_dict.get("grobid_parse"):
metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS}
abstract = paper_dict.get("grobid_parse").get("abstract", [])
body_text = paper_dict.get("grobid_parse").get("body_text", [])
back_matter = paper_dict.get("grobid_parse").get("back_matter", [])
bib_entries = paper_dict.get("grobid_parse").get("bib_entries", {})
for k, v in bib_entries.items():
if 'link' in v:
v['links'] = [v['link']]
ref_entries = paper_dict.get("grobid_parse").get("ref_entries", {})
# current and 2020 s2orc release_json
elif ("pdf_parse" in paper_dict and paper_dict.get("pdf_parse")) or ("body_text" in paper_dict and paper_dict.get("body_text")):
if "pdf_parse" in paper_dict:
paper_dict = paper_dict["pdf_parse"]
if paper_dict.get("metadata"):
metadata = {k: v for k, v in paper_dict.get("metadata").items() if k in METADATA_KEYS}
# 2020 s2orc releases (metadata is separate)
else:
metadata = {
"title": None,
"authors": [],
"year": None
}
abstract = paper_dict.get("abstract", [])
body_text = paper_dict.get("body_text", [])
back_matter = paper_dict.get("back_matter", [])
bib_entries = paper_dict.get("bib_entries", {})
for k, v in bib_entries.items():
if 'link' in v:
v['links'] = [v['link']]
ref_entries = paper_dict.get("ref_entries", {})
else:
print(paper_id)
raise NotImplementedError("Unknown S2ORC file type!")
return Paper(
paper_id=paper_id,
pdf_hash=pdf_hash,
metadata=metadata,
abstract=abstract,
body_text=body_text,
back_matter=back_matter,
bib_entries=bib_entries,
ref_entries=ref_entries
)