Spaces:

nianlong
/

memsum-arxiv-summarizer

Build error

File size: 11,350 Bytes

02ae0bf

from typing import List, Dict, Optional
import bs4
from bs4 import BeautifulSoup
import re
from collections import defaultdict


SUBSTITUTE_TAGS = {
    'persName',
    'orgName',
    'publicationStmt',
    'titleStmt',
    'biblScope'
}


def clean_tags(el: bs4.element.Tag):
    """
    Replace all tags with lowercase version
    :param el:
    :return:
    """
    for sub_tag in SUBSTITUTE_TAGS:
        for sub_el in el.find_all(sub_tag):
            sub_el.name = sub_tag.lower()


def soup_from_path(file_path: str):
    """
    Read XML file
    :param file_path:
    :return:
    """
    return BeautifulSoup(open(file_path, "rb").read(), "xml")


def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns title
    :return:
    """
    for title_entry in raw_xml.find_all("title"):
        if title_entry.has_attr("level") \
                and title_entry["level"] == "a":
            return title_entry.text
    try:
        return raw_xml.title.text
    except AttributeError:
        return ""


def get_author_names_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Returns a list of dictionaries, one for each author,
    containing the first and last names.

    e.g.
        {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        }
    """
    names = []

    for author in raw_xml.find_all("author"):
        if not author.persname:
            continue

        # forenames include first and middle names
        forenames = author.persname.find_all("forename")

        # surnames include last names
        surnames = author.persname.find_all("surname")

        # name suffixes
        suffixes = author.persname.find_all("suffix")

        first = ""
        middle = []
        last = ""
        suffix = ""

        for forename in forenames:
            if forename["type"] == "first":
                if not first:
                    first = forename.text
                else:
                    middle.append(forename.text)
            elif forename["type"] == "middle":
                middle.append(forename.text)

        if len(surnames) > 1:
            for surname in surnames[:-1]:
                middle.append(surname.text)
            last = surnames[-1].text
        elif len(surnames) == 1:
            last = surnames[0].text

        if len(suffix) >= 1:
            suffix = " ".join([suffix.text for suffix in suffixes])

        names_dict = {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        }

        names.append(names_dict)
    return names


def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
    """
    Get affiliation from grobid xml
    :param raw_xml:
    :return:
    """
    location_dict = dict()
    laboratory_name = ""
    institution_name = ""

    if raw_xml and raw_xml.affiliation:
        for child in raw_xml.affiliation:
            if child.name == "orgname":
                if child.has_attr("type"):
                    if child["type"] == "laboratory":
                        laboratory_name = child.text
                    elif child["type"] == "institution":
                        institution_name = child.text
            elif child.name == "address":
                for grandchild in child:
                    if grandchild.name and grandchild.text:
                        location_dict[grandchild.name] = grandchild.text

        if laboratory_name or institution_name:
            return {
                "laboratory": laboratory_name,
                "institution": institution_name,
                "location": location_dict
            }

    return {}


def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
    """
    Returns a list of dictionaries, one for each author,
    containing the first and last names.

    e.g.
        {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": {
                "laboratory": "",
                "institution": "",
                "location": "",
            },
            "email": ""
        }
    """
    authors = []

    for author in raw_xml.find_all("author"):

        first = ""
        middle = []
        last = ""
        suffix = ""

        if author.persname:
            # forenames include first and middle names
            forenames = author.persname.find_all("forename")

            # surnames include last names
            surnames = author.persname.find_all("surname")

            # name suffixes
            suffixes = author.persname.find_all("suffix")

            for forename in forenames:
                if forename.has_attr("type"):
                    if forename["type"] == "first":
                        if not first:
                            first = forename.text
                        else:
                            middle.append(forename.text)
                    elif forename["type"] == "middle":
                        middle.append(forename.text)

            if len(surnames) > 1:
                for surname in surnames[:-1]:
                    middle.append(surname.text)
                last = surnames[-1].text
            elif len(surnames) == 1:
                last = surnames[0].text

            if len(suffix) >= 1:
                suffix = " ".join([suffix.text for suffix in suffixes])

        affiliation = get_affiliation_from_grobid_xml(author)

        email = ""
        if author.email:
            email = author.email.text

        author_dict = {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": affiliation,
            "email": email
        }

        authors.append(author_dict)

    return authors


def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
    """
    Returns date published if exists
    :return:
    """
    if raw_xml.date and raw_xml.date.has_attr("when"):
        # match year in date text (which is in some unspecified date format)
        year_match = re.match(r"((19|20)\d{2})", raw_xml.date["when"])
        if year_match:
            year = year_match.group(0)
            if year and year.isnumeric() and len(year) == 4:
                return int(year)
    return None


def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
    """
    Returns venue/journal/publisher of bib entry
    Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
    level="j": journal title
    level="m": "non journal bibliographical item holding the cited article"
    level="s": series title
    :return:
    """
    title_names = []
    keep_types = ["j", "m", "s"]
    # get all titles of the anove types
    for title_entry in raw_xml.find_all("title"):
        if title_entry.has_attr("level") \
                and title_entry["level"] in keep_types \
                and title_entry.text != title_text:
            title_names.append((title_entry["level"], title_entry.text))
    # return the title name that most likely belongs to the journal or publication venue
    if title_names:
        title_names.sort(key=lambda x: keep_types.index(x[0]))
        return title_names[0][1]
    return ""


def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the volume number of grobid bib entry
    Grobid <biblscope unit="volume">
    :return:
    """
    for bibl_entry in raw_xml.find_all("biblscope"):
        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "volume":
            return bibl_entry.text
    return ""


def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the issue number of grobid bib entry
    Grobid <biblscope unit="issue">
    :return:
    """
    for bibl_entry in raw_xml.find_all("biblscope"):
        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "issue":
            return bibl_entry.text
    return ""


def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the page numbers of grobid bib entry
    Grobid <biblscope unit="page">
    :return:
    """
    for bibl_entry in raw_xml.find_all("biblscope"):
        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "page" and bibl_entry.has_attr("from"):
            from_page = bibl_entry["from"]
            if bibl_entry.has_attr("to"):
                to_page = bibl_entry["to"]
                return f'{from_page}--{to_page}'
            else:
                return from_page
    return ""


def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
    """
    Returns a dictionary of other identifiers from grobid bib entry (arxiv, pubmed, doi)
    :param raw_xml:
    :return:
    """
    other_ids = defaultdict(list)

    for idno_entry in raw_xml.find_all("idno"):
        if idno_entry.has_attr("type") and idno_entry.text:
            other_ids[idno_entry["type"]].append(idno_entry.text)

    return other_ids


def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the raw bibiliography string
    :param raw_xml:
    :return:
    """
    for note in raw_xml.find_all("note"):
        if note.has_attr("type") and note["type"] == "raw_reference":
            return note.text
    return ""


def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Finds and returns the publication datetime if it exists
    :param raw_xml:
    :return:
    """
    if raw_xml.publicationStmt:
        for child in raw_xml.publicationstmt:
            if child.name == "date" \
                    and child.has_attr("type") \
                    and child["type"] == "published" \
                    and child.has_attr("when"):
                return child["when"]
    return ""


def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
    """
    Parse one bib entry
    :param bib_entry:
    :return:
    """
    clean_tags(bib_entry)
    title = get_title_from_grobid_xml(bib_entry)
    return {
        'ref_id': bib_entry.attrs.get("xml:id", None),
        'title': title,
        'authors': get_author_names_from_grobid_xml(bib_entry),
        'year': get_year_from_grobid_xml(bib_entry),
        'venue': get_venue_from_grobid_xml(bib_entry, title),
        'volume': get_volume_from_grobid_xml(bib_entry),
        'issue': get_issue_from_grobid_xml(bib_entry),
        'pages': get_pages_from_grobid_xml(bib_entry),
        'other_ids': get_other_ids_from_grobid_xml(bib_entry),
        'raw_text': get_raw_bib_text_from_grobid_xml(bib_entry),
        'urls': []
    }


def is_reference_tag(tag: bs4.element.Tag) -> bool:
    return tag.name == "ref" and tag.attrs.get("type", "") == "bibr"


def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
    """
    Extract paper metadata (title, authors, affiliation, year) from grobid xml
    :param tag:
    :return:
    """
    clean_tags(tag)
    paper_metadata = {
        "title": tag.titlestmt.title.text,
        "authors": get_author_data_from_grobid_xml(tag),
        "year": get_publication_datetime_from_grobid_xml(tag)
    }
    return paper_metadata