# (removed stray build-log artifacts that were not valid Python)
# https://github.com/arXiv/arxiv-base@32e6ad0 | |
""" | |
Copyright 2017 Cornell University | |
Permission is hereby granted, free of charge, to any person obtaining a copy of | |
this software and associated documentation files (the "Software"), to deal in | |
the Software without restriction, including without limitation the rights to | |
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
of the Software, and to permit persons to whom the Software is furnished to do | |
so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
""" | |
"""Parse Authors lines to extract author and affiliation data.""" | |
import re | |
import os | |
import gzip | |
import json | |
from itertools import dropwhile | |
from typing import Dict, Iterator, List, Tuple | |
from multiprocessing import Pool, cpu_count | |
from arxiv_public_data.tex2utf import tex2utf | |
from arxiv_public_data.config import LOGGER, DIR_OUTPUT | |
logger = LOGGER.getChild('authorsplit')
# Surname prefixes (nobiliary particles etc.) recognized when deciding
# where a keyname starts; interpolated into the regexes built in
# _parse_author_affil_split below.
PREFIX_MATCH = 'van|der|de|la|von|del|della|da|mac|ter|dem|di|vaziri'
""" | |
Takes data from an Author: line in the current arXiv abstract | |
file and returns a structured set of data: | |
author_list_ptr = [
[ author1_keyname, author1_firstnames, author1_suffix, affil1, affil2 ] ,
[ author2_keyname, author2_firstnames, author2_suffix, affil1 ] ,
[ author3_keyname, author3_firstnames, author3_suffix ]
]
Abstracted from Dienst software for OAI1 and other uses. This | |
routine should just go away when a better metadata structure is | |
adopted that deals with names and affiliations properly. | |
Must remember that there is at least one person on the archive
who has only one name; this should clearly be considered the key name.
Code originally written by Christina Scovel, Simeon Warner Dec99/Jan00 | |
2000-10-16 - separated. | |
2000-12-07 - added support for suffix | |
2003-02-14 - get surname prefixes from arXiv::Filters::Index [Simeon] | |
2007-10-01 - created test script, some tidying [Simeon] | |
2018-05-25 - Translated from Perl to Python [Brian C.] | |
""" | |
def parse_author_affil(authors: str) -> List[List[str]]:
    """
    Parse an authors line into structured author/affiliation records.

    Every record holds at least three elements: keyname, firstname(s)
    and suffix.  The keyname always has content; the other two may be
    empty strings.  Any elements past the first three are affiliations
    (zero or more).

    Handling of a leading "XX collaboration" prefix is duplicated here
    and in arXiv::HTML::AuthorLink -- it shouldn't be; likely it should
    just live here.

    This is a thin wrapper: the line is first split into per-author
    parts, then affiliations are back propagated.  Use the split step
    alone for display when back propagation is unwanted.

    :param authors: string of authors from abs file or similar
    :return: list shaped like
        [
            [ author1_keyname, author1_firstnames, author1_suffix,
              affil1, affil2 ],
            [ author2_keyname, author2_firstnames, author2_suffix,
              affil1 ],
            [ author3_keyname, author3_firstnames, author3_suffix ]
        ]
    """
    split_result = _parse_author_affil_split(authors)
    return _parse_author_affil_back_propagate(**split_result)
def _parse_author_affil_split(author_line: str) -> Dict:
    """
    Split author line into author and affiliation data.

    Take author line, tidy spacing and punctuation, and then split up into
    individual author and affiliation data. Has special cases to avoid
    splitting an initial collaboration name and records in 'back_prop'
    the fact that affiliations should not be back propagated to
    collaboration names.
    Does not handle multiple collaboration names.

    :param author_line: raw authors line from the metadata
    :return: dict with keys 'author_list' (list of
        [keyname, firstnames, suffix, affil...] entries) and 'back_prop'
        (count of leading collaboration entries that must not receive
        back-propagated affiliations)
    """
    if not author_line:
        return {'author_list': [], 'back_prop': 0}
    names: List[str] = split_authors(author_line)
    if not names:
        return {'author_list': [], 'back_prop': 0}
    names = _remove_double_commas(names)
    # get rid of commas at back
    namesIter: Iterator[str] = reversed(
        list(dropwhile(lambda x: x == ',', reversed(names))))
    # get rid of commas at front
    names = list(dropwhile(lambda x: x == ',', namesIter))
    # Extract all names (all parts not starting with comma or paren)
    # NOTE(review): the character class '[^](,]' excludes ']', '(' and
    # ',' -- the ']' member looks accidental but is harmless; confirm.
    names = list(map(_tidy_name, filter(
        lambda x: re.match('^[^](,]', x), names)))
    # Drop any "et al." tokens entirely.
    names = list(filter(lambda n: not re.match(
        r'^\s*et\.?\s+al\.?\s*', n, flags=re.IGNORECASE), names))
    # Peel leading "XXX collaboration/group/team" entries off into
    # author_list; back_propagate_affiliations_to counts them so they
    # never receive affiliations during back propagation.
    (names, author_list,
     back_propagate_affiliations_to) = _collaboration_at_start(names)
    # Enumerated affiliations, e.g. '((1) Affil A (2) Affil B)' at end.
    (enumaffils) = _enum_collaboration_at_end(author_line)
    # Split name into keyname and firstnames/initials.
    # Deal with different patterns in turn: prefixes, suffixes, plain
    # and single name.  Patterns are tried in order; first match wins.
    patterns = [('double-prefix',
                 r'^(.*)\s+(' + PREFIX_MATCH + r')\s(' +
                 PREFIX_MATCH + r')\s(\S+)$'),
                ('name-prefix-name',
                 r'^(.*)\s+(' + PREFIX_MATCH + r')\s(\S+)$'),
                ('name-name-prefix',
                 r'^(.*)\s+(\S+)\s(I|II|III|IV|V|Sr|Jr|Sr\.|Jr\.)$'),
                ('name-name',
                 r'^(.*)\s+(\S+)$'), ]
    # Now go through names in turn and try to get affiliations
    # to go with them
    for name in names:
        pattern_matches = ((mtype, re.match(m, name, flags=re.IGNORECASE))
                           for (mtype, m) in patterns)
        (mtype, match) = next(((mtype, m)
                               for (mtype, m) in pattern_matches
                               if m is not None), ('default', None))
        if match is None:
            # Single-word name: the whole thing is the keyname.
            author_entry = [name, '', '']
        elif mtype == 'double-prefix':
            # e.g. 'J. van der Berg' -> keyname 'van der Berg'
            s = '{} {} {}'.format(match.group(
                2), match.group(3), match.group(4))
            author_entry = [s, match.group(1), '']
        elif mtype == 'name-prefix-name':
            # e.g. 'L. van Beethoven' -> keyname 'van Beethoven'
            s = '{} {}'.format(match.group(2), match.group(3))
            author_entry = [s, match.group(1), '']
        elif mtype == 'name-name-prefix':
            # e.g. 'John Smith Jr.' -> suffix in third slot
            author_entry = [match.group(2), match.group(1), match.group(3)]
        elif mtype == 'name-name':
            author_entry = [match.group(2), match.group(1), '']
        else:
            author_entry = [name, '', '']
        # search back in author_line for affiliation
        author_entry = _add_affiliation(
            author_line, enumaffils, author_entry, name)
        author_list.append(author_entry)
    return {'author_list': author_list,
            'back_prop': back_propagate_affiliations_to}
def parse_author_affil_utf(authors: str) -> List:
    """
    Parse an author line and convert every field from TeX to UTF.

    Output structure is identical to parse_author_affil(), but each
    string has been passed through tex2utf().
    """
    if not authors:
        return []
    parsed = parse_author_affil(authors)
    return [[tex2utf(field) for field in entry] for entry in parsed]
def _remove_double_commas(items: List[str]) -> List[str]: | |
parts: List[str] = [] | |
last = '' | |
for pt in items: | |
if pt == ',' and last == ',': | |
continue | |
else: | |
parts.append(pt) | |
last = pt | |
return parts | |
def _tidy_name(name: str) -> str: | |
name = re.sub(r'\s\s+', ' ', name) # also gets rid of CR | |
# add space after dot (except in TeX) | |
name = re.sub(r'(?<!\\)\.(\S)', r'. \g<1>', name) | |
return name | |
def _collaboration_at_start(names: List[str]) \ | |
-> Tuple[List[str], List[List[str]], int]: | |
"""Perform special handling of collaboration at start.""" | |
author_list = [] | |
back_propagate_affiliations_to = 0 | |
while len(names) > 0: | |
m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))', | |
names[0], flags=re.IGNORECASE) | |
if not m: | |
break | |
# Add to author list | |
author_list.append([m.group(1), '', '']) | |
back_propagate_affiliations_to += 1 | |
# Remove from names | |
names.pop(0) | |
# Also swallow and following comma or colon | |
if names and (names[0] == ',' or names[0] == ':'): | |
names.pop(0) | |
return names, author_list, back_propagate_affiliations_to | |
def _enum_collaboration_at_end(author_line: str)->Dict: | |
"""Get separate set of enumerated affiliations from end of author_line.""" | |
# Now see if we have a separate set of enumerated affiliations | |
# This is indicated by finding '(\s*(' | |
line_m = re.search(r'\(\s*\((.*)$', author_line) | |
if not line_m: | |
return {} | |
enumaffils = {} | |
affils = re.sub(r'\s*\)\s*$', '', line_m.group(1)) | |
# Now expect to have '1) affil1 (2) affil2 (3) affil3' | |
for affil in affils.split('('): | |
# Now expect `1) affil1 ', discard if no match | |
m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil) | |
if m: | |
enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2)) | |
return enumaffils | |
def _add_affiliation(author_line: str, | |
enumaffils: Dict, | |
author_entry: List[str], | |
name: str) -> List: | |
""" | |
Add author affiliation to author_entry if one is found in author_line. | |
This should deal with these cases | |
Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2) | |
""" | |
en = re.escape(name) | |
namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*')) | |
m = re.search(namerex, author_line, flags=re.IGNORECASE) | |
if not m: | |
return author_entry | |
# Now see if we have enumerated references (just commas, digits, &, and) | |
affils = m.group(1).rstrip().lstrip() | |
affils = re.sub(r'(&|and)/,', ',', affils, flags=re.IGNORECASE) | |
if re.match(r'^[\d,\s]+$', affils): | |
for affil in affils.split(','): | |
if affil in enumaffils: | |
author_entry.append(enumaffils[affil]) | |
else: | |
author_entry.append(affils) | |
return author_entry | |
def _parse_author_affil_back_propagate(author_list: List[List[str]], | |
back_prop: int) -> List[List[str]]: | |
"""Back propagate author affiliation. | |
Take the author list structure generated by parse_author_affil_split(..) | |
and propagate affiliation information backwards to preceeding author | |
entries where none was give. Stop before entry $back_prop to avoid | |
adding affiliation information to collaboration names. | |
given, eg: | |
a.b.first, c.d.second (affil) | |
implies | |
a.b.first (affil), c.d.second (affil) | |
and in more complex cases: | |
a.b.first, c.d.second (1), e.f.third, g.h.forth (2,3) | |
implies | |
a.b.first (1), c.d.second (1), e.f.third (2,3), g.h.forth (2,3) | |
""" | |
last_affil: List[str] = [] | |
for x in range(len(author_list) - 1, max(back_prop - 1, -1), -1): | |
author_entry = author_list[x] | |
if len(author_entry) > 3: # author has affiliation,store | |
last_affil = author_entry | |
elif last_affil: | |
# author doesn't have affil but later one did => copy | |
author_entry.extend(last_affil[3:]) | |
return author_list | |
def split_authors(authors: str) -> List:
    """
    Split author string into author-entity tokens.

    Take an author line as a string and return a list of the different
    name and affiliation blocks. While this does normalize spacing and
    'and', it is a key feature that the set of strings returned can be
    concatenated to reproduce the original authors line. This code thus
    provides a very graceful degradation for badly formatted authors
    lines, as the text at least shows up.

    :param authors: raw author line
    :return: list of name strings, ','/':' separator tokens, and
        parenthesised '(...)' comment blocks
    """
    if not authors:
        return []
    # Split authors field into blocks with boundaries of ( and ),
    # keeping the parentheses as their own tokens.
    aus = re.split(r'(\(|\))', authors)
    aus = list(filter(lambda x: x != '', aus))
    blocks = []
    if len(aus) == 1:
        blocks.append(authors)
    else:
        c = ''
        depth = 0
        for bit in aus:
            if bit == '(':  # track open parentheses
                depth += 1
                if depth == 1:
                    blocks.append(c)
                    c = '('
                else:
                    c = c + bit
            elif bit == ')':  # track close parentheses
                depth -= 1
                c = c + bit
                if depth == 0:
                    blocks.append(c)
                    c = ''
                else:  # haven't closed, so keep accumulating
                    continue
            else:
                c = c + bit
        if c:
            blocks.append(c)
    listx = []
    for block in blocks:
        block = re.sub(r'\s+', ' ', block)
        if re.match(r'^\(', block):  # it is a comment
            listx.append(block)
        else:  # it is a name
            # Normalize 'and'/'&' conjunctions to comma separators.
            block = re.sub(r',?\s+(and|\&)\s', ',', block)
            # Split on ',' / ':' but keep the separators as tokens.
            names = re.split(r'(,|:)\s*', block)
            for name in names:
                if not name:
                    continue
                name = name.strip()
                if name:
                    listx.append(name)
    # Recombine suffixes (Jr/Sr/II/III/IV/...) that were separated from
    # the name by a comma.
    # BUG FIX: the suffix alternation was missing the '|' before
    # [IV]{2,}, so roman-numeral suffixes never recombined.
    # BUG FIX: use re.search for the "previous token is a ')'-terminated
    # comment" guard; re.match anchored at the start could only ever
    # match the one-character string ')'.
    parts: List[str] = []
    for p in listx:
        if re.match(r'^(Jr\.?|Sr\.?|[IV]{2,})$', p) \
                and len(parts) >= 2 \
                and parts[-1] == ',' \
                and not re.search(r'\)$', parts[-2]):
            separator = parts.pop()
            last = parts.pop()
            parts.append('{}{} {}'.format(last, separator, p))
        else:
            parts.append(p)
    return parts
def parse_authorline(authors: str) -> str:
    """
    The external facing function from this module.

    Converts a complex author line into a simple UTF-8 one containing
    only keynames and firstnames, joined with '; '.

    Parameters
    ----------
    authors : string
        The raw author line from the metadata

    Returns
    -------
    clean_authors : string
        String representing the cleaned author line

    Examples
    --------
    >>> parse_authorline('A. Losev, S. Shadrin, I. Shneiberg')
    'Losev, A.; Shadrin, S.; Shneiberg, I.'
    >>> parse_authorline("C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan")
    'Balázs, C.; Berger, E. L.; Nadolsky, P. M.; Yuan, C. -P.'
    >>> parse_authorline('Stephen C. Power (Lancaster University), Baruch Solel (Technion)')
    'Power, Stephen C.; Solel, Baruch'
    """
    parsed = parse_author_affil_utf(authors)
    cleaned = []
    for entry in parsed:
        # Keep only keyname and firstnames, skipping empty fields.
        cleaned.append(', '.join(part for part in entry[:2] if part))
    return '; '.join(cleaned)
def _parse_article_authors(article_author):
    """Worker for parse_authorline_parallel: parse one (id, authors) pair.

    Returns [arXiv id, parsed author list]; on any parsing failure the
    error is logged and [arXiv id, ''] is returned instead.
    """
    try:
        return [article_author[0],
                parse_author_affil_utf(article_author[1])]
    except Exception as exc:
        logger.error(
            "Author split failed for article {}".format(article_author[0]))
        logger.exception(exc)
        return [article_author[0], '']
def parse_authorline_parallel(article_authors, n_processes=None):
    """
    Parallelize author-line parsing over a list of articles.

    Parses every author line with a process pool and saves the result
    to '<DIR_OUTPUT>/authors-parsed.json.gz'.

    Parameters
    ----------
    article_authors : list
        list of tuples (arXiv id, author string from metadata)
    n_processes : int, optional
        number of worker processes (default: multiprocessing's default,
        the number of CPUs)

    Returns
    -------
    authorsplit : dict
        mapping of arXiv id to the parsed author list
        [
            [ author1_keyname, author1_firstnames, author1_suffix,
              affil1, affil2 ],
            [ author2_keyname, author2_firstnames, author2_suffix,
              affil1 ],
            [ author3_keyname, author3_firstnames, author3_suffix ]
        ]
        The value is '' for articles whose author line failed to parse.
    """
    logger.info(
        'Parsing author lines for {} articles...'.format(len(article_authors))
    )
    # BUG FIX: the pool was never closed/joined; the context manager
    # guarantees worker cleanup even if map() raises.
    with Pool(n_processes) as pool:
        parsed = pool.map(_parse_article_authors, article_authors)
    outdict = {aid: auth for aid, auth in parsed}
    filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz')
    logger.info('Saving to {}'.format(filename))
    with gzip.open(filename, 'wb') as fout:
        fout.write(json.dumps(outdict).encode('utf-8'))
    # BUG FIX: the docstring promised a result but the function
    # returned None; returning the dict is backward compatible.
    return outdict