#! /usr/bin/env python
import re
import os
import gzip
import json
import math
from multiprocessing import Pool, cpu_count

from arxiv_public_data.regex_arxiv import REGEX_ARXIV_FLEXIBLE, clean
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT, LOGGER

log = LOGGER.getChild('citations')

RE_FLEX = re.compile(REGEX_ARXIV_FLEXIBLE)
RE_OLDNAME_SPLIT = re.compile(r"([a-z\-]+)(\d+)")


def path_to_id(path):
""" Convert filepath name of ArXiv file to ArXiv ID """
name = os.path.splitext(os.path.basename(path))[0]
if '.' in name: # new ID
return name
split = [a for a in RE_OLDNAME_SPLIT.split(name) if a]
return "/".join(split)
def all_articles(directory=DIR_FULLTEXT):
""" Find all *.txt files in directory """
out = []
# make sure the path is absolute for os.walk
directory = os.path.abspath(os.path.expanduser(directory))
for root, dirs, files in os.walk(directory):
for f in files:
            # match only *.txt files rather than any name containing 'txt'
            if f.endswith('.txt'):
out.append(os.path.join(root, f))
return out


def extract_references(filename, pattern=RE_FLEX):
    """
    Find the arXiv IDs cited in a single fulltext file

    Parameters
    ----------
    filename : str
        name of file to search for pattern
    pattern : re pattern object
        compiled regex pattern

    Returns
    -------
    citations : list
        list of found arXiv IDs
    """
out = []
with open(filename, 'r') as fn:
txt = fn.read()
    # findall() yields one tuple of capture groups per match; keep the
    # nonempty groups and normalize each with clean()
    for matches in pattern.findall(txt):
        out.extend([clean(a) for a in matches if a])
return list(set(out))
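
# A minimal usage sketch (the path and output here are hypothetical):
#
#     refs = extract_references('/data/fulltext/1801.00001.txt')
#     # e.g. ['hep-th/9901001', '1705.00001'], deduplicated and unordered
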

def citation_list_inner(articles):
    """
    Find references in all the input articles

    Parameters
    ----------
    articles : list of str
        list of paths to article text

    Returns
    -------
    citations : dict[arXiv ID] = list of arXiv IDs
        dictionary of articles and their references
    """
cites = {}
for i, article in enumerate(articles):
if i > 0 and i % 1000 == 0:
log.info('Completed {} articles'.format(i))
        try:
            refs = extract_references(article)
            cites[path_to_id(article)] = refs
        except Exception:
            # skip unreadable or malformed articles instead of aborting the batch
            log.exception("Error in {}".format(article))
            continue
return cites
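
# Shape of the result (hypothetical IDs): each key is a citing article and
# each value is the list of arXiv IDs found in its fulltext, e.g.
#
#     {'1801.00001': ['hep-th/9901001'], 'math/0001001': []}
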

def citation_list_parallel(N=cpu_count(), directory=DIR_FULLTEXT):
    """
    Split the task of checking for citations across some number of processes

    Parameters
    ----------
    N : int
        number of processes
    directory : str
        directory where full text files are stored

    Returns
    -------
    citations : dict[arXiv ID] = list of arXiv IDs
        all arXiv citations in all articles
    """
    articles = all_articles(directory)
    log.info('Calculating citation network for {} articles'.format(len(articles)))

    if not articles:
        return {}

    # split the articles into N roughly equal, contiguous chunks
    A = len(articles)
    divs = list(range(0, A, math.ceil(A / N))) + [A]
    chunks = [articles[s:e] for s, e in zip(divs[:-1], divs[1:])]

    # the context manager tears the pool down once all chunks are processed
    with Pool(N) as pool:
        cites = pool.map(citation_list_inner, chunks)
allcites = {}
for c in cites:
allcites.update(c)
return allcites
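
# Sanity check of the chunking arithmetic above (hypothetical numbers): with
# A = 10 articles and N = 3 processes, math.ceil(10 / 3) = 4, so
# divs = [0, 4, 8, 10] and the chunk sizes are 4, 4, and 2; every article
# lands in exactly one chunk.
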

def default_filename():
    return os.path.join(DIR_OUTPUT, 'internal-citations.json.gz')


def save_to_default_location(citations):
    filename = default_filename()
    log.info('Saving to "{}"'.format(filename))

    with gzip.open(filename, 'wb') as fn:
        fn.write(json.dumps(citations).encode('utf-8'))
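

# A minimal entry-point sketch, assuming DIR_FULLTEXT already contains the
# extracted fulltext and DIR_OUTPUT exists; this mirrors how the functions
# above compose and is not part of the module's original interface.
if __name__ == '__main__':
    citations = citation_list_parallel()
    save_to_default_location(citations)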