#!/usr/bin/env python
import re
import os
import gzip
import json
import math
from multiprocessing import Pool, cpu_count

from arxiv_public_data.regex_arxiv import REGEX_ARXIV_FLEXIBLE, clean
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT, LOGGER

log = LOGGER.getChild('fulltext')
RE_FLEX = re.compile(REGEX_ARXIV_FLEXIBLE)
RE_OLDNAME_SPLIT = re.compile(r"([a-z\-]+)(\d+)")


def path_to_id(path):
    """ Convert filepath name of ArXiv file to ArXiv ID """
    name = os.path.splitext(os.path.basename(path))[0]
    if '.' in name:  # new  ID
        return name
    split = [a for a in RE_OLDNAME_SPLIT.split(name) if a]
    return "/".join(split)


def all_articles(directory=DIR_FULLTEXT):
    """ Find all *.txt files in directory """
    out = []
    # make sure the path is absolute for os.walk
    directory = os.path.abspath(os.path.expanduser(directory))

    for root, dirs, files in os.walk(directory):
        for f in files:
            if f.endswith('.txt'):
                out.append(os.path.join(root, f))

    return out


def extract_references(filename, pattern=RE_FLEX):
    """
    Parameters
    ----------
        filename : str
            name of file to search for pattern
        pattern : re pattern object
            compiled regex pattern

    Returns
    -------
        citations : list
            list of found arXiv IDs
    """
    out = []
    with open(filename, 'r') as fn:
        txt = fn.read()

        # the pattern contains several alternative capture groups, so
        # findall() returns a tuple per match; keep only the non-empty groups
        for matches in pattern.findall(txt):
            out.extend([clean(a) for a in matches if a])
    return list(set(out))
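
# Example (illustrative; the IDs below are hypothetical output): for an
# article that cites other arXiv papers, extract_references('/data/1701.00001.txt')
# returns a deduplicated list such as ['hep-th/9901001', '1608.03902'].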


def citation_list_inner(articles):
    """ Find references in all the input articles
    Parameters
    ----------
        articles : list of str
            list of paths to article text
    Returns
    -------
        citations : dict[arXiv ID] = list of arXiv IDs
            dictionary of articles and their references
    """
    cites = {}
    for i, article in enumerate(articles):
        if i > 0 and i % 1000 == 0:
            log.info('Completed {} articles'.format(i))
        try:
            refs = extract_references(article)
            cites[path_to_id(article)] = refs
        except Exception:
            log.exception("Error in {}".format(article))
            continue
    return cites


def citation_list_parallel(N=cpu_count(), directory=DIR_FULLTEXT):
    """
    Split the task of checking for citations across some number of processes
    Parameters
    ----------
        N : int
            number of processes
        directory: str
            directory where full text files are stored
    Returns
    -------
        citations : dict[arXiv ID] = list of arXiv IDs
            all arXiv citations in all articles
    """
    articles = all_articles(directory)
    log.info('Calculating citation network for {} articles'.format(len(articles)))

    A = len(articles)
    # split the articles into N roughly equal chunks; max(1, ...) guards
    # against a zero step size when no articles are found
    size = max(1, math.ceil(A / N))
    divs = list(range(0, A, size)) + [A]
    chunks = [articles[s:e] for s, e in zip(divs[:-1], divs[1:])]

    with Pool(N) as pool:
        cites = pool.map(citation_list_inner, chunks)

    allcites = {}
    for c in cites:
        allcites.update(c)
    return allcites
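
# Worked example of the chunking above (illustrative): with A = 10 articles
# and N = 4 processes, size = ceil(10/4) = 3, divs = [0, 3, 6, 9, 10], and
# the four chunks hold 3, 3, 3, and 1 articles respectively.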


def default_filename():
    return os.path.join(DIR_OUTPUT, 'internal-citations.json.gz')


def save_to_default_location(citations):
    filename = default_filename()

    log.info('Saving to "{}"'.format(filename))
    with gzip.open(filename, 'wb') as fn:
        fn.write(json.dumps(citations).encode('utf-8'))
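

if __name__ == '__main__':
    # Usage sketch (an assumption, not necessarily how the original script
    # is invoked): compute the citation dictionary across all CPUs and save
    # it to the default gzipped-JSON location.
    cites = citation_list_parallel()
    save_to_default_location(cites)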