"""
util.py

author: Colin Clement
date: 2019-04-05

This module contains helper functions for loading embeddings and for batch
loading the full text, since many machines cannot hold the whole full text
in memory.
"""

import os
import re
import numpy as np
import pickle

from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT
from arxiv_public_data.oai_metadata import load_metadata

def id_to_pathname(aid):
    """ 
    Make filename path for text document, matching the format of fulltext 
    creation in `s3_bulk_download`
    Parameters
    ----------
        aid : str
            string of arXiv article id as found in metadata
    Returns
    -------
        pathname : str
            pathname in which to store the article following
    Examples
    --------
    >>> id_to_pathname('hep-ph/0001001')  #doctest: +ELLIPSIS
    '.../hep-ph/0001/hep-ph0001001.txt'

    >>> id_to_pathname('1501.13851')  #doctest: +ELLIPSIS
    '.../arxiv/1501/1501.13851.txt'
    """
    if '.' in aid:  # new style ArXiv ID
        yymm = aid.split('.')[0]
        return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')

    # old style ArXiv ID
    cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
    yymm = arxiv_id[:4]
    return os.path.join(DIR_FULLTEXT, cat, yymm, aid.replace('/', '') + '.txt')

def load_generator(paths, batchsize):
    """
    Creates a generator object for batch loading files from paths
    Parameters
    ----------
        paths : list of filepaths
        batchsize : int
    Returns
    -------
        file_contents : list of strings of contents of files in path
    """
    assert isinstance(paths, list), 'Requires a list of paths'
    assert isinstance(batchsize, int), 'batchsize must be an int'
    assert batchsize > 0, 'batchsize must be positive'

    out = []
    for p in paths:
        with open(p, 'r') as fin:
            out.append(fin.read())
        if len(out) == batchsize:
            yield np.array(out, dtype='object')
            out = []
    if out:  # yield any final partial batch; skip an empty remainder
        yield np.array(out, dtype='object')
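
# Illustrative usage sketch (the .txt paths below are hypothetical):
#
#   for batch in load_generator(['/tmp/a.txt', '/tmp/b.txt', '/tmp/c.txt'], 2):
#       print(len(batch))  # 2 then 1: one full batch, then the remainder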

def batch_fulltext(batchsize=32, maxnum=None):
    """
    Read metadata and find corresponding files in the fulltext
    Parameters
    ----------
        (optional)
        batchsize : int
            number of fulltext files to load into a batch
        maxnum : int
            the maximum number of paths to feed the generator, for
            testing purposes
    Returns
    -------
        md_index, all_ids, load_gen : tuple of (list, list, generator)
            md_index maps each existing fulltext file, in order of
            appearance, to the index of its corresponding metadata entry.
            all_ids is a list of all arXiv IDs in the metadata. load_gen
            is a generator which allows batched loading of the full text,
            as defined by `load_generator`
    """
    all_ids = [m['id'] for m in load_metadata()]
    all_paths = [id_to_pathname(aid) for aid in all_ids]
    exists = [os.path.exists(p) for p in all_paths]
    existing_paths = [p for p, e in zip(all_paths, exists) if e][:maxnum]
    # truncate md_index the same way so it stays aligned with existing_paths
    md_index = [i for i, e in enumerate(exists) if e][:maxnum]
    return md_index, all_ids, load_generator(existing_paths, batchsize)
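
# Illustrative sketch of batched streaming (assumes DIR_FULLTEXT is populated;
# `embed` is a hypothetical embedding function; `maxnum` keeps a test run small):
#
#   md_index, all_ids, gen = batch_fulltext(batchsize=16, maxnum=64)
#   for batch in gen:
#       vecs = embed(batch)  # embed at most 16 documents at a time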

def load_embeddings(filename, headers=0):
    """
    Loads vector embeddings
    Parameters
    ----------
        filename : str
            path to vector embeddings saved by `create_save_embeddings`
        (optional)
        headers : int
            number of leading pickle calls containing metadata separate
            from the embeddings
    Returns
    -------
        embeddings : dict
            key 'embeddings' contains the vector embeddings and
            key 'headers' contains the metadata
    """
    out = {'embeddings': [], 'headers': []}
    N = 0
    with open(filename, 'rb') as fin:
        while True:
            try:
                if N < headers:
                    out['headers'].append(pickle.load(fin))
                else:
                    out['embeddings'].extend(pickle.load(fin))
            except EOFError:
                break
            N += 1
    out['embeddings'] = np.array(out['embeddings'])
    return out
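
# Illustrative sketch of the returned structure (the filename is hypothetical;
# headers=2 matches the layout expected by `fill_zeros` below):
#
#   emb = load_embeddings('fulltext-embeddings.pkl', headers=2)
#   md_index, all_ids = emb['headers']   # metadata indices and arXiv IDs
#   vectors = emb['embeddings']          # shape (num existing fulltexts, dim)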

def fill_zeros(loaded_embedding):
    """
    Fill out zeros in the full-text embedding where full-text is missing
    Parameters
    ----------
        loaded_embedding : dict
            dict as returned by `load_embeddings` with 2 headers: first the
            list of metadata indices to which each embedding vector
            corresponds, then the list of all article ids
    Returns
    -------
        embeddings : array_like
            vector embeddings of shape (number of articles, embedding dimension)
    """
    md_index = loaded_embedding['headers'][0]
    all_ids = loaded_embedding['headers'][1]
    vectors = loaded_embedding['embeddings']
    output = np.zeros((len(all_ids), vectors.shape[1]))
    for idx, v in zip(md_index, vectors):
        output[idx,:] = v
    return output
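
# Continuing the sketch above: rows whose articles have no full text remain
# zero vectors, so the output aligns row-for-row with the metadata ordering:
#
#   full = fill_zeros(emb)   # shape (len(all_ids), embedding dimension)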