"""
oai_metadata.py

authors: Matt Bierbaum and Colin Clement
date: 2019-02-25

This module interacts with the Open Archives Initiative (OAI) API, downloading
the metadata for all arXiv articles.

Usage
=====

python oai_metadata.py data/<savefile>.json

Notes
=====
The save file is not strictly JSON: it is a gzip-compressed stream of JSON
lines, one JSON object per article. Use the helper function load_metadata
to open it without error.
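
For example, to read a completed save file back (same placeholder name as in
Usage above):

    >>> records = load_metadata('data/<savefile>.json')
    >>> records[0]['title']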

Resources
=========
* http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm
* https://arxiv.org/help/oa/index
"""

import os
import gzip
import glob
import json
import time
import hashlib
import datetime
import requests
import xml.etree.ElementTree as ET

from arxiv_public_data.config import LOGGER, DIR_BASE

log = LOGGER.getChild('metadata')

URL_ARXIV_OAI = 'https://export.arxiv.org/oai2'
URL_CITESEER_OAI = 'http://citeseerx.ist.psu.edu/oai2'
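# XML namespaces needed to qualify element lookups below: ElementTree only
# matches the namespaced OAI and arXivRaw elements when given these mappings.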
OAI_XML_NAMESPACES = {
    'OAI': 'http://www.openarchives.org/OAI/2.0/',
    'arXiv': 'http://arxiv.org/OAI/arXivRaw/'
}

def get_list_record_chunk(resumptionToken=None, harvest_url=URL_ARXIV_OAI,
                          metadataPrefix='arXivRaw'):
    """
    Query OIA API for the metadata of 1000 Arxiv article

    Parameters
    ----------
        resumptionToken : str
            Token for the API which triggers the next 1000 articles

    Returns
    -------
        record_chunks : str
            metadata of 1000 arXiv articles as an XML string
    """
    parameters = {'verb': 'ListRecords'}

    if resumptionToken:
        parameters['resumptionToken'] = resumptionToken
    else:
        parameters['metadataPrefix'] = metadataPrefix

    response = requests.get(harvest_url, params=parameters)

    if response.status_code == 200:
        return response.text

    if response.status_code == 503:
        secs = int(response.headers.get('Retry-After', 20)) * 1.5
        log.info('Requested to wait, waiting {} seconds until retry...'.format(secs))

        time.sleep(secs)
        return get_list_record_chunk(
            resumptionToken=resumptionToken, harvest_url=harvest_url,
            metadataPrefix=metadataPrefix
        )
    else:
        raise Exception(
            'Unknown error in HTTP request {}, status code: {}'.format(
                response.url, response.status_code
            )
        )

def _record_element_text(elm, name):
    """ XML helper function for extracting text from leaf (single-node) elements """
    item = elm.find('arXiv:{}'.format(name), OAI_XML_NAMESPACES)
    return item.text if item is not None else None

def _record_element_all(elm, name):
    """ XML helper function for extracting text from queries with multiple nodes """
    return elm.findall('arXiv:{}'.format(name), OAI_XML_NAMESPACES)

def parse_record(elm):
    """
    Parse the XML element of a single ArXiv article into a dictionary of
    attributes

    Parameters
    ----------
        elm : xml.etree.ElementTree.Element
            Element of the record of a single ArXiv article

    Returns
    -------
        output : dict
            Attributes of the arXiv article stored as a dict with the keys
            id, submitter, authors, title, comments, journal-ref, doi, abstract,
            report-no, categories, and versions
    """
    text_keys = [
        'id', 'submitter', 'authors', 'title', 'comments',
        'journal-ref', 'doi', 'abstract', 'report-no'
    ]
    output = {key: _record_element_text(elm, key) for key in text_keys}
    output['categories'] = [
        i.text for i in (_record_element_all(elm, 'categories') or [])
    ]
    output['versions'] = [
        i.attrib['version'] for i in _record_element_all(elm, 'version')
    ]
    return output

def parse_xml_listrecords(root):
    """
    Parse XML of one chunk of the metadata of 1000 ArXiv articles
    into a list of dictionaries

    Parameters
    ----------
        root : xml.etree.ElementTree.Element
            Element containing the records of an entire chunk of ArXiv queries

    Returns
    -------
        records, resumptionToken : list, str
            records is a list of up to 1000 dictionaries, each containing the
            attributes of a single arXiv article;
            resumptionToken is a string which is fed into the subsequent query
    """
    resumptionToken = root.find(
        'OAI:ListRecords/OAI:resumptionToken',
        OAI_XML_NAMESPACES
    )
    resumptionToken = resumptionToken.text if resumptionToken is not None else ''

    records = root.findall(
        'OAI:ListRecords/OAI:record/OAI:metadata/arXiv:arXivRaw',
        OAI_XML_NAMESPACES
    )
    records = [parse_record(p) for p in records]

    return records, resumptionToken

def check_xml_errors(root):
    """ Check for, log, and raise any OAI service errors in the XML """
    error = root.find('OAI:error', OAI_XML_NAMESPACES)

    if error is not None:
        log.error('OAI service returned error: {}'.format(error.text))
        raise RuntimeError(
            'OAI service returned error: {}'.format(error.text)
        )

def find_default_locations():
    """ Return the most recent default-named metadata save file, if any """
    pattern = os.path.join(DIR_BASE, 'arxiv-metadata-oai-*.json.gz')
    fn_outfile = sorted(glob.glob(pattern))

    if fn_outfile:
        return fn_outfile[-1]
    return None

def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True):
    """
    Download the metadata for every article in the ArXiv via the OAI API

    Parameters
    ----------
        outfile : str (default DIR_BASE/arxiv-metadata-oai-<date>.json.gz)
            name of the file where data is stored; each chunk of 1000 articles
            is appended as it is downloaded
        resumptionToken : str (default None)
            token which instructs the OAI server to continue feeding the next
            chunk
        autoresume : bool
            If True, look for a saved resumptionToken in the file
            <outfile>-resumptionToken.txt
    """
    date = datetime.date.today().isoformat()

    outfile = (
        outfile or # user-supplied
        find_default_locations() or # already in progress 
        os.path.join(
            DIR_BASE, 'arxiv-metadata-oai-{}.json.gz'.format(date)
        ) # new file
    )

    directory = os.path.split(outfile)[0]
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    tokenfile = '{}-resumptionToken.txt'.format(outfile)
    chunk_index = 0
    total_records = 0

    log.info('Saving metadata to "{}"'.format(outfile))

    if resumptionToken is None and autoresume:
        try:
            with open(tokenfile, 'r') as fin:
                resumptionToken = fin.read()
        except IOError:
            log.warning("No tokenfile found '{}'".format(tokenfile))
            log.info("Starting download from scratch...")

    while True:
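        # Each iteration fetches one chunk of up to 1000 records, appends it to
        # the gzip JSON-lines save file, and persists the resumptionToken so an
        # interrupted harvest can be resumed later.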
        log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
            chunk_index, total_records, resumptionToken)
        )
        xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
        check_xml_errors(xml_root)
        records, resumptionToken = parse_xml_listrecords(xml_root)

        chunk_index += 1
        total_records += len(records)

        with gzip.open(outfile, 'at', encoding='utf-8') as fout:
            for rec in records:
                fout.write(json.dumps(rec) + '\n')
        if resumptionToken:
            with open(tokenfile, 'w') as fout:
                fout.write(resumptionToken)
        else:
            log.info('No resumption token, query finished')
            return

        time.sleep(12)  # OAI server usually requires a 10s wait

def load_metadata(infile=None):
    """
    Load metadata saved by all_of_arxiv from a gzip-compressed file of JSON
    lines.

    Parameters
    ----------
        infile : str or None
            path to a save file written by all_of_arxiv. If None, the most
            recent default-named file in DIR_BASE is used.

    Returns
    -------
        article_attributes : list
            list of dicts, each of which contains the metadata attributes of
            the ArXiv articles
    """
    fname = infile or find_default_locations()
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        return [json.loads(line) for line in fin]

def hash_abstracts(metadata):
    """ Replace abstracts with their MD5 hash for legal distribution """
    metadata_no_abstract = []
    for record in metadata:
        m = record.copy()
        m['abstract_md5'] = hashlib.md5(m['abstract'].encode()).hexdigest()
        del m['abstract']
        metadata_no_abstract.append(m)
    return metadata_no_abstract

def validate_abstract_hashes(metadata, metadata_no_abstract):
    """ Validate that abstracts match the hashes """
    for m, n in zip(metadata, metadata_no_abstract):
        md5 = hashlib.md5(m['abstract'].encode()).hexdigest()
        if not md5 == n['abstract_md5']:
            return False
    return True
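

# Minimal command-line entry point sketch matching the Usage note in the module
# docstring; the argument handling here is an illustrative assumption, not part
# of the published module.
if __name__ == '__main__':
    import sys

    # Optional positional argument: path of the gzip JSON-lines save file,
    # e.g. data/<savefile>.json; otherwise all_of_arxiv picks its default.
    OUTFILE = sys.argv[1] if len(sys.argv) > 1 else None

    log.info('Starting arXiv OAI metadata harvest')
    all_of_arxiv(outfile=OUTFILE)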