"""
s3_bulk_download.py

authors: Matt Bierbaum and Colin Clement
date: 2019-02-27

This module uses AWS to request signed URLs for files in the ArXiv S3 bucket
(requester pays), downloads them, and then unpacks and converts the PDFs into text.

Note that at the time of writing, the ArXiv manifest contains 1.15 TB
of PDFs, which would cost about $103 to download from AWS S3.

see: https://arxiv.org/help/bulk_data_s3

Usage
-----

Set DIR_FULLTEXT as the directory where the text parsed from pdfs should be placed.
Set DIR_PDFTARS as the directory where the raw pdf tars should be placed.

```
import arxiv_public_data.s3_bulk_download as s3

# Download manifest file (or load if already downloaded)
>>> manifest = s3.get_manifest()

# Download tar files and convert pdf to text
# Costs money! Will only download if it does not find files
>>> s3.process_manifest_files(manifest)

# If you just want to download the PDFs and not convert them to text, use
>>> s3.download_check_tarfiles(manifest)
```
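
After a first pass you can check which pdf to text conversions failed and
re-run only those (a sketch using this module's helpers; it assumes the tar
files are already on disk):

```
>>> s3.process_missing(manifest, processes=4)
```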
"""

import os
import re
import gzip
import json
import glob
import shlex
import shutil
import tarfile
import boto3
import hashlib
import requests
import subprocess

from functools import partial
from multiprocessing import Pool
from collections import defaultdict
import xml.etree.ElementTree as ET

from arxiv_public_data import fulltext
from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER

logger = LOGGER.getChild('s3')

CHUNK_SIZE = 2**20  # 1MB
BUCKET_NAME = 'arxiv'
S3_PDF_MANIFEST = 'pdf/arXiv_pdf_manifest.xml'
S3_TEX_MANIFEST = 'src/arXiv_src_manifest.xml'
HEADERS = {'x-amz-request-payer': 'requester'}

s3 = boto3.client('s3', region_name='us-east-1')

def download_file(filename, outfile, chunk_size=CHUNK_SIZE, redownload=False,
                  dryrun=False):
    """
    Downloads filename from the ArXiv AWS S3 bucket, and returns streaming md5
    sum of the content
    Parameters
    ----------
        filename : str
            KEY corresponding to AWS bucket file
        outfile : str
            name and path of local file in which downloaded file will be stored
        (optional)
        chunk_size : int
            requests byte streaming size (so 500MB are not stored in memory
            prior to processing)
        redownload : bool
            Look to see if the file is already downloaded, and simply return its
            md5sum if it exists, unless redownload is True
        dryrun : bool
            If True, only log activity
    Returns
    -------
        md5sum : str
            md5 checksum of the contents of filename
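
    Examples
    --------
    A dry run only logs what would be requested and costs nothing; the output
    path shown is illustrative:

        >>> download_file(S3_PDF_MANIFEST, '/tmp/arxiv-manifest.xml.gz', dryrun=True)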
    """
    if os.path.exists(outfile) and not redownload:
        md5 = hashlib.md5()
        with gzip.open(outfile, 'rb') as fin:
            md5.update(fin.read())
        return md5.hexdigest()

    md5 = hashlib.md5()
    url = s3.generate_presigned_url(
        "get_object",
        Params={
            "Bucket": BUCKET_NAME, "Key": filename, "RequestPayer": 'requester'
        }
    )
    if not dryrun:
        logger.info('Requesting "{}" (costs money!)'.format(filename))
        request = requests.get(url, stream=True)
        response_iter = request.iter_content(chunk_size=chunk_size)
        logger.info("\t Writing {}".format(outfile))
        with gzip.open(outfile, 'wb') as fout:
            for i, chunk in enumerate(response_iter):
                fout.write(chunk)
                md5.update(chunk)
    else:
        logger.info('Requesting "{}" (free!)'.format(filename))
        logger.info("\t Writing {}".format(outfile))
    return md5.hexdigest()

def default_manifest_filename():
    return os.path.join(DIR_PDFTARS, 'arxiv-manifest.xml.gz')

def get_manifest(filename=None, redownload=False):
    """
    Get the file manifest for the ArXiv
    Parameters
    ----------
        filename : str
            path of the local manifest file; defaults to
            DIR_PDFTARS/arxiv-manifest.xml.gz
        redownload : bool
            If true, forces redownload of manifest even if it exists
    Returns
    -------
        file_information : list of dicts
            each dict contains the file metadata
    """
    manifest_file = filename or default_manifest_filename()
    md5 = download_file(
        S3_PDF_MANIFEST, manifest_file, redownload=redownload, dryrun=False
    )
    manifest = gzip.open(manifest_file, 'rb').read()
    return parse_manifest(manifest)

def parse_manifest(manifest):
    """
    Parse the XML of the ArXiv manifest file.

    Parameters
    ----------
        manifest : str
            xml string from the ArXiv manifest file

    Returns
    -------
        file_information : list of dicts
            One dict for each file, containing the filename, size, md5sum,
            and other metadata
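
    Example element (keys mirror the manifest XML tags; values are illustrative):

        {'filename': 'pdf/arXiv_pdf_1901_001.tar',
         'md5sum': '...',
         'first_item': '1901.00001',
         'last_item': '1901.00500'}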
    """
    root = ET.fromstring(manifest)
    return [
        {c.tag: f.find(c.tag).text for c in f}
        for f in root.findall('file')
    ]

def _tar_to_filename(filename):
    return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz'

def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=False):
    """ Download filename, check its md5sum, and form the output path """
    outname = _tar_to_filename(filename)
    md5_downloaded = download_file(
        filename, outname, dryrun=dryrun, redownload=redownload
    )

    if not dryrun:
        if md5_expected != md5_downloaded:
            msg = "MD5 '{}' does not match expected '{}' for file '{}'".format(
                md5_downloaded, md5_expected, filename
            )
            raise AssertionError(msg)

    return outname

def download_check_tarfiles(list_of_fileinfo, dryrun=False):
    """
    Download tar files from the ArXiv manifest and check that their MD5sums
    match

    Parameters
    ----------
        list_of_fileinfo : list
            Some elements of results of get_manifest
        (optional)
        dryrun : bool
            If True, only log activity
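
    Example (a dry run over the first few manifest entries; nothing is
    fetched, and `manifest` is assumed to be the list returned by get_manifest):

        >>> download_check_tarfiles(manifest[:5], dryrun=True)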
    """
    for fileinfo in list_of_fileinfo:
        download_check_tarfile(fileinfo['filename'], fileinfo['md5sum'], dryrun=dryrun)

def _call(cmd, dryrun=False, debug=False):
    """ Spawn a subprocess and execute the string in cmd """
    if dryrun:
        logger.info(cmd)
        return 0
    else:
        return subprocess.check_call(
            shlex.split(cmd), stderr=None if debug else open(os.devnull, 'w')
        )

def _make_pathname(filename):
    """
    Make filename path for text document, sorted like on arXiv servers.
    Parameters
    ----------
        filename : str
            string filename of arXiv article
    Returns
    -------
        pathname : str
            pathname in which to store the article following
            * Old ArXiv IDs: e.g. hep-ph0001001.txt returns
                DIR_FULLTEXT/hep-ph/0001/hep-ph0001001.txt
            * New ArXiv IDs: e.g. 1501.13851.txt returns
                DIR_FULLTEXT/arxiv/1501/1501.13851.txt
    """
    basename = os.path.basename(filename)
    fname = os.path.splitext(basename)[0]
    if '.' in fname:  # new style ArXiv ID
        yearmonth = fname.split('.')[0]
        return os.path.join(DIR_FULLTEXT, 'arxiv', yearmonth, basename)
    # old style ArXiv ID
    cat, aid = re.split(r'(\d+)', fname)[:2]
    yearmonth = aid[:4]
    return os.path.join(DIR_FULLTEXT, cat, yearmonth, basename)

def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False,
                          timelimit=fulltext.TIMELIMIT):
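    """
    Unpack an already-downloaded tar file and convert the PDFs inside it to
    text.

    If pdfnames is given, only those tar members are extracted and converted;
    otherwise the entire tar file is processed. The extracted PDFs are removed
    after conversion, leaving only the text files.
    """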
    outname = _tar_to_filename(filename)

    if not os.path.exists(outname):
        msg = 'Tarfile from manifest not found {}, skipping...'.format(outname)
        logger.error(msg)
        return

    # unpack tar file
    if pdfnames:
        namelist = ' '.join(pdfnames)
        cmd = 'tar --one-top-level -C {} -xf {} {}'
        cmd = cmd.format(DIR_PDFTARS, outname, namelist)
    else:
        cmd = 'tar --one-top-level -C {} -xf {}'.format(DIR_PDFTARS, outname)
    _call(cmd, dryrun)

    basename = os.path.splitext(os.path.basename(filename))[0]
    pdfdir = os.path.join(DIR_PDFTARS, basename, basename.split('_')[2])

    # Run fulltext to convert pdfs in tardir into *.txt
    converts = fulltext.convert_directory_parallel(
        pdfdir, processes=processes, timelimit=timelimit
    )

    # move txt into final file structure
    txtfiles = glob.glob('{}/*.txt'.format(pdfdir))
    for tf in txtfiles:
        mvfn = _make_pathname(tf)
        dirname = os.path.dirname(mvfn)
        if not os.path.exists(dirname):
            _call('mkdir -p {}'.format(dirname), dryrun)

        if not dryrun:
            shutil.move(tf, mvfn)

    # clean up pdfs
    _call('rm -rf {}'.format(os.path.join(DIR_PDFTARS, basename)), dryrun)

def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processes=1):
    """
    Download and process one of the tar files from the ArXiv manifest.
    Download, unpack, and spawn the Docker image for converting pdf2text.
    It will only try to download the file if it does not already exist.

    The tar file will be stored as DIR_PDFTARS/<basename of fileinfo['filename']>.gz
    and the resulting arXiv articles will be stored as
    DIR_FULLTEXT/arxiv/<yearmonth>/<aid>.txt for new style arXiv IDs and
    DIR_FULLTEXT/<category>/<yearmonth>/<aid>.txt for old style arXiv IDs.

    Parameters
    ----------
        fileinfo : dict
            dictionary of file information from parse_manifest
        (optional)
        pdfnames : list of str
            if given, only these members of the tar file are extracted and
            converted
        dryrun : bool
            If True, only log activity
        debug : bool
            Silence stderr of Docker _call if debug is False
        processes : int
            number of parallel workers for the pdf to text conversion
    """
    filename = fileinfo['filename']
    md5sum = fileinfo['md5sum']

    if check_if_any_processed(fileinfo):
        logger.info('Tar file appears processed, skipping {}...'.format(filename))
        return

    logger.info('Processing tar "{}" ...'.format(filename))
    process_tarfile_inner(filename, pdfnames=pdfnames, processes=processes, dryrun=dryrun)

def process_manifest_files(list_of_fileinfo, processes=1, dryrun=False):
    """
    Download PDFs from the ArXiv AWS S3 bucket and convert each pdf to text.
    If files are already downloaded, it will only process them.

    Parameters
    ----------
        list_of_fileinfo : list
            Some elements of results of get_manifest
        (optional)
        processes : int
            number of parallel workers to spawn (roughly as many CPUs as you have)
        dryrun : bool
            If True, only log activity
    """
    for fileinfo in list_of_fileinfo:
        process_tarfile(fileinfo, dryrun=dryrun, processes=processes)

def check_if_any_processed(fileinfo):
    """
    Spot check a tarfile to see if the pdfs have been converted to text,
    given an element of the s3 manifest
    """
    first = _make_pathname(fileinfo['first_item']+'.txt')
    last = _make_pathname(fileinfo['last_item']+'.txt')
    return os.path.exists(first) and os.path.exists(last)

def generate_tarfile_indices(manifest):
    """
    Go through the manifest and for every tarfile, get a list of the PDFs
    that should be contained within it. This is a separate function because
    even checking the tars is rather slow.

    Returns
    -------
    index : dictionary
        keys: tarfile, values: list of pdfs
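        e.g. (illustrative):
            {'pdf/arXiv_pdf_1901_001.tar': ['1901/1901.00001.pdf', ...]}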
    """
    index = {}

    for fileinfo in manifest:
        name = fileinfo['filename']
        logger.info("Indexing {}...".format(name))

        tarname = os.path.join(DIR_PDFTARS, os.path.basename(name))+'.gz'
        files = [i for i in tarfile.open(tarname).getnames() if i.endswith('.pdf')]

        index[name] = files
    return index

def check_missing_txt_files(index):
    """
    Use the index file from `generate_tarfile_indices` to check which pdf->txt
    conversions are outstanding.
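
    Returns a defaultdict mapping tar filename -> list of pdf members whose
    text output is missing, e.g. (illustrative):

        {'pdf/arXiv_pdf_1901_001.tar': ['1901/1901.00042.pdf']}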
    """
    missing = defaultdict(list)
    for tar, pdflist in index.items():
        logger.info("Checking {}...".format(tar))
        for pdf in pdflist:
            txt = _make_pathname(pdf).replace('.pdf', '.txt')

            if not os.path.exists(txt):
                missing[tar].append(pdf)

    return missing

def rerun_missing(missing, processes=1):
    """
    Use the output of `check_missing_txt_files` to re-run the pdf to text
    conversion for the files that are missing. Conversions can fail for
    various reasons, so a longer time limit is used on the retry.
    """
    sort = list(reversed(
        sorted([(k, v) for k, v in missing.items()], key=lambda x: len(x[1]))
    ))

    for tar, names in sort:
        logger.info("Running {} ({} to do)...".format(tar, len(names)))
        process_tarfile_inner(
            tar, pdfnames=names, processes=processes,
            timelimit=5 * fulltext.TIMELIMIT
        )

def process_missing(manifest, processes=1):
    """
    Do the full process of figuring out which conversions are missing and
    re-running them
    """
    indexfile = os.path.join(DIR_PDFTARS, 'manifest-index.json')

    if not os.path.exists(indexfile):
        index = generate_tarfile_indices(manifest)
        json.dump(index, open(indexfile, 'w'))

    index = json.load(open(indexfile))
    missing = check_missing_txt_files(index)
    rerun_missing(missing, processes=processes)