"""
s3_bulk_download.py
authors: Matt Bierbaum and Colin Clement
date: 2019-02-27
This module uses AWS to generate presigned URLs for requesting files from the
ArXiv S3 bucket. It then unpacks the tars and converts the PDFs into text.
Note that at the time of writing, the ArXiv manifest contains 1.15 TB of PDFs,
which would cost roughly $103 to download from AWS S3 (requester pays).
see: https://arxiv.org/help/bulk_data_s3
Usage
-----
Set DIR_FULLTEXT as the directory where the text parsed from pdfs should be placed.
Set DIR_PDFTARS as the directory where the raw pdf tars should be placed.
```
>>> import arxiv_public_data.s3_bulk_download as s3
# Download manifest file (or load if already downloaded)
>>> manifest = s3.get_manifest()
# Download tar files and convert pdf to text
# Costs money! Will only download if it does not find files
>>> s3.process_manifest_files(manifest)
# If you just want to download the PDFs and not convert to text use
>>> s3.download_check_tarfiles(manifest)
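# To find and re-run any pdf-to-text conversions that failed (e.g. timeouts),
# the helpers defined at the bottom of this module can be driven in one call
>>> s3.process_missing(manifest)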
```
"""
import os
import re
import gzip
import json
import glob
import shlex
import shutil
import tarfile
import boto3
import hashlib
import requests
import subprocess
from functools import partial
from multiprocessing import Pool
from collections import defaultdict
import xml.etree.ElementTree as ET
from arxiv_public_data import fulltext
from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER
logger = LOGGER.getChild('s3')
CHUNK_SIZE = 2**20 # 1MB
BUCKET_NAME = 'arxiv'
S3_PDF_MANIFEST = 'pdf/arXiv_pdf_manifest.xml'
S3_TEX_MANIFEST = 'src/arXiv_src_manifest.xml'
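# requester-pays header for arXiv's bulk-data bucket (downloads are billed to the requester)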
HEADERS = {'x-amz-request-payer': 'requester'}
s3 = boto3.client('s3', region_name='us-east-1')
def download_file(filename, outfile, chunk_size=CHUNK_SIZE, redownload=False,
dryrun=False):
"""
Downloads filename from the ArXiv AWS S3 bucket, and returns streaming md5
sum of the content
Parameters
----------
filename : str
KEY corresponding to AWS bucket file
    outfile : str
name and path of local file in which downloaded file will be stored
(optional)
chunk_size : int
requests byte streaming size (so 500MB are not stored in memory
prior to processing)
redownload : bool
Look to see if file is already downloaded, and simply return md5sum
        if it exists, unless redownload is True
dryrun : bool
If True, only log activity
Returns
-------
md5sum : str
md5 checksum of the contents of filename
"""
    if os.path.exists(outfile) and not redownload:
        # hash the existing file in chunks rather than reading it all into memory
        md5 = hashlib.md5()
        with gzip.open(outfile, 'rb') as fin:
            for chunk in iter(lambda: fin.read(chunk_size), b''):
                md5.update(chunk)
        return md5.hexdigest()
md5 = hashlib.md5()
url = s3.generate_presigned_url(
"get_object",
Params={
"Bucket": BUCKET_NAME, "Key": filename, "RequestPayer": 'requester'
}
)
if not dryrun:
logger.info('Requesting "{}" (costs money!)'.format(filename))
request = requests.get(url, stream=True)
response_iter = request.iter_content(chunk_size=chunk_size)
logger.info("\t Writing {}".format(outfile))
with gzip.open(outfile, 'wb') as fout:
for i, chunk in enumerate(response_iter):
fout.write(chunk)
md5.update(chunk)
else:
logger.info('Requesting "{}" (free!)'.format(filename))
logger.info("\t Writing {}".format(outfile))
return md5.hexdigest()
def default_manifest_filename():
return os.path.join(DIR_PDFTARS, 'arxiv-manifest.xml.gz')
def get_manifest(filename=None, redownload=False):
"""
Get the file manifest for the ArXiv
Parameters
----------
redownload : bool
If true, forces redownload of manifest even if it exists
Returns
-------
file_information : list of dicts
each dict contains the file metadata
"""
manifest_file = filename or default_manifest_filename()
md5 = download_file(
S3_PDF_MANIFEST, manifest_file, redownload=redownload, dryrun=False
)
    with gzip.open(manifest_file, 'rb') as fin:
        manifest = fin.read()
    return parse_manifest(manifest)
def parse_manifest(manifest):
"""
Parse the XML of the ArXiv manifest file.
Parameters
----------
manifest : str
xml string from the ArXiv manifest file
Returns
-------
file_information : list of dicts
One dict for each file, containing the filename, size, md5sum,
and other metadata
"""
root = ET.fromstring(manifest)
return [
        # iterate children directly (Element.getchildren() was removed in Python 3.9)
        {c.tag: f.find(c.tag).text for c in f}
for f in root.findall('file')
]
def _tar_to_filename(filename):
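    # e.g. 'pdf/arXiv_pdf_1901_001.tar' -> DIR_PDFTARS/arXiv_pdf_1901_001.tar.gz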
return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz'
def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=False):
""" Download filename, check its md5sum, and form the output path """
outname = _tar_to_filename(filename)
md5_downloaded = download_file(
filename, outname, dryrun=dryrun, redownload=redownload
)
if not dryrun:
if md5_expected != md5_downloaded:
msg = "MD5 '{}' does not match expected '{}' for file '{}'".format(
md5_downloaded, md5_expected, filename
)
raise AssertionError(msg)
return outname
def download_check_tarfiles(list_of_fileinfo, dryrun=False):
"""
Download tar files from the ArXiv manifest and check that their MD5sums
match
Parameters
----------
list_of_fileinfo : list
Some elements of results of get_manifest
(optional)
dryrun : bool
If True, only log activity
"""
for fileinfo in list_of_fileinfo:
download_check_tarfile(fileinfo['filename'], fileinfo['md5sum'], dryrun=dryrun)
def _call(cmd, dryrun=False, debug=False):
""" Spawn a subprocess and execute the string in cmd """
if dryrun:
logger.info(cmd)
return 0
else:
return subprocess.check_call(
            shlex.split(cmd), stderr=None if debug else subprocess.DEVNULL
)
def _make_pathname(filename):
"""
Make filename path for text document, sorted like on arXiv servers.
Parameters
----------
filename : str
string filename of arXiv article
(optional)
Returns
-------
pathname : str
pathname in which to store the article following
        * Old ArXiv IDs: e.g. hep-ph0001001.txt returns
            DIR_FULLTEXT/hep-ph/0001/hep-ph0001001.txt
        * New ArXiv IDs: e.g. 1501.13851.txt returns
            DIR_FULLTEXT/arxiv/1501/1501.13851.txt
"""
basename = os.path.basename(filename)
fname = os.path.splitext(basename)[0]
if '.' in fname: # new style ArXiv ID
yearmonth = fname.split('.')[0]
return os.path.join(DIR_FULLTEXT, 'arxiv', yearmonth, basename)
# old style ArXiv ID
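    # e.g. 'hep-ph0001001' -> cat 'hep-ph', aid '0001001', yearmonth '0001'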
cat, aid = re.split(r'(\d+)', fname)[:2]
yearmonth = aid[:4]
return os.path.join(DIR_FULLTEXT, cat, yearmonth, basename)
def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False,
timelimit=fulltext.TIMELIMIT):
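    """
    Unpack an already-downloaded tar of PDFs, convert each PDF to text in
    parallel, and move the resulting .txt files into the DIR_FULLTEXT tree
    laid out by `_make_pathname`. If `pdfnames` is given, only those members
    of the tar are extracted and converted.
    """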
outname = _tar_to_filename(filename)
if not os.path.exists(outname):
msg = 'Tarfile from manifest not found {}, skipping...'.format(outname)
logger.error(msg)
return
# unpack tar file
if pdfnames:
namelist = ' '.join(pdfnames)
cmd = 'tar --one-top-level -C {} -xf {} {}'
cmd = cmd.format(DIR_PDFTARS, outname, namelist)
else:
cmd = 'tar --one-top-level -C {} -xf {}'.format(DIR_PDFTARS, outname)
_call(cmd, dryrun)
basename = os.path.splitext(os.path.basename(filename))[0]
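    # the tar is expected to unpack (via --one-top-level) as DIR_PDFTARS/<tarname>/<yearmonth>/*.pdf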
pdfdir = os.path.join(DIR_PDFTARS, basename, basename.split('_')[2])
# Run fulltext to convert pdfs in tardir into *.txt
converts = fulltext.convert_directory_parallel(
pdfdir, processes=processes, timelimit=timelimit
)
# move txt into final file structure
txtfiles = glob.glob('{}/*.txt'.format(pdfdir))
for tf in txtfiles:
mvfn = _make_pathname(tf)
dirname = os.path.dirname(mvfn)
if not os.path.exists(dirname):
_call('mkdir -p {}'.format(dirname), dryrun)
if not dryrun:
shutil.move(tf, mvfn)
# clean up pdfs
_call('rm -rf {}'.format(os.path.join(DIR_PDFTARS, basename)), dryrun)
def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processes=1):
"""
    Download and process one of the tar files from the ArXiv manifest:
    download it, unpack it, and convert the PDFs it contains to text.
    It will only try to download the tar file if it does not already exist.
    The tar file will be stored as DIR_PDFTARS/<tar basename>.gz and the
    resulting text will be stored in the subdirectory
    DIR_FULLTEXT/<category>/<yearmonth>/<aid>.txt for old style arXiv IDs and
    DIR_FULLTEXT/arxiv/<yearmonth>/<aid>.txt for new style arXiv IDs.
Parameters
----------
fileinfo : dict
dictionary of file information from parse_manifest
(optional)
    pdfnames : list of str
        subset of PDFs within the tar to extract and convert (default: all)
    dryrun : bool
        If True, only log activity
    debug : bool
        If False, silence the stderr of subprocess calls
    processes : int
        number of parallel conversion workers to spawn
"""
filename = fileinfo['filename']
md5sum = fileinfo['md5sum']
if check_if_any_processed(fileinfo):
logger.info('Tar file appears processed, skipping {}...'.format(filename))
return
logger.info('Processing tar "{}" ...'.format(filename))
    process_tarfile_inner(filename, pdfnames=pdfnames, processes=processes,
                          dryrun=dryrun)
def process_manifest_files(list_of_fileinfo, processes=1, dryrun=False):
"""
    Download PDFs from the ArXiv AWS S3 bucket and convert each pdf to text.
    If files are already downloaded, it will only process them.
    Parameters
    ----------
list_of_fileinfo : list
Some elements of results of get_manifest
(optional)
processes : int
        number of parallel workers to spawn (roughly as many CPUs as you have)
dryrun : bool
If True, only log activity
"""
for fileinfo in list_of_fileinfo:
process_tarfile(fileinfo, dryrun=dryrun, processes=processes)
def check_if_any_processed(fileinfo):
"""
Spot check a tarfile to see if the pdfs have been converted to text,
given an element of the s3 manifest
"""
first = _make_pathname(fileinfo['first_item']+'.txt')
last = _make_pathname(fileinfo['last_item']+'.txt')
return os.path.exists(first) and os.path.exists(last)
def generate_tarfile_indices(manifest):
"""
Go through the manifest and for every tarfile, get a list of the PDFs
that should be contained within it. This is a separate function because
even checking the tars is rather slow.
Returns
-------
index : dictionary
keys: tarfile, values: list of pdfs
"""
index = {}
for fileinfo in manifest:
name = fileinfo['filename']
logger.info("Indexing {}...".format(name))
tarname = os.path.join(DIR_PDFTARS, os.path.basename(name))+'.gz'
files = [i for i in tarfile.open(tarname).getnames() if i.endswith('.pdf')]
index[name] = files
return index
def check_missing_txt_files(index):
"""
Use the index file from `generate_tarfile_indices` to check which pdf->txt
conversions are outstanding.
"""
missing = defaultdict(list)
for tar, pdflist in index.items():
logger.info("Checking {}...".format(tar))
for pdf in pdflist:
txt = _make_pathname(pdf).replace('.pdf', '.txt')
if not os.path.exists(txt):
missing[tar].append(pdf)
return missing
def rerun_missing(missing, processes=1):
"""
    Use the output of `check_missing_txt_files` to re-run the pdf-to-text
    conversions that are missing. Conversions can fail for various reasons,
    so the re-run uses a longer time limit (5x fulltext.TIMELIMIT).
"""
    sort = sorted(missing.items(), key=lambda x: len(x[1]), reverse=True)
for tar, names in sort:
logger.info("Running {} ({} to do)...".format(tar, len(names)))
process_tarfile_inner(
tar, pdfnames=names, processes=processes,
timelimit=5 * fulltext.TIMELIMIT
)
def process_missing(manifest, processes=1):
"""
    Do the full process of figuring out which conversions are missing and re-running them
"""
indexfile = os.path.join(DIR_PDFTARS, 'manifest-index.json')
if not os.path.exists(indexfile):
index = generate_tarfile_indices(manifest)
        with open(indexfile, 'w') as fout:
            json.dump(index, fout)
    with open(indexfile) as fin:
        index = json.load(fin)
missing = check_missing_txt_files(index)
rerun_missing(missing, processes=processes)