Spaces:
Build error
Build error
import os | |
import subprocess | |
import shlex | |
from collections import defaultdict | |
from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER | |
def id_to_tarpdf(n):
    """
    Map an article identifier to the path of its PDF inside a tar archive.

    Identifiers containing a '.' use the text before the first '.' as the
    directory. Otherwise the first four characters following the first '/'
    form the directory and the '/' is removed from the file name.
    """
    if '.' not in n:
        # Slash-style identifier: the directory comes from the part after '/'.
        prefix = n.split('/')[1][:4]
        stem = n.replace('/', '')
    else:
        prefix = n.split('.')[0]
        stem = n
    return '{}/{}.pdf'.format(prefix, stem)
def _call(cmd, dryrun=False, debug=False): | |
""" Spawn a subprocess and execute the string in cmd """ | |
return subprocess.check_call( | |
shlex.split(cmd), stderr=None if debug else open(os.devnull, 'w') | |
) | |
def _tar_to_filename(filename):
    """ Path under DIR_PDFTARS for the basename of `filename`, plus '.gz' """
    leaf = os.path.basename(filename)
    local_path = os.path.join(DIR_PDFTARS, leaf)
    return local_path + '.gz'
def extract_files(tarfile, pdfs, outdir):
    """
    Extract the list of `pdfs` filenames from `tarfile` into the `outdir`

    Parameters
    ----------
        tarfile : string
            Name of the tar archive; only its basename is used, to locate
            the corresponding '.gz' file under DIR_PDFTARS.
        pdfs : list of strings
            Article identifiers, converted to archive member names with
            `id_to_tarpdf`.
        outdir : string
            Directory that receives the extracted PDFs; created if missing.

    Raises
    ------
        subprocess.CalledProcessError
            If tar, cp or rm exits with a non-zero status.
    """
    pdfs = list(pdfs)
    if not pdfs:
        # With no member names tar would unpack the entire archive and
        # `cp` would then fail for lack of source operands.
        return

    members = [id_to_tarpdf(i) for i in pdfs]
    outname = _tar_to_filename(tarfile)
    basename = os.path.splitext(os.path.basename(tarfile))[0]
    # Scratch directory the members are expected to land in
    # (tar is run with --one-top-level inside DIR_PDFTARS).
    tdir = os.path.join(DIR_PDFTARS, basename)

    # Without an existing directory, `cp` of a single file would create a
    # regular file named `outdir` instead of copying into it.
    os.makedirs(outdir, exist_ok=True)

    # _call() re-tokenizes the command with shlex.split, so every path is
    # quoted to survive that round trip even if it contains whitespace.
    namelist = ' '.join(shlex.quote(m) for m in members)
    outpdfs = ' '.join(shlex.quote(os.path.join(tdir, m)) for m in members)

    cmd0 = 'tar --one-top-level -C {} -xf {} {}'.format(
        shlex.quote(str(DIR_PDFTARS)), shlex.quote(outname), namelist
    )
    cmd1 = 'cp -a {} {}'.format(outpdfs, shlex.quote(str(outdir)))
    cmd2 = 'rm -rf {}'.format(shlex.quote(tdir))

    try:
        _call(cmd0)
        _call(cmd1)
    finally:
        # Remove the scratch directory even when extraction or copy fails.
        _call(cmd2)
def call_list(ai, manifest):
    """
    Convert a list of articles and the tar manifest into a dictionary
    of the tarfiles and the pdfs needed from them.

    Parameters
    ----------
        ai : list of dicts
            Article records; the 'id' entry of each one is used.
        manifest : dict
            Maps each tarfile to the pdf member names it contains.

    Returns
    -------
        tars : defaultdict(list)
            Maps each tarfile to the article ids whose pdf it contains.
            Articles without an 'id', or whose pdf is not listed in the
            manifest, are left out.
    """
    # Invert the manifest: pdf member name -> tarfile holding it.
    pdf_to_tar = {}
    for tar, pdfs in manifest.items():
        for pdf in pdfs:
            pdf_to_tar[pdf] = tar

    tars = defaultdict(list)
    for article in ai:
        aid = article.get('id')
        if not aid:
            # No identifier to look up; id_to_tarpdf cannot handle it.
            continue
        member = id_to_tarpdf(aid)
        if member in pdf_to_tar:
            tars[pdf_to_tar[member]].append(aid)
    return tars
def extract_by_filter(oai, tarmanifest, func, outdir):
    """
    User-facing function that extracts a section of articles from
    the entire arxiv.

    Parameters
    ----------
        oai : list of dicts
            The OAI metadata from `oai_metadata.load_metadata`
        tarmanifest : dict
            Dictionary describing the S3 downloads,
            `s3_bulk_download.get_manifest`; maps each tarfile to the
            pdfs it contains
        func : function
            Filter to apply to OAI metadata to get list of articles
        outdir : string
            Directory in which to place the PDFs and metadata for the
            slice; created if it does not exist
    """
    # Imported locally: `json` is needed for the metadata dump below and is
    # not among the module-level imports visible in this file.
    import json

    articles = func(oai)
    tarmap = call_list(articles, tarmanifest)

    # Needed for 'metadata.json' even when no tarfile matched.
    os.makedirs(outdir, exist_ok=True)

    for tar, pdfs in tarmap.items():
        extract_files(tar, pdfs, outdir=outdir)

    with open(os.path.join(outdir, 'metadata.json'), 'w') as f:
        json.dump(articles, f)