File size: 2,679 Bytes
a8d4e3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import json
import os
import shlex
import subprocess
from collections import defaultdict

from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER

def id_to_tarpdf(n):
    """
    Build the `<directory>/<name>.pdf` path string for an article id.

    If `n` contains a '.', the directory is the text before the first '.'
    and the name is `n` unchanged, e.g. '1501.00001' ->
    '1501/1501.00001.pdf'.  Otherwise the directory is the first four
    characters following the first '/' and the name is `n` with every '/'
    removed, e.g. 'hep-th/9901001' -> '9901/hep-th9901001.pdf'.
    """
    if '.' not in n:
        directory = n.split('/')[1][:4]
        stem = n.replace('/', '')
    else:
        directory = n.split('.')[0]
        stem = n
    return '{}/{}.pdf'.format(directory, stem)

def _call(cmd, dryrun=False, debug=False):
    """
    Spawn a subprocess and execute the string in cmd

    Parameters
    ----------
    cmd : string
        Command line to run.  It is tokenized with `shlex.split` and
        executed directly, without a shell.

    dryrun : bool
        Accepted for interface compatibility; it currently has no effect
        and the command is always executed.

    debug : bool
        When True the child's stderr is inherited from this process,
        otherwise it is discarded.

    Returns
    -------
    int
        The exit status of the command, which is 0 whenever this returns.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.
    """
    # subprocess.DEVNULL discards stderr without opening a new handle on
    # os.devnull for every call that would then never be closed.
    stderr = None if debug else subprocess.DEVNULL
    return subprocess.check_call(shlex.split(cmd), stderr=stderr)

def _tar_to_filename(filename):
    """
    Return the path under `DIR_PDFTARS` formed from the basename of
    `filename` with '.gz' appended.
    """
    gz_name = os.path.basename(filename) + '.gz'
    return os.path.join(DIR_PDFTARS, gz_name)

def extract_files(tarfile, pdfs, outdir):
    """
    Extract the list of `pdfs` filenames from `tarfile` into the `outdir`

    The requested members are unpacked into a scratch directory under
    `DIR_PDFTARS`, copied into `outdir`, and the scratch directory is then
    removed, including when unpacking or copying fails.

    Parameters
    ----------
    tarfile : string
        Name of the tar archive.  Only its basename is used: the archive
        read is the one returned by `_tar_to_filename`.

    pdfs : list of strings
        Article ids; each is converted to an archive member path with
        `id_to_tarpdf`.  If empty, nothing is done.

    outdir : string
        Destination passed to `cp` for the extracted PDFs.

    Raises
    ------
    subprocess.CalledProcessError
        If `tar`, `cp` or `rm` exits with a non-zero status.
    """
    members = [id_to_tarpdf(i) for i in pdfs]
    if not members:
        # With no member names `tar -x` would unpack the entire archive.
        return

    archive = _tar_to_filename(tarfile)
    basename = os.path.splitext(os.path.basename(tarfile))[0]
    # --one-top-level makes tar unpack below a directory named after the
    # archive; this is the path that directory is expected to have.
    tdir = os.path.join(DIR_PDFTARS, basename)
    extracted = [os.path.join(tdir, member) for member in members]

    def _quoted(args):
        # _call tokenizes with shlex.split, so quote each argument to keep
        # paths containing spaces or quotes as single tokens.
        return ' '.join(shlex.quote(arg) for arg in args)

    cmd0 = 'tar --one-top-level -C {} -xf {} {}'.format(
        shlex.quote(DIR_PDFTARS), shlex.quote(archive), _quoted(members)
    )
    cmd1 = 'cp -a {} {}'.format(_quoted(extracted), shlex.quote(outdir))
    cmd2 = 'rm -rf {}'.format(shlex.quote(tdir))

    try:
        _call(cmd0)
        _call(cmd1)
    finally:
        # Always remove the scratch directory so a failed extraction or
        # copy does not leave partial output behind.
        _call(cmd2)

def call_list(ai, manifest):
    """
    Convert a list of articles and the tar manifest into a dictionary
    of the tarfiles and the pdfs needed from them.

    Parameters
    ----------
    ai : iterable of dicts
        Article records; the 'id' entry of each is used.  Records whose
        'id' is missing or empty are skipped.

    manifest : dict
        Maps each tar name to the collection of member pdf paths it holds,
        in the format produced by `id_to_tarpdf`.

    Returns
    -------
    collections.defaultdict
        Maps tar name to the list of article ids found in that tar.
        Articles whose pdf path is not in the manifest are omitted.
    """
    # Invert the manifest: member pdf path -> tar containing it.
    pdf_to_tar = {}
    for tar, pdfs in manifest.items():
        for pdf in pdfs:
            pdf_to_tar[pdf] = tar

    tars = defaultdict(list)
    for article in ai:
        aid = article.get('id')
        if not aid:
            # Without an id there is no pdf path to look up.
            continue

        member = id_to_tarpdf(aid)
        if member not in pdf_to_tar:
            continue
        tars[pdf_to_tar[member]].append(aid)

    return tars

def extract_by_filter(oai, tarmanifest, func, outdir):
    """
    User-facing function that extracts a section of articles from
    the entire arxiv.

    The PDFs of the selected articles are copied into `outdir` and the
    selected metadata is written to `metadata.json` in the same directory.

    Parameters
    ----------
    oai : list of dicts
        The OAI metadata from `oai_metadata.load_metadata`

    tarmanifest : dict
        Dictionary describing the S3 downloads, `s3_bulk_download.get_manifest`.
        It is consumed as a mapping of tar name to the pdf paths it holds.

    func : function
        Filter to apply to OAI metadata to get list of articles

    outdir : string
        Directory in which to place the PDFs and metadata for the slice
    """
    # Materialize the selection: it is iterated by call_list and then
    # serialized, which a one-shot iterator or generator cannot support.
    articles = list(func(oai))
    tarmap = call_list(articles, tarmanifest)

    for tar, pdfs in tarmap.items():
        extract_files(tar, pdfs, outdir=outdir)

    with open(os.path.join(outdir, 'metadata.json'), 'w') as f:
        json.dump(articles, f)