File size: 1,986 Bytes
a8d4e3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re

# Building blocks. Text extraction tends to inject stray whitespace between
# the characters of the stamp, so every token tolerates whitespace around it.

# a single digit / a single word character, optionally padded by whitespace
SPACE_DIGIT = r'\s*\d\s*'
SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*'

# One or more digits (resp. word characters) with optional whitespace before,
# between and after them. These are deliberately written as (?:\s*X)+\s*
# instead of (?:\s*X\s*)+ : both forms accept exactly the same strings, but in
# the latter every whitespace run can be claimed by either of two neighbouring
# repetitions, so a failed match backtracks exponentially.
SPACE_NUMBER = r'(?:\s*\d)+\s*'
SPACE_WORD = r'(?:\s*[a-zA-Z\.-])+\s*'

# old style ID, 7 digits in a row
# NOTE(review): real old-style identifiers carry an archive prefix such as
# "hep-th/"; this pattern expects the digits right after "arXiv:" -- confirm
# whether such stamps are meant to be handled here.
RE_NUM_OLD = SPACE_DIGIT * 7

# new style ID: 4 digits, a literal dot, then 4 or 5 digits
RE_NUM_NEW = (
    SPACE_DIGIT * 4 +
    r'\.' +
    SPACE_DIGIT * 4 + r'(?:{})?'.format(SPACE_DIGIT)
)

# the optional version part: v1 V2 v 1, etc
RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?'

# the word arxiv, as printed by the autotex: "arXiv:" (case sensitive)
RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*'

# any words within square brackets, e.g. [cs.A I]
RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD)

# day digits, month word, then a 2 to 4 digit year, e.g. "29 Jan 2012"
RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){{2,4}}'.format(SPACE_DIGIT)

# the full identifier for the banner:
# arXiv:<new or old id>[version][categories]<date>
RE_ARXIV_ID = (
    RE_ARXIV +
    r'(?:' +
    r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) +
    r')' +
    RE_VERSION +
    RE_CATEGORIES +
    RE_DATE
)

REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID)


def _extract_arxiv_stamp(txt):
    """
    Cut the first arXiv stamp out of a piece of text.

    Parameters
    ----------
    txt : string
        Text to search with REGEX_ARXIV_ID

    Returns
    -------
    out : (string, string)
        The text with the stamp removed and the stamp itself. Without a match
        the text is returned unchanged together with an empty string.
    """
    found = REGEX_ARXIV_ID.search(txt)
    if found is None:
        return txt, ''

    begin, end = found.start(), found.end()
    before = txt[:begin].strip()
    stamp = txt[begin:end].strip()
    after = txt[end:].strip()

    # the two surrounding pieces are always glued with exactly one space,
    # even when one of them is empty
    return before + ' ' + after, stamp


def remove_stamp(txt, split=1000):
    """
    Given full text, remove the stamp placed in the pdf by arxiv itself. This
    deserves a bit of consideration since the stamp often becomes mangled by
    the text extraction tool (i.e. hard to find and replace) and can be
    reversed.

    Only the first `split` characters are searched, first as they are and, if
    that finds nothing, character-reversed.

    Parameters
    ----------
    txt : string
        The full text of a document

    split : int
        Number of leading characters of `txt` in which to look for the stamp

    Returns
    -------
    out : string
        Full text without stamp. When no stamp is found `txt` is returned
        unchanged.
    """
    head, rest = txt[:split], txt[split:]

    # _extract_arxiv_stamp strips the text around the stamp, which would drop
    # whitespace sitting right at the split point and fuse the last word of
    # the head with the first word of the remainder; remember it here.
    boundary_ws = head[len(head.rstrip()):]

    cleaned, stamp = _extract_arxiv_stamp(head)
    if not stamp:
        # the stamp may have been extracted back to front: search the
        # reversed head and flip the cleaned text back afterwards
        cleaned, stamp = _extract_arxiv_stamp(head[::-1])
        cleaned = cleaned[::-1]

    if not stamp:
        return txt

    # restore the separator unless the cleaned head already ends in one
    if not cleaned[-1:].isspace():
        cleaned += boundary_ws
    return cleaned + rest