Spaces:
Build error
Build error
import re

# Building blocks.  Text extracted from a PDF may have whitespace inserted
# around any character of the stamp, so every fragment tolerates optional
# whitespace on both sides of each character.

# a single digit
SPACE_DIGIT = r'\s*\d\s*'
# one or more digits
SPACE_NUMBER = r'(?:{})+'.format(SPACE_DIGIT)
# a single letter, dot or hyphen
SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*'
# one or more letters, dots or hyphens
SPACE_WORD = r'(?:{})+'.format(SPACE_CHAR)

# old style ID, 7 digits in a row
RE_NUM_OLD = SPACE_DIGIT * 7

# new style ID: 4 digits, a literal dot, then 4 or 5 digits
RE_NUM_NEW = (
    SPACE_DIGIT * 4 +
    r'\.' +
    SPACE_DIGIT * 4 + r'(?:{})?'.format(SPACE_DIGIT)
)

# the (optional) version part: v1 V2 v 1, etc
RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?'

# the word arxiv followed by a colon, as printed by the autotex: "arXiv:"
RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*'

# any words within square brackets, e.g. [cs.A I]
RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD)

# day, month name, year, e.g. "29 Jan 2012"; the year is 2 to 4 digits
RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){}'.format(SPACE_DIGIT, '{2,4}')

# the full identifier for the banner:
# "arXiv:" + (new or old ID) + optional version + [categories] + date
RE_ARXIV_ID = (
    RE_ARXIV +
    r'(?:' +
    r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) +
    r')' +
    RE_VERSION +
    RE_CATEGORIES +
    RE_DATE
)

# compiled once at import time; used to locate the stamp in extracted text
REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID)
def _extract_arxiv_stamp(txt):
    """
    Find location of stamp within the text and remove that section

    Parameters
    ----------
    txt : string
        Text to search for the first occurrence of the arXiv stamp

    Returns
    -------
    out : (string, string)
        ``(text, stamp)``. When no stamp is found, ``text`` is ``txt``
        unchanged and ``stamp`` is ``''``. Otherwise ``text`` is the
        whitespace-stripped text before the stamp, a single space, and the
        whitespace-stripped text after it; ``stamp`` is the matched section
        with surrounding whitespace stripped.
    """
    match = REGEX_ARXIV_ID.search(txt)
    if not match:
        return txt, ''

    s, e = match.span()
    # The single joining space keeps the words on either side of the removed
    # stamp from running together.
    return '{} {}'.format(txt[:s].strip(), txt[e:].strip()), txt[s:e].strip()
def remove_stamp(txt, split=1000):
    """
    Given full text, remove the stamp placed in the pdf by arxiv itself. This
    deserves a bit of consideration since the stamp often becomes mangled by
    the text extraction tool (i.e. hard to find and replace) and can be
    reversed.

    Parameters
    ----------
    txt : string
        The full text of a document

    split : int
        Number of leading characters of ``txt`` that are searched for the
        stamp; everything after that point is passed through untouched

    Returns
    -------
    out : string
        Full text without stamp. If no stamp is found, ``txt`` is returned
        unchanged.
    """
    head, tail = txt[:split], txt[split:]

    cleaned, stamp = _extract_arxiv_stamp(head)
    if not stamp:
        # The stamp can come out back to front, so retry on the mirrored
        # head -- only when the forward search found nothing.
        mirrored, stamp = _extract_arxiv_stamp(head[::-1])
        if not stamp:
            return txt
        cleaned = mirrored[::-1]

    # _extract_arxiv_stamp strips the whitespace at the end of the head, which
    # would glue the last word of the head onto the first word of the tail.
    # Put that whitespace back unless the cleaned head already ends in a
    # separator.
    if cleaned and not cleaned[-1].isspace():
        cleaned += head[len(head.rstrip()):]

    return cleaned + tail