import re SPACE_DIGIT = r'\s*\d\s*' SPACE_NUMBER = r'(?:{})+'.format(SPACE_DIGIT) SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*' SPACE_WORD = r'(?:{})+'.format(SPACE_CHAR) # old style ID, 7 digits in a row RE_NUM_OLD = SPACE_DIGIT*7 # new style ID, 4 digits, ., 4,5 digits RE_NUM_NEW = ( SPACE_DIGIT*4 + r'\.' + SPACE_DIGIT*4 + r'(?:{})?'.format(SPACE_DIGIT) ) # the version part v1 V2 v 1, etc RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?' # the word arxiv, as printed by the autotex, arXiv RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*' # any words within square brackets [cs.A I] RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD) # two digit date, month, year "29 Jan 2012" RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){}'.format(SPACE_DIGIT, '{2,4}') # the full identifier for the banner RE_ARXIV_ID = ( RE_ARXIV + r'(?:' + r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) + r')' + RE_VERSION + RE_CATEGORIES + RE_DATE ) REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID) def _extract_arxiv_stamp(txt): """ Find location of stamp within the text and remove that section """ match = REGEX_ARXIV_ID.search(txt) if not match: return txt, '' s, e = match.span() return '{} {}'.format(txt[:s].strip(), txt[e:].strip()), txt[s:e].strip() def remove_stamp(txt, split=1000): """ Given full text, remove the stamp placed in the pdf by arxiv itself. This deserves a bit of consideration since the stamp often becomes mangled by the text extraction tool (i.e. hard to find and replace) and can be reversed. Parameters ---------- txt : string The full text of a document Returns ------- out : string Full text without stamp """ t0, t1 = txt[:split], txt[split:] txt0, stamp0 = _extract_arxiv_stamp(t0) txt1, stamp1 = _extract_arxiv_stamp(t0[::-1]) if stamp0: return txt0 + t1 elif stamp1: return txt1[::-1] + t1 else: return txt