Spaces:

sidphbot
/

Researcher

Build error

App Files Files Community

Researcher / arxiv_public_data /pdfstamp.py

sidphbot

spaces init

a8d4e3d about 2 years ago

raw history blame

No virus

1.99 kB

	import re

	# The stamp is recovered from PDF text extraction, which may insert
	# whitespace between any two characters -- or none at all.  Every building
	# block therefore allows *optional* whitespace (``\s*``) on both sides of a
	# character.  Requiring whitespace (``\s``) would reject an ordinary stamp
	# such as "arXiv:1201.1234v1 [cs.AI] 29 Jan 2012".

	# a single digit, and a run of one or more digits
	SPACE_DIGIT = r'\s*\d\s*'
	SPACE_NUMBER = r'(?:{})+'.format(SPACE_DIGIT)

	# a single letter / dot / hyphen, and a run of one or more of them
	SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*'
	SPACE_WORD = r'(?:{})+'.format(SPACE_CHAR)

	# old style ID, 7 digits in a row
	RE_NUM_OLD = SPACE_DIGIT*7

	# new style ID, 4 digits, ., 4,5 digits
	RE_NUM_NEW = (
	    SPACE_DIGIT*4 +
	    r'\.' +
	    SPACE_DIGIT*4 + r'(?:{})?'.format(SPACE_DIGIT)
	)

	# the version part v1 V2 v 1, etc (optional as a whole)
	RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?'

	# the word arxiv, as printed by the autotex, arXiv
	RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*'

	# any words within square brackets [cs.A I]
	RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD)

	# two digit date, month, year "29 Jan 2012"
	RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){}'.format(SPACE_DIGIT, '{2,4}')

	# the full identifier for the banner:
	#   arXiv: <new-style ID or old-style ID> [version] [categories] date
	RE_ARXIV_ID = (
	    RE_ARXIV +
	    r'(?:' +
	    # unescaped "|" = alternation; an escaped "\|" would demand a literal
	    # pipe character between a new-style and an old-style ID
	    r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) +
	    r')' +
	    RE_VERSION +
	    RE_CATEGORIES +
	    RE_DATE
	)

	# compiled once at import time; used by the stamp-extraction helpers
	REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID)


	def _extract_arxiv_stamp(txt):
	    """
	    Locate the first arXiv stamp in ``txt`` and cut it out.

	    Parameters
	    ----------
	    txt : string
	        Text to search with ``REGEX_ARXIV_ID``

	    Returns
	    -------
	    out : (string, string)
	        The text with the stamp removed and the stamp itself. Without a
	        match the text is returned unchanged and the stamp is ''. With a
	        match, the stripped text before and after the stamp are joined by
	        a single space, and the stamp is stripped as well.
	    """
	    found = REGEX_ARXIV_ID.search(txt)
	    if found is None:
	        return txt, ''

	    start, stop = found.start(), found.end()
	    before = txt[:start].strip()
	    after = txt[stop:].strip()
	    stamp = txt[start:stop].strip()
	    return before + ' ' + after, stamp


	def remove_stamp(txt, split=1000):
	    """
	    Given full text, remove the stamp placed in the pdf by arxiv itself. This
	    deserves a bit of consideration since the stamp often becomes mangled by
	    the text extraction tool (i.e. hard to find and replace) and can be
	    reversed.

	    Only the first ``split`` characters are searched, first as-is and then,
	    if nothing is found, character-reversed.

	    Parameters
	    ----------
	    txt : string
	        The full text of a document
	    split : int, optional
	        Number of leading characters of ``txt`` searched for the stamp;
	        the remainder is passed through untouched (default 1000)

	    Returns
	    -------
	    out : string
	        Full text without stamp; ``txt`` itself when no stamp is found
	    """
	    head, tail = txt[:split], txt[split:]

	    # _extract_arxiv_stamp strips the ends of the text it returns.  When a
	    # tail follows, put back the whitespace that sat at the head/tail
	    # boundary, otherwise the last word of the head and the first word of
	    # the tail would be fused together.
	    boundary_ws = head[len(head.rstrip()):] if tail else ''

	    cleaned, stamp = _extract_arxiv_stamp(head)
	    if stamp:
	        return cleaned + boundary_ws + tail

	    # Only scan the reversed head when the forward scan found nothing.
	    cleaned_rev, stamp_rev = _extract_arxiv_stamp(head[::-1])
	    if stamp_rev:
	        # undo the reversal so the surviving text reads forwards again
	        return cleaned_rev[::-1] + boundary_ws + tail

	    return txt