# -*- coding: utf-8 -*-
import re
import unicodedata
| """ | |
| List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature | |
| MKB removed the following elements from the list: | |
| - et 🙰 U+1F670 🙰 | |
| - ſs, ſz ẞ, ß U+00DF ß | |
| Additional notes: | |
| * Some classes of characters were listed in the original utf8 fixes but I'm not | |
| sure they don't belong elsewhere (end user processing). In these cases, pass | |
| through unidecode should normalize them to proper ascii. They are listed here | |
| with reasoning: | |
| - Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf | |
| r'[\u0300-\u036F]': '' | |
| - Ditch chars that sometimes (incorrectly?) appear as combining diacritics | |
| r'(?:\xa8|[\u02C0-\u02DF])': '' | |
| * Should we run ftfy? | |
| """ | |
ligature_table = """
AA, aa	Ꜳ, ꜳ	U+A732, U+A733	Ꜳ ꜳ
AE, ae	Æ, æ	U+00C6, U+00E6	Æ æ
AO, ao	Ꜵ, ꜵ	U+A734, U+A735	Ꜵ ꜵ
AU, au	Ꜷ, ꜷ	U+A736, U+A737	Ꜷ ꜷ
AV, av	Ꜹ, ꜹ	U+A738, U+A739	Ꜹ ꜹ
AV, av	Ꜻ, ꜻ	U+A73A, U+A73B	Ꜻ ꜻ
AY, ay	Ꜽ, ꜽ	U+A73C, U+A73D	Ꜽ ꜽ
ff	ﬀ	U+FB00	ﬀ
ffi	ﬃ	U+FB03	ﬃ
ffl	ﬄ	U+FB04	ﬄ
fi	ﬁ	U+FB01	ﬁ
fl	ﬂ	U+FB02	ﬂ
OE, oe	Œ, œ	U+0152, U+0153	Œ œ
OO, oo	Ꝏ, ꝏ	U+A74E, U+A74F	Ꝏ ꝏ
st	ﬆ	U+FB06	ﬆ
ſt	ﬅ	U+FB05	ﬅ
TZ, tz	Ꜩ, ꜩ	U+A728, U+A729	Ꜩ ꜩ
ue	ᵫ	U+1D6B	ᵫ
VY, vy	Ꝡ, ꝡ	U+A760, U+A761	Ꝡ ꝡ
db	ȸ	U+0238	ȸ
dz	ʣ	U+02A3	ʣ
dʑ	ʥ	U+02A5	ʥ
dʒ	ʤ	U+02A4	ʤ
fŋ	ʩ	U+02A9	ʩ
IJ, ij	Ĳ, ĳ	U+0132, U+0133	Ĳ ĳ
ls	ʪ	U+02AA	ʪ
lz	ʫ	U+02AB	ʫ
lʒ	ɮ	U+026E	ɮ
qp	ȹ	U+0239	ȹ
tɕ	ʨ	U+02A8	ʨ
ts	ʦ	U+02A6	ʦ
tʃ	ʧ	U+02A7	ʧ
ui	ꭐ	U+AB50	ꭐ
ui	ꭑ	U+AB51	ꭑ
"""

unicode_mapping = {}
for row in ligature_table.split('\n'):
    if row.count('\t') <= 1:
        continue
    unicode_mapping.update(
        {
            u.strip(): unicodedata.normalize('NFKC', a.strip())
            for a, u in zip(*[c.split(',') for c in row.split('\t')[:2]])
        }
    )
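# Each qualifying row yields ligature -> ASCII pairs such as
# {'Ꜳ': 'AA', 'ꜳ': 'aa'}: the first two tab columns are split on ',' and
# zipped so the upper- and lowercase forms pair up.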

unicode_mapping.update({
    # 'ẞ, ß': careful, some authors use ß for \beta
    r'(\B)\u00DF': r'\1ss',
    # Additions (manual normalizations that we feel are important)
    # non-breaking space u'\xa0' (but not \x0c = ^L; keep that one!)
    '\xa0': ' ',
    # single + double quotes, dash, and asterisk
    r'[\u2018\u2019]': r"'",
    r'[\u201C\u201D]': r'"',
    r'[\xad\u2014]': r'-',
    r'\xb7': r'*',
})
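
# Every key above is applied as a regular expression by fix_unicode(), so the
# literal ligature characters from the table and the regex patterns here can
# share one mapping. The \B guard on \u00DF means a standalone ß (possibly a
# stray beta) is left alone; only a mid-word ß becomes 'ss'.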

def fix_unicode(txt: str) -> str:
    """
    Given UTF-8 encoded text, remove typographical ligatures (normalize them
    to their plain multi-character equivalents) and apply a general Unicode
    normalization so that redundant characters are simplified to a single,
    consistent set.

    Parameters
    ----------
    txt : unicode string

    Returns
    -------
    output : unicode string
    """
    for search, replace in unicode_mapping.items():
        txt = re.subn(search, replace, txt)[0]
    return unicodedata.normalize('NFKC', txt)
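
# A minimal usage sketch (illustrative only; the sample strings are
# assumptions, not fixtures from this module) exercising the ligature,
# quote, space, and ß handling end to end.
if __name__ == '__main__':
    samples = [
        'diﬃcult oﬄine ﬁx',        # f-ligatures expand to plain ASCII
        '\u201Csmart\u201D and \u2018curly\u2019 quotes',  # quotes straightened
        'hard\xa0space',            # NBSP becomes a regular space
        'Stra\u00DFe vs. \u00DF',   # ß -> 'ss' only mid-word (the \B guard)
    ]
    for s in samples:
        print(repr(s), '->', repr(fix_unicode(s)))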