# -*- coding: utf-8 -*-
r"""Normalize typographic ligatures (and a few "fancy" punctuation marks) to ASCII.

List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature

MKB removed the following elements from the list:
- et    U+1F670
- ſs, ſz -> ẞ, ß  U+00DF

Additional notes:

* Some classes of characters were listed in the original utf8 fixes but
  I'm not sure they don't belong elsewhere (end user processing). In these
  cases, pass through unidecode should normalize them to proper ascii. They
  are listed here with reasoning:

  - Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf
    r'[\u0300-\u036F]': ''

  - Ditch chars that sometimes (incorrectly?) appear as combining diacritics
    r'(?:\xa8|[\u02C0-\u02DF])': ''

* Should we run ftfy?
"""
import re
import unicodedata

# (codepoint, replacement) for every single-character ligature we expand.
# Kept as explicit codepoints rather than a whitespace-delimited text table:
# the previous tab-separated format broke silently (empty mapping) whenever
# tabs were converted to spaces by an editor or copy/paste.
# The replacement side is NFKC-normalized at build time, so entries written
# with historical letters (e.g. long s in 'ſt') come out in modern form.
_LIGATURES = (
    (0xA732, 'AA'), (0xA733, 'aa'),   # LATIN (CAPITAL|SMALL) LETTER AA
    (0x00C6, 'AE'), (0x00E6, 'ae'),   # AE / ash
    (0xA734, 'AO'), (0xA735, 'ao'),
    (0xA736, 'AU'), (0xA737, 'au'),
    (0xA738, 'AV'), (0xA739, 'av'),
    (0xA73A, 'AV'), (0xA73B, 'av'),   # AV with horizontal bar
    (0xA73C, 'AY'), (0xA73D, 'ay'),
    (0xFB00, 'ff'),
    (0xFB03, 'ffi'),
    (0xFB04, 'ffl'),
    (0xFB01, 'fi'),
    (0xFB02, 'fl'),
    (0x0152, 'OE'), (0x0153, 'oe'),
    (0xA74E, 'OO'), (0xA74F, 'oo'),
    (0xFB06, 'st'),
    (0xFB05, '\u017Ft'),              # latin small ligature long s t; NFKC -> 'st'
    (0xA728, 'TZ'), (0xA729, 'tz'),
    (0x1D6B, 'ue'),
    (0xA760, 'VY'), (0xA761, 'vy'),
    (0x0238, 'db'),
    (0x02A3, 'dz'),
    (0x02A5, 'd\u0291'),              # dz digraph with curl
    (0x02A4, 'd\u0292'),              # dezh digraph
    (0x02A9, 'f\u014B'),              # feng digraph
    (0x0132, 'IJ'), (0x0133, 'ij'),
    (0x02AA, 'ls'),
    (0x02AB, 'lz'),
    (0x026E, 'l\u0292'),              # lezh
    (0x0239, 'qp'),
    (0x02A8, 't\u0255'),              # tc digraph with curl
    (0x02A6, 'ts'),
    (0x02A7, 't\u0283'),              # tesh digraph
    (0xAB50, 'ui'),                   # ui ligature
    (0xAB51, 'ui'),                   # turned ui ligature
)

# Backward-compatible, human-readable rendering of the mapping (one
# "replacement<TAB>ligature<TAB>U+XXXX" row per entry). Earlier versions of
# this module exposed a similar table and parsed the mapping out of it.
ligature_table = '\n'.join(
    '{}\t{}\tU+{:04X}'.format(repl, chr(cp), cp) for cp, repl in _LIGATURES
)

# Keys are regex patterns (single ligature characters are regex-safe:
# they are all letters, never metacharacters); values are replacements.
unicode_mapping = {
    chr(cp): unicodedata.normalize('NFKC', repl) for cp, repl in _LIGATURES
}

unicode_mapping.update({
    # 'ẞ, ß': careful, some use this for \beta.
    # \B restricts the match so a word-initial ß (likely math) is left alone.
    r'(\B)\u00DF': r'\1ss',
    # Additions (manual normalization that we feel is important)
    # unicode space u'\xa0' (not \x{0c} = ^L keep!)
    '\xa0': ' ',
    # single + double quotes, dash, and asterisk
    r'[\u2018\u2019]': r"'",
    r'[\u201C\u201D]': r'"',
    r'[\xad\u2014]': r'-',
    r'\xb7': r'*'
})


def fix_unicode(txt: str) -> str:
    """
    Given UTF-8 encoded text, remove typographical ligatures (normalize to
    true non-display character set) and do a general normalization of the
    unicode so that possible redundant characters are simplified to a
    single set.

    Parameters
    ----------
    txt : unicode string

    Returns
    -------
    output : unicode string
    """
    # The mapping is read at call time on purpose, so callers may extend
    # unicode_mapping before use. re's internal pattern cache makes the
    # repeated re.sub calls cheap.
    for search, replace in unicode_mapping.items():
        txt = re.sub(search, replace, txt)
    # Final NFKC pass folds any remaining compatibility characters.
    return unicodedata.normalize('NFKC', txt)