# -*- coding: utf-8 -*-
import re
import unicodedata
| """ | |
| List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature | |
| MKB removed the following elements from the list: | |
| - et 🙰 U+1F670 🙰 | |
| - ſs, ſz ẞ, ß U+00DF ß | |
| Additional notes: | |
| * Some classes of characters were listed in the original utf8 fixes but I'm not | |
| sure they don't belong elsewhere (end user processing). In these cases, pass | |
| through unidecode should normalize them to proper ascii. They are listed here | |
| with reasoning: | |
| - Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf | |
| r'[\u0300-\u036F]': '' | |
| - Ditch chars that sometimes (incorrectly?) appear as combining diacritics | |
| r'(?:\xa8|[\u02C0-\u02DF])': '' | |
| * Should we run ftfy? | |
| """ | |
ligature_table = """
AA, aa	Ꜳ, ꜳ	U+A732, U+A733	Ꜳ ꜳ
AE, ae	Æ, æ	U+00C6, U+00E6	Æ æ
AO, ao	Ꜵ, ꜵ	U+A734, U+A735	Ꜵ ꜵ
AU, au	Ꜷ, ꜷ	U+A736, U+A737	Ꜷ ꜷ
AV, av	Ꜹ, ꜹ	U+A738, U+A739	Ꜹ ꜹ
AV, av	Ꜻ, ꜻ	U+A73A, U+A73B	Ꜻ ꜻ
AY, ay	Ꜽ, ꜽ	U+A73C, U+A73D	Ꜽ ꜽ
ff	ﬀ	U+FB00	ﬀ
ffi	ﬃ	U+FB03	ﬃ
ffl	ﬄ	U+FB04	ﬄ
fi	ﬁ	U+FB01	ﬁ
fl	ﬂ	U+FB02	ﬂ
OE, oe	Œ, œ	U+0152, U+0153	Œ œ
OO, oo	Ꝏ, ꝏ	U+A74E, U+A74F	Ꝏ ꝏ
st	ﬆ	U+FB06	ﬆ
ſt	ﬅ	U+FB05	ﬅ
TZ, tz	Ꜩ, ꜩ	U+A728, U+A729	Ꜩ ꜩ
ue	ᵫ	U+1D6B	ᵫ
VY, vy	Ꝡ, ꝡ	U+A760, U+A761	Ꝡ ꝡ
db	ȸ	U+0238	ȸ
dz	ʣ	U+02A3	ʣ
dʑ	ʥ	U+02A5	ʥ
dʒ	ʤ	U+02A4	ʤ
fŋ	ʩ	U+02A9	ʩ
IJ, ij	Ĳ, ĳ	U+0132, U+0133	Ĳ ĳ
ls	ʪ	U+02AA	ʪ
lz	ʫ	U+02AB	ʫ
lʒ	ɮ	U+026E	ɮ
qp	ȹ	U+0239	ȹ
tɕ	ʨ	U+02A8	ʨ
ts	ʦ	U+02A6	ʦ
tʃ	ʧ	U+02A7	ʧ
ui	ꭐ	U+AB50	ꭐ
ui	ꭑ	U+AB51	ꭑ
"""

unicode_mapping = {}
for row in ligature_table.split('\n'):
    if row.count('\t') <= 1:
        continue
    unicode_mapping.update(
        {
            u.strip(): unicodedata.normalize('NFKC', a.strip())
            for a, u in zip(*[c.split(',') for c in row.split('\t')[:2]])
        }
    )
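# Each qualifying row yields ligature -> ASCII pairs such as
# {'Ꜳ': 'AA', 'ꜳ': 'aa'}: the first two tab columns are split on ',' and
# zipped so the upper- and lowercase forms pair up.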

unicode_mapping.update({
    # 'ẞ, ß': careful, some authors use ß for \beta
    r'(\B)\u00DF': r'\1ss',
    # Additions (manual normalizations that we feel are important)
    # non-breaking space u'\xa0' (but not \x0c = ^L; keep that one!)
    '\xa0': ' ',
    # single + double quotes, dash, and asterisk
    r'[\u2018\u2019]': r"'",
    r'[\u201C\u201D]': r'"',
    r'[\xad\u2014]': r'-',
    r'\xb7': r'*',
})
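
# Every key above is applied as a regular expression by fix_unicode(), so the
# literal ligature characters from the table and the regex patterns here can
# share one mapping. The \B guard on \u00DF means a standalone ß (possibly a
# stray beta) is left alone; only a mid-word ß becomes 'ss'.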

def fix_unicode(txt: str) -> str:
    """
    Given UTF-8 encoded text, remove typographical ligatures (normalize them
    to their plain multi-character equivalents) and apply a general Unicode
    normalization so that redundant characters are simplified to a single,
    consistent set.

    Parameters
    ----------
    txt : unicode string

    Returns
    -------
    output : unicode string
    """
    for search, replace in unicode_mapping.items():
        txt = re.subn(search, replace, txt)[0]
    return unicodedata.normalize('NFKC', txt)
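
# A minimal usage sketch (illustrative only; the sample strings are
# assumptions, not fixtures from this module) exercising the ligature,
# quote, space, and ß handling end to end.
if __name__ == '__main__':
    samples = [
        'diﬃcult oﬄine ﬁx',        # f-ligatures expand to plain ASCII
        '\u201Csmart\u201D and \u2018curly\u2019 quotes',  # quotes straightened
        'hard\xa0space',            # NBSP becomes a regular space
        'Stra\u00DFe vs. \u00DF',   # ß -> 'ss' only mid-word (the \B guard)
    ]
    for s in samples:
        print(repr(s), '->', repr(fix_unicode(s)))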