File size: 3,103 Bytes
a8d4e3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
import re
import unicodedata

"""
List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature
MKB removed the following elements from the list:
      - et	🙰	U+1F670	🙰
      - ſs, ſz	ẞ, ß	U+00DF	ß

Additional notes:
* Some classes of characters were listed in the original utf8 fixes but I'm not
  sure they don't belong elsewhere (end user processing). In these cases, pass
  through unidecode should normalize them to proper ascii. They are listed here
  with reasoning:

  - Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf
    r'[\u0300-\u036F]': ''

  - Ditch chars that sometimes (incorrectly?) appear as combining diacritics
    r'(?:\xa8|[\u02C0-\u02DF])': ''

* Should we run ftfy?
"""

# Tab-separated table: ascii replacement(s) | ligature code point(s) | U+ names | display.
# Only the first two columns are parsed; the rest are for human reference.
ligature_table = """
AA, aa	Ꜳ, ꜳ	U+A732, U+A733	Ꜳ ꜳ
AE, ae	Æ, æ	U+00C6, U+00E6	Æ æ
AO, ao	Ꜵ, ꜵ	U+A734, U+A735	Ꜵ ꜵ
AU, au	Ꜷ, ꜷ	U+A736, U+A737	Ꜷ ꜷ
AV, av	Ꜹ, ꜹ	U+A738, U+A739	Ꜹ ꜹ
AV, av 	Ꜻ, ꜻ	U+A73A, U+A73B	Ꜻ ꜻ
AY, ay	Ꜽ, ꜽ	U+A73C, U+A73D	Ꜽ ꜽ
ff	ﬀ	U+FB00	ﬀ
ffi	ﬃ	U+FB03	ﬃ
ffl	ﬄ	U+FB04	ﬄ
fi	ﬁ	U+FB01	ﬁ
fl	ﬂ	U+FB02	ﬂ
OE, oe	Œ, œ	U+0152, U+0153	Œ œ
OO, oo	Ꝏ, ꝏ	U+A74E, U+A74F	Ꝏ ꝏ
st	ﬆ	U+FB06	ﬆ
ſt	ﬅ	U+FB05	ﬅ
TZ, tz	Ꜩ, ꜩ	U+A728, U+A729	Ꜩ ꜩ
ue	ᵫ	U+1D6B	ᵫ
VY, vy	Ꝡ, ꝡ	U+A760, U+A761	Ꝡ ꝡ
db	ȸ	U+0238	ȸ
dz	ʣ	U+02A3	ʣ
dʑ 	ʥ	U+02A5	ʥ
dʒ 	ʤ	U+02A4	ʤ
fŋ 	ʩ	U+02A9	ʩ
IJ, ij	Ĳ, ĳ	U+0132, U+0133	Ĳ ĳ
ls	ʪ	U+02AA	ʪ
lz	ʫ	U+02AB	ʫ
lʒ 	ɮ	U+026E	ɮ
qp	ȹ	U+0239	ȹ
tɕ 	ʨ	U+02A8	ʨ
ts	ʦ	U+02A6	ʦ
tʃ 	ʧ	U+02A7	ʧ
ui	ꭐ	U+AB50	ꭐ
ui	ꭑ	U+AB51	ꭐ
"""

# Maps each ligature code point (or regex pattern, below) to its ascii
# replacement text.  Insertion order matters only in that fix_unicode()
# applies the patterns in this order.
unicode_mapping = {}

for line in ligature_table.splitlines():
    cells = line.split('\t')
    # Data rows have at least three tab-separated cells; skip anything else
    # (blank lines, malformed rows).
    if len(cells) < 3:
        continue

    # Column 0 holds the ascii form(s), column 1 the ligature form(s);
    # both may list several comma-separated variants in lockstep.
    ascii_forms = cells[0].split(',')
    ligature_forms = cells[1].split(',')
    for plain, ligature in zip(ascii_forms, ligature_forms):
        unicode_mapping[ligature.strip()] = unicodedata.normalize('NFKC', plain.strip())

# Manual additions (normalizations we feel are important).
# 'ẞ, ß': careful, some use this for \beta — only replace mid-word.
unicode_mapping[r'(\B)\u00DF'] = r'\1ss'
# unicode space  u'\xa0'  (not \x{0c} = ^L keep!)
unicode_mapping['\xa0'] = ' '
# single + double quotes, dash, and asterisk
unicode_mapping[r'[\u2018\u2019]'] = r"'"
unicode_mapping[r'[\u201C\u201D]'] = r'"'
unicode_mapping[r'[\xad\u2014]'] = r'-'
unicode_mapping[r'\xb7'] = r'*'


def fix_unicode(txt: str) -> str:
    """
    Normalize a Unicode string: replace typographic ligatures and other
    problematic characters using ``unicode_mapping``, then apply NFKC
    normalization so that redundant code points are simplified to a
    single canonical form.

    Parameters
    ----------
    txt : str
        Input text (a decoded ``str``, not raw UTF-8 bytes).

    Returns
    -------
    str
        The normalized text.
    """
    # Each unicode_mapping key is a regex pattern; apply them in insertion
    # order (ligature table entries first, then the manual fixes).
    # re.sub replaces re.subn(...)[0] — the substitution count was unused.
    for pattern, replacement in unicode_mapping.items():
        txt = re.sub(pattern, replacement, txt)
    # Final NFKC pass collapses any remaining compatibility characters.
    return unicodedata.normalize('NFKC', txt)