Spaces:
Sleeping
Sleeping
cointegrated
committed on
Commit
•
7a5c5b0
1
Parent(s):
6f08302
apply the "less diacritics" rules for lat -> cyr
Browse files
- myv_translit.py +20 -1
- test_translit.py +17 -0
myv_translit.py
CHANGED
@@ -138,12 +138,28 @@ _cyr2lat_first_e = [
|
|
138 |
{'find_what': '\\bě', 'replacer': 'e', 're': True},
|
139 |
]
|
140 |
|
|
|
|
|
|
|
|
|
|
|
141 |
_cyr2lat_soft_l_after_vowels = [
|
142 |
# joint acutes | disjoint acutes
|
143 |
{'find_what': '([iěeIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
|
144 |
{'find_what': '([iěeIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
|
145 |
]
|
146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
_lat2cyr = [
|
148 |
{'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
|
149 |
{'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
|
@@ -299,8 +315,11 @@ def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels
|
|
299 |
|
300 |
|
301 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
|
302 |
-
|
|
|
303 |
text = transliterate_with_rules(text, _lat2cyr)
|
|
|
|
|
304 |
text = transliterate_with_rules(text, _lat2cyr_special_cases)
|
305 |
return text
|
306 |
|
|
|
138 |
{'find_what': '\\bě', 'replacer': 'e', 're': True},
|
139 |
]
|
140 |
|
141 |
+
_lat2cyr_first_e = [
|
142 |
+
{'find_what': '\\bE', 'replacer': 'Ě', 're': True},
|
143 |
+
{'find_what': '\\be', 'replacer': 'ě', 're': True},
|
144 |
+
]
|
145 |
+
|
146 |
_cyr2lat_soft_l_after_vowels = [
|
147 |
# joint acutes | disjoint acutes
|
148 |
{'find_what': '([iěeIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
|
149 |
{'find_what': '([iěeIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
|
150 |
]
|
151 |
|
152 |
+
_lat2cyr_soft_l_after_vowels = [
|
153 |
+
# add the soft sign, but only if the next letter is not softening
|
154 |
+
{'find_what': '([иэеИЭЕ])(Л)\\b', 'replacer': '\\1Ль', 're': True},
|
155 |
+
{'find_what': '([иэеИЭЕ])(л)\\b', 'replacer': '\\1ль', 're': True},
|
156 |
+
{'find_what': '([иэеИЭЕ])(Л)([^ьъиеюяю])', 'replacer': '\\1ЛЬ\\3', 're': True},
|
157 |
+
{'find_what': '([иэеИЭЕ])(л)([^ьъиеюяю])', 'replacer': '\\1ль\\3', 're': True},
|
158 |
+
# special cases when L is still hard
|
159 |
+
# todo: fix all the exclusions from the list in https://t.me/ravo_club/9776
|
160 |
+
{'find_what': '([иэеИЭЕ][Лл])([Ьь])(ГАД|ГАВТ|гад|гавт)', 'replacer': '\\1\\3', 're': True},
|
161 |
+
]
|
162 |
+
|
163 |
_lat2cyr = [
|
164 |
{'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
|
165 |
{'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
|
|
|
315 |
|
316 |
|
317 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
|
318 |
+
if not first_e_with_hacek:
|
319 |
+
text = transliterate_with_rules(text, _lat2cyr_first_e)
|
320 |
text = transliterate_with_rules(text, _lat2cyr)
|
321 |
+
if not soft_l_after_vowels:
|
322 |
+
text = transliterate_with_rules(text, _lat2cyr_soft_l_after_vowels)
|
323 |
text = transliterate_with_rules(text, _lat2cyr_special_cases)
|
324 |
return text
|
325 |
|
test_translit.py
CHANGED
@@ -53,6 +53,23 @@ def test_consistency():
|
|
53 |
line_cyr2 = lat2cyr(line_lat)
|
54 |
assert line_cyr == line_cyr2
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
def test_zontik():
|
58 |
with open('examples/zontik_cyr.txt', 'r') as f:
|
|
|
53 |
line_cyr2 = lat2cyr(line_lat)
|
54 |
assert line_cyr == line_cyr2
|
55 |
|
56 |
+
for line_cyr in lines:
|
57 |
+
line_lat = cyr2lat(line_cyr, joint_acute=False)
|
58 |
+
line_cyr2 = lat2cyr(line_lat, joint_acute=False)
|
59 |
+
assert line_cyr == line_cyr2
|
60 |
+
|
61 |
+
for line_cyr in lines:
|
62 |
+
line_lat = cyr2lat(line_cyr, soft_l_after_vowels=False)
|
63 |
+
line_cyr2 = lat2cyr(line_lat, soft_l_after_vowels=False)
|
64 |
+
assert line_cyr == line_cyr2
|
65 |
+
|
66 |
+
for line_cyr in lines:
|
67 |
+
if ' ежос' in line_cyr: # normally, this does not happen in the Erzya language
|
68 |
+
continue
|
69 |
+
line_lat = cyr2lat(line_cyr, first_e_with_hacek=False)
|
70 |
+
line_cyr2 = lat2cyr(line_lat, first_e_with_hacek=False)
|
71 |
+
assert line_cyr == line_cyr2
|
72 |
+
|
73 |
|
74 |
def test_zontik():
|
75 |
with open('examples/zontik_cyr.txt', 'r') as f:
|