Spaces:
Sleeping
Sleeping
cointegrated
committed on
Commit
•
1cfde26
1
Parent(s):
d8067a5
The first version of the app
Browse files- .gitignore +1 -0
- README.md +1 -1
- app.py +43 -0
- myv_translit.py +268 -0
- test_translit.py +23 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.idea
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 📈
|
4 |
colorFrom: gray
|
5 |
colorTo: green
|
|
|
1 |
---
|
2 |
+
title: Erzya Translit
|
3 |
emoji: 📈
|
4 |
colorFrom: gray
|
5 |
colorTo: green
|
app.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from myv_translit import lat2cyr, cyr2lat
|
4 |
+
|
5 |
+
|
6 |
+
def transliterator(input_text, direction_to_latn=1, joint_acute=True, not_first_e_with_hacek=False, not_soft_l_after_vowels=True):
    """Transliterate ``input_text`` in the requested direction.

    Args:
        input_text: Text to convert.
        direction_to_latn: Truthy selects ``cyr2lat`` (Cyrillic -> Latin);
            falsy selects ``lat2cyr`` (Latin -> Cyrillic).
        joint_acute: Forwarded unchanged to the converter.
        not_first_e_with_hacek: Negated, then forwarded as
            ``first_e_with_hacek``.
        not_soft_l_after_vowels: Negated, then forwarded as
            ``soft_l_after_vowels``.

    Returns:
        Whatever the selected converter returns for ``input_text``.
    """
    # The two "not_*" parameters are phrased negatively, while the converters
    # expect the positive form, so both are flipped here.
    options = dict(
        joint_acute=joint_acute,
        first_e_with_hacek=not not_first_e_with_hacek,
        soft_l_after_vowels=not not_soft_l_after_vowels,
    )
    convert = cyr2lat if direction_to_latn else lat2cyr
    return convert(input_text, **options)
|
14 |
+
|
15 |
+
|
16 |
+
# Descriptive text passed to gr.Interface as `article`.
# English gist of the Russian text: "This is an automatic transliterator
# between Cyrillic and Latin script for the Erzya language. It is based on
# Mikhail Potapov's algorithm:" followed by two reference links.
# Fix: the language name was misspelled ("эрянского" -> "эрзянского").
article = """
Это автоматический транслитератор между кириллицей и латиницей для эрзянского языка.

В основе - алгоритм Михаила Потапова:
- https://github.com/potapoff271083/automatic_translation_latin_to_cyrillic
- http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
"""
|
23 |
+
|
24 |
+
# Radio choices. The Radio widget below is created with type="index", so the
# callback receives the position in this list: 0 for 'lat -> кир' and 1 for
# 'кир -> lat'.
directions = ['lat -> кир', 'кир -> lat']


# The input widgets are listed in the same order as the positional parameters
# of transliterator(): text, direction, joint_acute, not_first_e_with_hacek,
# not_soft_l_after_vowels.
interface = gr.Interface(
    fn=transliterator,
    inputs=[
        gr.Textbox(label="Text", lines=2, placeholder='text to transliterate'),
        gr.Radio(choices=directions, type="index", interactive=True, value=directions[0]),
        gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
        gr.Checkbox(value=False, label='ěrzä -> erzä'),
        gr.Checkbox(value=False, label='peĺks -> pelks'),
    ],
    outputs="text",
    title='Эрзянь транслитератор',
    article=article,
)


if __name__ == '__main__':
    # Start the app only when this file is executed as a script.
    interface.launch()
|
myv_translit.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
|
4 |
+
_cyr2lat = [
|
5 |
+
{'find_what': 'А', 'replacer': 'A', 're': False},
|
6 |
+
{'find_what': 'а', 'replacer': 'a', 're': False},
|
7 |
+
{'find_what': 'О', 'replacer': 'O', 're': False},
|
8 |
+
{'find_what': 'о', 'replacer': 'o', 're': False},
|
9 |
+
{'find_what': 'У', 'replacer': 'U', 're': False},
|
10 |
+
{'find_what': 'у', 'replacer': 'u', 're': False},
|
11 |
+
{'find_what': 'Ы', 'replacer': 'Y', 're': False},
|
12 |
+
{'find_what': 'ы', 'replacer': 'y', 're': False},
|
13 |
+
{'find_what': 'И', 'replacer': 'I', 're': False},
|
14 |
+
{'find_what': 'и', 'replacer': 'i', 're': False},
|
15 |
+
{'find_what': 'Е', 'replacer': 'E', 're': False},
|
16 |
+
{'find_what': 'е', 'replacer': 'e', 're': False},
|
17 |
+
{'find_what': 'Б', 'replacer': 'B', 're': False},
|
18 |
+
{'find_what': 'б', 'replacer': 'b', 're': False},
|
19 |
+
{'find_what': 'В', 'replacer': 'V', 're': False},
|
20 |
+
{'find_what': 'в', 'replacer': 'v', 're': False},
|
21 |
+
{'find_what': 'Г', 'replacer': 'G', 're': False},
|
22 |
+
{'find_what': 'г', 'replacer': 'g', 're': False},
|
23 |
+
{'find_what': 'Д', 'replacer': 'D', 're': False},
|
24 |
+
{'find_what': 'д', 'replacer': 'd', 're': False},
|
25 |
+
{'find_what': 'З', 'replacer': 'Z', 're': False},
|
26 |
+
{'find_what': 'з', 'replacer': 'z', 're': False},
|
27 |
+
{'find_what': 'К', 'replacer': 'K', 're': False},
|
28 |
+
{'find_what': 'к', 'replacer': 'k', 're': False},
|
29 |
+
{'find_what': 'Л', 'replacer': 'L', 're': False},
|
30 |
+
{'find_what': 'л', 'replacer': 'l', 're': False},
|
31 |
+
{'find_what': 'М', 'replacer': 'M', 're': False},
|
32 |
+
{'find_what': 'м', 'replacer': 'm', 're': False},
|
33 |
+
{'find_what': 'Н', 'replacer': 'N', 're': False},
|
34 |
+
{'find_what': 'н', 'replacer': 'n', 're': False},
|
35 |
+
{'find_what': 'П', 'replacer': 'P', 're': False},
|
36 |
+
{'find_what': 'п', 'replacer': 'p', 're': False},
|
37 |
+
{'find_what': 'Р', 'replacer': 'R', 're': False},
|
38 |
+
{'find_what': 'р', 'replacer': 'r', 're': False},
|
39 |
+
{'find_what': 'С', 'replacer': 'S', 're': False},
|
40 |
+
{'find_what': 'с', 'replacer': 's', 're': False},
|
41 |
+
{'find_what': 'Т', 'replacer': 'T', 're': False},
|
42 |
+
{'find_what': 'т', 'replacer': 't', 're': False},
|
43 |
+
{'find_what': 'Ф', 'replacer': 'F', 're': False},
|
44 |
+
{'find_what': 'ф', 'replacer': 'f', 're': False},
|
45 |
+
{'find_what': 'Х', 'replacer': 'H', 're': False},
|
46 |
+
{'find_what': 'х', 'replacer': 'h', 're': False},
|
47 |
+
{'find_what': 'Ц', 'replacer': 'C', 're': False},
|
48 |
+
{'find_what': 'ц', 'replacer': 'c', 're': False},
|
49 |
+
{'find_what': 'Ч', 'replacer': 'Č', 're': False},
|
50 |
+
{'find_what': 'ч', 'replacer': 'č', 're': False},
|
51 |
+
{'find_what': 'Ш', 'replacer': 'Š', 're': False},
|
52 |
+
{'find_what': 'ш', 'replacer': 'š', 're': False},
|
53 |
+
{'find_what': 'Ж', 'replacer': 'Ž', 're': False},
|
54 |
+
{'find_what': 'ж', 'replacer': 'ž', 're': False},
|
55 |
+
{'find_what': 'Щ', 'replacer': 'Čš', 're': False},
|
56 |
+
{'find_what': 'щ', 'replacer': 'čš', 're': False},
|
57 |
+
{'find_what': 'Ь', 'replacer': '́', 're': False},
|
58 |
+
{'find_what': 'ь', 'replacer': '́', 're': False},
|
59 |
+
{'find_what': 'Й', 'replacer': 'J', 're': False},
|
60 |
+
{'find_what': 'й', 'replacer': 'j', 're': False},
|
61 |
+
{'find_what': 'Ъ', 'replacer': '', 're': False},
|
62 |
+
{'find_what': 'ъ', 'replacer': '', 're': False},
|
63 |
+
{'find_what': 'Э', 'replacer': 'Ě', 're': False},
|
64 |
+
{'find_what': 'э', 'replacer': 'ě', 're': False},
|
65 |
+
{'find_what': 'Я', 'replacer': 'Ä', 're': False},
|
66 |
+
{'find_what': 'я', 'replacer': 'ä', 're': False},
|
67 |
+
{'find_what': 'Ю', 'replacer': 'Ü', 're': False},
|
68 |
+
{'find_what': 'ю', 'replacer': 'ü', 're': False},
|
69 |
+
{'find_what': 'Ё', 'replacer': 'Ö', 're': False},
|
70 |
+
{'find_what': 'ё', 'replacer': 'ö', 're': False},
|
71 |
+
{'find_what': '\\bö\\b', 'replacer': 'jo', 're': True},
|
72 |
+
{'find_what': '\\bÖ\\b', 'replacer': 'Jo', 're': True},
|
73 |
+
{'find_what': '\\bü\\b', 'replacer': 'ju', 're': True},
|
74 |
+
{'find_what': '\\bÜ\\b', 'replacer': 'Ju', 're': True},
|
75 |
+
{'find_what': '\\bä\\b', 'replacer': 'ja', 're': True},
|
76 |
+
{'find_what': '\\bÄ\\b', 'replacer': 'Ja', 're': True},
|
77 |
+
{'find_what': '(\\bö)([a-zöäüšžčě])', 'replacer': 'jo\\2', 're': True},
|
78 |
+
{'find_what': '(\\bä)([a-zöäüšžčě])', 'replacer': 'ja\\2', 're': True},
|
79 |
+
{'find_what': '(\\bü)([a-zöäüšžčě])', 'replacer': 'ju\\2', 're': True},
|
80 |
+
{'find_what': '(\\bÖ)([a-zöäüšžčě])', 'replacer': 'Jo\\2', 're': True},
|
81 |
+
{'find_what': '(\\bÄ)([a-zöäüšžčě])', 'replacer': 'Ja\\2', 're': True},
|
82 |
+
{'find_what': '(\\bÜ)([a-zöäüšžčě])', 'replacer': 'Ju\\2', 're': True},
|
83 |
+
{'find_what': '(\\bö)([A-ZÖÄÜŠŽČĚ])', 'replacer': 'jo\\2', 're': True},
|
84 |
+
{'find_what': '(\\bä)([A-ZÖÄÜŠŽČĚ])', 'replacer': 'ja\\2', 're': True},
|
85 |
+
{'find_what': '(\\bü)([A-ZÖÄÜŠŽČĚ])', 'replacer': 'ju\\2', 're': True},
|
86 |
+
{'find_what': '(\\bÖ)([A-ZÖÄÜŠŽČĚ])', 'replacer': 'JO\\2', 're': True},
|
87 |
+
{'find_what': '(\\bÄ)([A-ZÖÄÜŠŽČĚ])', 'replacer': 'JA\\2', 're': True},
|
88 |
+
{'find_what': '(\\bÜ)([A-ZÖÄÜŠŽČĚ])', 'replacer': 'JU\\2', 're': True},
|
89 |
+
{'find_what': '([aouiěyeöüäAOUIĚYEÖÜÄ])(ä)', 'replacer': '\\1ja', 're': True},
|
90 |
+
{'find_what': '([aouiěyeöüäAOUIĚYEÖÜÄ])(Ä)', 'replacer': '\\1JA', 're': True},
|
91 |
+
{'find_what': '([aouiěyeöüäAOUIĚYEÖÜÄ])(ö)', 'replacer': '\\1jo', 're': True},
|
92 |
+
{'find_what': '([aouiěyeöüäAOUIĚYEÖÜÄ])(Ö)', 'replacer': '\\1JO', 're': True},
|
93 |
+
{'find_what': '([aouiěyeöüäAOUIĚYEÖÜÄ])(ü)', 'replacer': '\\1ju', 're': True},
|
94 |
+
{'find_what': '([aouiěyeöüäAOUIĚYEÖÜÄ])(Ü)', 'replacer': '\\1JU', 're': True},
|
95 |
+
]
|
96 |
+
|
97 |
+
_cyr2lat_joint_acutes = [
|
98 |
+
{'find_what': 'ś', 'replacer': 'ś', 're': False},
|
99 |
+
{'find_what': 'ź', 'replacer': 'ź', 're': False},
|
100 |
+
{'find_what': 'ć', 'replacer': 'ć', 're': False},
|
101 |
+
{'find_what': 'ń', 'replacer': 'ń', 're': False},
|
102 |
+
{'find_what': 'ŕ', 'replacer': 'ŕ', 're': False},
|
103 |
+
{'find_what': 't́', 'replacer': 'ť', 're': False},
|
104 |
+
{'find_what': 'd́', 'replacer': 'ď', 're': False},
|
105 |
+
{'find_what': 'ĺ', 'replacer': 'ĺ', 're': False},
|
106 |
+
{'find_what': 'Ś', 'replacer': 'Ś', 're': False},
|
107 |
+
{'find_what': 'Ź', 'replacer': 'Ź', 're': False},
|
108 |
+
{'find_what': 'Ć', 'replacer': 'Ć', 're': False},
|
109 |
+
{'find_what': 'Ń', 'replacer': 'Ń', 're': False},
|
110 |
+
{'find_what': 'T́', 'replacer': 'Ť', 're': False},
|
111 |
+
{'find_what': 'D́', 'replacer': 'Ď', 're': False},
|
112 |
+
{'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
|
113 |
+
{'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
|
114 |
+
]
|
115 |
+
|
116 |
+
_cyr2lat_first_e = [
|
117 |
+
{'find_what': '\\bĚ', 'replacer': 'E', 're': True},
|
118 |
+
{'find_what': '\\bě', 'replacer': 'e', 're': True},
|
119 |
+
]
|
120 |
+
|
121 |
+
_cyr2lat_soft_l_after_vowels = [
|
122 |
+
# joint acutes | disjoint acutes
|
123 |
+
{'find_what': '([yiěeYIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
|
124 |
+
{'find_what': '([yiěeYIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
|
125 |
+
]
|
126 |
+
|
127 |
+
_lat2cyr = [
|
128 |
+
{'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
|
129 |
+
{'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
|
130 |
+
{'find_what': 'Ď', 'replacer': 'D́', 're': False},
|
131 |
+
{'find_what': 'Ť', 'replacer': 'T́', 're': False},
|
132 |
+
{'find_what': 'Ń', 'replacer': 'Ń', 're': False},
|
133 |
+
{'find_what': 'Ć', 'replacer': 'Ć', 're': False},
|
134 |
+
{'find_what': 'Ź', 'replacer': 'Ź', 're': False},
|
135 |
+
{'find_what': 'Ś', 'replacer': 'Ś', 're': False},
|
136 |
+
{'find_what': 'ĺ', 'replacer': 'ĺ', 're': False},
|
137 |
+
{'find_what': 'ď', 'replacer': 'd́', 're': False},
|
138 |
+
{'find_what': 'ť', 'replacer': 't́', 're': False},
|
139 |
+
{'find_what': 'ŕ', 'replacer': 'ŕ', 're': False},
|
140 |
+
{'find_what': 'ń', 'replacer': 'ń', 're': False},
|
141 |
+
{'find_what': 'ć', 'replacer': 'ć', 're': False},
|
142 |
+
{'find_what': 'ź', 'replacer': 'ź', 're': False},
|
143 |
+
{'find_what': 'ś', 'replacer': 'ś', 're': False},
|
144 |
+
# {'find_what': '\\1JU', 'replacer': '([aouiěyeöüäAOUIĚYEÖÜÄ])(Ü)', 're': True},
|
145 |
+
# {'find_what': '\\1ju', 'replacer': '([aouiěyeöüäAOUIĚYEÖÜÄ])(ü)', 're': True},
|
146 |
+
# {'find_what': '\\1JO', 'replacer': '([aouiěyeöüäAOUIĚYEÖÜÄ])(Ö)', 're': True},
|
147 |
+
# {'find_what': '\\1jo', 'replacer': '([aouiěyeöüäAOUIĚYEÖÜÄ])(ö)', 're': True},
|
148 |
+
# {'find_what': '\\1JA', 'replacer': '([aouiěyeöüäAOUIĚYEÖÜÄ])(Ä)', 're': True},
|
149 |
+
# {'find_what': '\\1ja', 'replacer': '([aouiěyeöüäAOUIĚYEÖÜÄ])(ä)', 're': True},
|
150 |
+
# {'find_what': 'JU\\2', 'replacer': '(\\bÜ)([A-ZÖÄÜŠŽČĚ])', 're': True},
|
151 |
+
# {'find_what': 'JA\\2', 'replacer': '(\\bÄ)([A-ZÖÄÜŠŽČĚ])', 're': True},
|
152 |
+
# {'find_what': 'JO\\2', 'replacer': '(\\bÖ)([A-ZÖÄÜŠŽČĚ])', 're': True},
|
153 |
+
# {'find_what': 'ju\\2', 'replacer': '(\\bü)([A-ZÖÄÜŠŽČĚ])', 're': True},
|
154 |
+
# {'find_what': 'ja\\2', 'replacer': '(\\bä)([A-ZÖÄÜŠŽČĚ])', 're': True},
|
155 |
+
# {'find_what': 'jo\\2', 'replacer': '(\\bö)([A-ZÖÄÜŠŽČĚ])', 're': True},
|
156 |
+
# {'find_what': 'Ju\\2', 'replacer': '(\\bÜ)([a-zöäüšžčě])', 're': True},
|
157 |
+
# {'find_what': 'Ja\\2', 'replacer': '(\\bÄ)([a-zöäüšžčě])', 're': True},
|
158 |
+
# {'find_what': 'Jo\\2', 'replacer': '(\\bÖ)([a-zöäüšžčě])', 're': True},
|
159 |
+
# {'find_what': 'ju\\2', 'replacer': '(\\bü)([a-zöäüšžčě])', 're': True},
|
160 |
+
# {'find_what': 'ja\\2', 'replacer': '(\\bä)([a-zöäüšžčě])', 're': True},
|
161 |
+
# {'find_what': 'jo\\2', 'replacer': '(\\bö)([a-zöäüšžčě])', 're': True},
|
162 |
+
# {'find_what': 'Ja', 'replacer': '\\bÄ\\b', 're': True},
|
163 |
+
# {'find_what': 'ja', 'replacer': '\\bä\\b', 're': True},
|
164 |
+
# {'find_what': 'Ju', 'replacer': '\\bÜ\\b', 're': True},
|
165 |
+
# {'find_what': 'ju', 'replacer': '\\bü\\b', 're': True},
|
166 |
+
# {'find_what': 'Jo', 'replacer': '\\bÖ\\b', 're': True},
|
167 |
+
# {'find_what': 'jo', 'replacer': '\\bö\\b', 're': True},
|
168 |
+
{'find_what': 'ö', 'replacer': 'ё', 're': False},
|
169 |
+
{'find_what': 'Ö', 'replacer': 'Ё', 're': False},
|
170 |
+
{'find_what': 'ü', 'replacer': 'ю', 're': False},
|
171 |
+
{'find_what': 'Ü', 'replacer': 'Ю', 're': False},
|
172 |
+
{'find_what': 'ä', 'replacer': 'я', 're': False},
|
173 |
+
{'find_what': 'Ä', 'replacer': 'Я', 're': False},
|
174 |
+
{'find_what': 'ě', 'replacer': 'э', 're': False},
|
175 |
+
{'find_what': 'Ě', 'replacer': 'Э', 're': False},
|
176 |
+
# {'find_what': '', 'replacer': 'ъ', 're': False},
|
177 |
+
# {'find_what': '', 'replacer': 'Ъ', 're': False},
|
178 |
+
{'find_what': 'j', 'replacer': 'й', 're': False},
|
179 |
+
{'find_what': 'J', 'replacer': 'Й', 're': False},
|
180 |
+
{'find_what': '́', 'replacer': 'ь', 're': False},
|
181 |
+
{'find_what': '́', 'replacer': 'Ь', 're': False},
|
182 |
+
{'find_what': 'čš', 'replacer': 'щ', 're': False},
|
183 |
+
{'find_what': 'Čš', 'replacer': 'Щ', 're': False},
|
184 |
+
{'find_what': 'ž', 'replacer': 'ж', 're': False},
|
185 |
+
{'find_what': 'Ž', 'replacer': 'Ж', 're': False},
|
186 |
+
{'find_what': 'š', 'replacer': 'ш', 're': False},
|
187 |
+
{'find_what': 'Š', 'replacer': 'Ш', 're': False},
|
188 |
+
{'find_what': 'č', 'replacer': 'ч', 're': False},
|
189 |
+
{'find_what': 'Č', 'replacer': 'Ч', 're': False},
|
190 |
+
{'find_what': 'c', 'replacer': 'ц', 're': False},
|
191 |
+
{'find_what': 'C', 'replacer': 'Ц', 're': False},
|
192 |
+
{'find_what': 'h', 'replacer': 'х', 're': False},
|
193 |
+
{'find_what': 'H', 'replacer': 'Х', 're': False},
|
194 |
+
{'find_what': 'f', 'replacer': 'ф', 're': False},
|
195 |
+
{'find_what': 'F', 'replacer': 'Ф', 're': False},
|
196 |
+
{'find_what': 't', 'replacer': 'т', 're': False},
|
197 |
+
{'find_what': 'T', 'replacer': 'Т', 're': False},
|
198 |
+
{'find_what': 's', 'replacer': 'с', 're': False},
|
199 |
+
{'find_what': 'S', 'replacer': 'С', 're': False},
|
200 |
+
{'find_what': 'r', 'replacer': 'р', 're': False},
|
201 |
+
{'find_what': 'R', 'replacer': 'Р', 're': False},
|
202 |
+
{'find_what': 'p', 'replacer': 'п', 're': False},
|
203 |
+
{'find_what': 'P', 'replacer': 'П', 're': False},
|
204 |
+
{'find_what': 'n', 'replacer': 'н', 're': False},
|
205 |
+
{'find_what': 'N', 'replacer': 'Н', 're': False},
|
206 |
+
{'find_what': 'm', 'replacer': 'м', 're': False},
|
207 |
+
{'find_what': 'M', 'replacer': 'М', 're': False},
|
208 |
+
{'find_what': 'l', 'replacer': 'л', 're': False},
|
209 |
+
{'find_what': 'L', 'replacer': 'Л', 're': False},
|
210 |
+
{'find_what': 'k', 'replacer': 'к', 're': False},
|
211 |
+
{'find_what': 'K', 'replacer': 'К', 're': False},
|
212 |
+
{'find_what': 'z', 'replacer': 'з', 're': False},
|
213 |
+
{'find_what': 'Z', 'replacer': 'З', 're': False},
|
214 |
+
{'find_what': 'd', 'replacer': 'д', 're': False},
|
215 |
+
{'find_what': 'D', 'replacer': 'Д', 're': False},
|
216 |
+
{'find_what': 'g', 'replacer': 'г', 're': False},
|
217 |
+
{'find_what': 'G', 'replacer': 'Г', 're': False},
|
218 |
+
{'find_what': 'v', 'replacer': 'в', 're': False},
|
219 |
+
{'find_what': 'V', 'replacer': 'В', 're': False},
|
220 |
+
{'find_what': 'b', 'replacer': 'б', 're': False},
|
221 |
+
{'find_what': 'B', 'replacer': 'Б', 're': False},
|
222 |
+
{'find_what': 'e', 'replacer': 'е', 're': False},
|
223 |
+
{'find_what': 'E', 'replacer': 'Е', 're': False},
|
224 |
+
{'find_what': 'i', 'replacer': 'и', 're': False},
|
225 |
+
{'find_what': 'I', 'replacer': 'И', 're': False},
|
226 |
+
{'find_what': 'y', 'replacer': 'ы', 're': False},
|
227 |
+
{'find_what': 'Y', 'replacer': 'Ы', 're': False},
|
228 |
+
{'find_what': 'u', 'replacer': 'у', 're': False},
|
229 |
+
{'find_what': 'U', 'replacer': 'У', 're': False},
|
230 |
+
{'find_what': 'o', 'replacer': 'о', 're': False},
|
231 |
+
{'find_what': 'O', 'replacer': 'О', 're': False},
|
232 |
+
{'find_what': 'a', 'replacer': 'а', 're': False},
|
233 |
+
{'find_what': 'A', 'replacer': 'А', 're': False},
|
234 |
+
# ya, yo, yu
|
235 |
+
{'find_what': 'Й[Аа]', 'replacer': 'Я', 're': True},
|
236 |
+
{'find_what': 'й[Аа]', 'replacer': 'я', 're': True},
|
237 |
+
{'find_what': 'Й[Ее]', 'replacer': 'Ё', 're': True},
|
238 |
+
{'find_what': 'й[Ее]', 'replacer': 'ё', 're': True},
|
239 |
+
{'find_what': 'Й[Уу]', 'replacer': 'Ю', 're': True},
|
240 |
+
{'find_what': 'й[Уу]', 'replacer': 'ю', 're': True},
|
241 |
+
# todo: introduce Ъ when appropriate
|
242 |
+
]
|
243 |
+
|
244 |
+
|
245 |
+
def transliterate_with_rules(text, rules):
    """Apply every rule in ``rules`` to ``text``, in order, and return the result.

    Each rule is a mapping with the keys ``'find_what'`` and ``'replacer'``.
    When the rule's ``'re'`` entry is truthy, ``'find_what'`` is used as a
    regular expression with ``re.sub``; otherwise it is a literal substring
    replaced with ``str.replace``. Rules are cumulative: each one operates on
    the output of the previous one.
    """
    for rule in rules:
        if rule.get('re'):
            # Regex rule: the replacer may use group references such as \1.
            text = re.sub(rule['find_what'], rule['replacer'], text)
            continue
        # Literal rule: replace all occurrences of the plain substring.
        text = text.replace(rule['find_what'], rule['replacer'])
    return text
|
252 |
+
|
253 |
+
|
254 |
+
def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
    """Transliterate Cyrillic ``text`` to Latin script.

    The base ``_cyr2lat`` rules always run; the keyword flags select which
    optional rule sets run afterwards.

    Args:
        text: Input string.
        joint_acute: When true, also apply ``_cyr2lat_joint_acutes``.
        first_e_with_hacek: When false, also apply ``_cyr2lat_first_e``
            (its patterns target word-initial ``Ě``/``ě``).
        soft_l_after_vowels: When false, also apply
            ``_cyr2lat_soft_l_after_vowels`` (its patterns target an
            acute-marked L following one of ``y i ě e``).

    Returns:
        The transliterated string.
    """
    # todo: support all the optional settings
    # (enabled?, rule set) pairs, in the order they must be applied.
    stages = (
        (True, _cyr2lat),
        (joint_acute, _cyr2lat_joint_acutes),
        (not first_e_with_hacek, _cyr2lat_first_e),
        (not soft_l_after_vowels, _cyr2lat_soft_l_after_vowels),
    )
    for enabled, rule_set in stages:
        if enabled:
            text = transliterate_with_rules(text, rule_set)
    return text
|
264 |
+
|
265 |
+
|
266 |
+
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
    """Transliterate Latin-script ``text`` to Cyrillic using the ``_lat2cyr`` rules.

    ``joint_acute``, ``first_e_with_hacek`` and ``soft_l_after_vowels`` are
    accepted for signature parity with ``cyr2lat`` but are currently not
    consulted: the same rule set is applied regardless of their values.
    """
    # todo: support all the optional settings
    cyrillic = transliterate_with_rules(text, _lat2cyr)
    return cyrillic
|
test_translit.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from myv_translit import cyr2lat
|
2 |
+
|
3 |
+
|
4 |
+
def test_join_acute():
    """The acute is merged into one character by default and kept separate on request."""
    joined = cyr2lat('кель')
    disjoint = cyr2lat('кель', joint_acute=False)
    # The two expected strings look alike; they differ only in whether the
    # acute is part of the letter (3 characters) or a separate mark (4).
    assert joined == 'keĺ'
    assert disjoint == 'keĺ'
    assert len(joined) == 3
    assert len(disjoint) == 4
|
9 |
+
|
10 |
+
|
11 |
+
def test_first_e():
    """Word-initial 'э' keeps its hacek by default and loses it when disabled."""
    with_hacek = cyr2lat('эрзя')
    without_hacek = cyr2lat('эрзя', first_e_with_hacek=False)
    assert with_hacek == 'ěrzä'
    assert without_hacek == 'erzä'
|
14 |
+
|
15 |
+
|
16 |
+
def test_soft_l():
    """Soft L after 'e' is marked by default and written plain when disabled."""
    marked = cyr2lat('пелькс')
    plain = cyr2lat('пелькс', soft_l_after_vowels=False)
    assert marked == 'peĺks'
    assert plain == 'pelks'
|
19 |
+
|
20 |
+
|
21 |
+
# todo: test on a larger corpus
|
22 |
+
# todo: test cyclical consistency
|
23 |
+
|