cointegrated committed on
Commit
7a5c5b0
1 Parent(s): 6f08302

apply the "less diacritics" rules for lat -> cyr

Browse files
Files changed (2) hide show
  1. myv_translit.py +20 -1
  2. test_translit.py +17 -0
myv_translit.py CHANGED
@@ -138,12 +138,28 @@ _cyr2lat_first_e = [
138
  {'find_what': '\\bě', 'replacer': 'e', 're': True},
139
  ]
140
 
 
 
 
 
 
141
  _cyr2lat_soft_l_after_vowels = [
142
  # joint acutes | disjoint acutes
143
  {'find_what': '([iěeIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
144
  {'find_what': '([iěeIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
145
  ]
146
 
 
 
 
 
 
 
 
 
 
 
 
147
  _lat2cyr = [
148
  {'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
149
  {'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
@@ -299,8 +315,11 @@ def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels
299
 
300
 
301
  def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
302
- # todo: support all the optional settings
 
303
  text = transliterate_with_rules(text, _lat2cyr)
 
 
304
  text = transliterate_with_rules(text, _lat2cyr_special_cases)
305
  return text
306
 
 
138
  {'find_what': '\\bě', 'replacer': 'e', 're': True},
139
  ]
140
 
141
+ _lat2cyr_first_e = [
142
+ {'find_what': '\\bE', 'replacer': 'Ě', 're': True},
143
+ {'find_what': '\\be', 'replacer': 'ě', 're': True},
144
+ ]
145
+
146
  _cyr2lat_soft_l_after_vowels = [
147
  # joint acutes | disjoint acutes
148
  {'find_what': '([iěeIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
149
  {'find_what': '([iěeIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
150
  ]
151
 
152
+ _lat2cyr_soft_l_after_vowels = [
153
+ # add the soft sign, but only if the next letter is not softening
154
+ {'find_what': '([иэеИЭЕ])(Л)\\b', 'replacer': '\\1Ль', 're': True},
155
+ {'find_what': '([иэеИЭЕ])(л)\\b', 'replacer': '\\1ль', 're': True},
156
+ {'find_what': '([иэеИЭЕ])(Л)([^ьъиеюяю])', 'replacer': '\\1ЛЬ\\3', 're': True},
157
+ {'find_what': '([иэеИЭЕ])(л)([^ьъиеюяю])', 'replacer': '\\1ль\\3', 're': True},
158
+ # special cases when L is still hard
159
+ # todo: fix all the exclusions from the list in https://t.me/ravo_club/9776
160
+ {'find_what': '([иэеИЭЕ][Лл])([Ьь])(ГАД|ГАВТ|гад|гавт)', 'replacer': '\\1\\3', 're': True},
161
+ ]
162
+
163
  _lat2cyr = [
164
  {'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
165
  {'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
 
315
 
316
 
317
  def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
318
+ if not first_e_with_hacek:
319
+ text = transliterate_with_rules(text, _lat2cyr_first_e)
320
  text = transliterate_with_rules(text, _lat2cyr)
321
+ if not soft_l_after_vowels:
322
+ text = transliterate_with_rules(text, _lat2cyr_soft_l_after_vowels)
323
  text = transliterate_with_rules(text, _lat2cyr_special_cases)
324
  return text
325
 
test_translit.py CHANGED
@@ -53,6 +53,23 @@ def test_consistency():
53
  line_cyr2 = lat2cyr(line_lat)
54
  assert line_cyr == line_cyr2
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  def test_zontik():
58
  with open('examples/zontik_cyr.txt', 'r') as f:
 
53
  line_cyr2 = lat2cyr(line_lat)
54
  assert line_cyr == line_cyr2
55
 
56
+ for line_cyr in lines:
57
+ line_lat = cyr2lat(line_cyr, joint_acute=False)
58
+ line_cyr2 = lat2cyr(line_lat, joint_acute=False)
59
+ assert line_cyr == line_cyr2
60
+
61
+ for line_cyr in lines:
62
+ line_lat = cyr2lat(line_cyr, soft_l_after_vowels=False)
63
+ line_cyr2 = lat2cyr(line_lat, soft_l_after_vowels=False)
64
+ assert line_cyr == line_cyr2
65
+
66
+ for line_cyr in lines:
67
+ if ' ежос' in line_cyr: # normally, this does not happen in the Erzya language
68
+ continue
69
+ line_lat = cyr2lat(line_cyr, first_e_with_hacek=False)
70
+ line_cyr2 = lat2cyr(line_lat, first_e_with_hacek=False)
71
+ assert line_cyr == line_cyr2
72
+
73
 
74
  def test_zontik():
75
  with open('examples/zontik_cyr.txt', 'r') as f: