micahg committed on
Commit
f11b13a
1 Parent(s): 3f61297

added rules for asterisk script; fixed nasalized vowel combination issues

Browse files
epitran/data/post/asterisk.txt CHANGED
@@ -1,19 +1,19 @@
1
- ɑ̃ɑ -> ɑ̃ː / _
2
- ɑɑ̃ -> ɑ̃ː / _
3
- ɑɑ -> ɑː / _
4
 
5
- ẽe -> ẽː / _
6
- eẽ -> ẽː / _
7
- ee -> eː / _
8
 
9
- ĩi -> ĩː / _
10
- iĩ -> ĩː / _
11
- ii -> iː / _
12
 
13
- ɔ̃ɔ -> ɔ̃ː / _
14
- ɔɔ̃ -> ɔ̃ː / _
15
- ɔɔ -> ɔː / _
16
 
17
- ũu -> ũː / _
18
- uũ -> ũː / _
19
- uu -> uː / _
 
1
+ %ɑ̃ɑ -> ɑ̃ː / _
2
+ %ɑɑ̃ -> ɑ̃ː / _
3
+ %ɑɑ -> ɑː / _
4
 
5
+ %ẽe -> ẽː / _
6
+ %eẽ -> ẽː / _
7
+ %ee -> eː / _
8
 
9
+ %ĩi -> ĩː / _
10
+ %iĩ -> ĩː / _
11
+ %ii -> iː / _
12
 
13
+ %ɔ̃ɔ -> ɔ̃ː / _
14
+ %ɔɔ̃ -> ɔ̃ː / _
15
+ %ɔɔ -> ɔː / _
16
 
17
+ %ũu -> ũː / _
18
+ %uũ -> ũː / _
19
+ %uu -> uː / _
epitran/data/post/rhg-lroh.txt CHANGED
@@ -1,19 +1,19 @@
1
- ɑ̃ɑ -> ɑ̃ː / _
2
- ɑɑ̃ -> ɑ̃ː / _
3
- ɑɑ -> ɑː / _
4
 
5
- ẽe -> ẽː / _
6
- eẽ -> ẽː / _
7
- ee -> eː / _
8
 
9
- ĩi -> ĩː / _
10
- iĩ -> ĩː / _
11
- ii -> iː / _
12
 
13
- ɔ̃ɔ -> ɔ̃ː / _
14
- ɔɔ̃ -> ɔ̃ː / _
15
- ɔɔ -> ɔː / _
16
 
17
- ũu -> ũː / _
18
- uũ -> ũː / _
19
- uu -> uː / _
 
1
+ %ɑ̃ɑ -> ɑ̃ː / _
2
+ %ɑɑ̃ -> ɑ̃ː / _
3
+ %ɑɑ -> ɑː / _
4
 
5
+ %ẽe -> ẽː / _
6
+ %eẽ -> ẽː / _
7
+ %ee -> eː / _
8
 
9
+ %ĩi -> ĩː / _
10
+ %iĩ -> ĩː / _
11
+ %ii -> iː / _
12
 
13
+ %ɔ̃ɔ -> ɔ̃ː / _
14
+ %ɔɔ̃ -> ɔ̃ː / _
15
+ %ɔɔ -> ɔː / _
16
 
17
+ %ũu -> ũː / _
18
+ %uũ -> ũː / _
19
+ %uu -> uː / _
epitran/data/post/rhg-roheng-old.txt CHANGED
@@ -1,14 +1,14 @@
1
- ɑɑ̃ -> ɑ̃ː / _
2
- ɑɑ -> ɑː / _
3
 
4
- eẽ -> ẽː / _
5
- ee -> eː / _
6
 
7
- iĩ -> ĩː / _
8
- ii -> iː / _
9
 
10
- ɔɔ̃ -> ɔ̃ː / _
11
- oo -> ɔː / _
12
 
13
- uũ -> ũː / _
14
- uu -> uː / _
 
1
+ %ɑɑ̃ -> ɑ̃ː / _
2
+ %ɑɑ -> ɑː / _
3
 
4
+ %eẽ -> ẽː / _
5
+ %ee -> eː / _
6
 
7
+ %iĩ -> ĩː / _
8
+ %ii -> iː / _
9
 
10
+ %ɔɔ̃ -> ɔ̃ː / _
11
+ %oo -> ɔː / _
12
 
13
+ %uũ -> ũː / _
14
+ %uu -> uː / _
epitran/data/post/rhg-roheng.txt CHANGED
@@ -1,19 +1,19 @@
1
- ɑ̃ɑ -> ɑ̃ː / _
2
- ɑɑ̃ -> ɑ̃ː / _
3
- ɑɑ -> ɑː / _
4
 
5
- ẽe -> ẽː / _
6
- eẽ -> ẽː / _
7
- ee -> eː / _
8
 
9
- ĩi -> ĩː / _
10
- iĩ -> ĩː / _
11
- ii -> iː / _
12
 
13
- ɔ̃ɔ -> ɔ̃ː / _
14
- ɔɔ̃ -> ɔ̃ː / _
15
- ɔɔ -> ɔː / _
16
 
17
- ũu -> ũː / _
18
- uũ -> ũː / _
19
- uu -> uː / _
 
1
+ %ɑ̃ɑ -> ɑ̃ː / _
2
+ %ɑɑ̃ -> ɑ̃ː / _
3
+ %ɑɑ -> ɑː / _
4
 
5
+ %ẽe -> ẽː / _
6
+ %eẽ -> ẽː / _
7
+ %ee -> eː / _
8
 
9
+ %ĩi -> ĩː / _
10
+ %iĩ -> ĩː / _
11
+ %ii -> iː / _
12
 
13
+ %ɔ̃ɔ -> ɔ̃ː / _
14
+ %ɔɔ̃ -> ɔ̃ː / _
15
+ %ɔɔ -> ɔː / _
16
 
17
+ %ũu -> ũː / _
18
+ %uũ -> ũː / _
19
+ %uu -> uː / _
epitran/data/pre/asterisk.txt CHANGED
@@ -7,7 +7,13 @@
7
  í -> i / _
8
  ó -> o / _
9
  ú -> u / _
10
- ' -> 0 / _
 
 
 
 
 
 
11
 
12
  % vowel glides
13
  w -> 0 / (u|u\*) _ (a|e|i|o)
@@ -18,4 +24,4 @@ a\* -> ɑ̃ / _
18
  e\* -> ẽ / _
19
  i\* -> ĩ / _
20
  o\* -> ɔ̃ / _
21
- u\* -> ũ / _
 
7
  í -> i / _
8
  ó -> o / _
9
  ú -> u / _
10
+
11
+ %---- moved to the preprocessing step in functions.py
12
+ %' -> \s / _
13
+ %’ -> \s / _
14
+
15
+ % drop word-final /y/ and /h/ - moved to functions.py because this rule treated # as string-final, not word-final
16
+ %(y|h) -> 0 / _ #
17
 
18
  % vowel glides
19
  w -> 0 / (u|u\*) _ (a|e|i|o)
 
24
  e\* -> ẽ / _
25
  i\* -> ĩ / _
26
  o\* -> ɔ̃ / _
27
+ u\* -> ũ / _
epitran/data/pre/rhg-lroh.txt CHANGED
@@ -9,9 +9,5 @@
9
  ú -> u / _
10
 
11
  % vowel glides
12
- w -> 0 / (u|ũ) _ (a|e|i|o)
13
- y -> 0 / (i|ĩ) _ (a|e|o|u)
14
-
15
- % long vowels
16
-
17
- % gemination
 
9
  ú -> u / _
10
 
11
  % vowel glides
12
+ w -> 0 / (u|ũ) _ (a|ã|e|ẽ|i|ĩ|o)
13
+ y -> 0 / (i|ĩ) _ (a|ã|e|ẽ|o|õ|u)
 
 
 
 
epitran/data/pre/rhg-roheng.txt CHANGED
@@ -9,5 +9,5 @@
9
  ú -> u / _
10
 
11
  % vowel glides
12
- w -> 0 / (u|ũ) _ (a|e|i|o)
13
- y -> 0 / (i|ĩ) _ (a|e|o|u)
 
9
  ú -> u / _
10
 
11
  % vowel glides
12
+ w -> 0 / (u|ũ) _ (a|ã|e|ẽ|i|ĩ|o)
13
+ y -> 0 / (i|ĩ) _ (a|ã|e|ẽ|o|õ|u)
functions.py CHANGED
@@ -18,7 +18,7 @@ def to_lroh(s):
18
  s = s.replace('iː', 'ii')
19
  s = s.replace('ɔ̃ː', 'ɔɔ̃')
20
  s = s.replace('ɔː', 'ɔɔ')
21
- s = s.replace('ũː', 'uũ')
22
  s = s.replace('uː', 'uu')
23
 
24
  s = s.replace('ɑ', 'a')
@@ -26,6 +26,13 @@ def to_lroh(s):
26
 
27
  s = s.replace('ɔ̃', 'õ')
28
  s = s.replace('ɔ', 'o')
 
 
 
 
 
 
 
29
 
30
  return s
31
 
@@ -83,7 +90,7 @@ def to_roheng(s):
83
  s = s.replace('iː', 'ii')
84
  s = s.replace('ɔ̃ː', 'ɔɔ̃')
85
  s = s.replace('ɔː', 'ɔɔ')
86
- s = s.replace('ũː', 'uũ')
87
  s = s.replace('uː', 'uu')
88
 
89
  s = s.replace('ɑ', 'a')
@@ -95,20 +102,24 @@ def to_roheng(s):
95
  """
96
  glides/dipthongs/trithongs
97
  """
98
- # insert 'y' after i if it is followed by any vowel
99
- #s = re.sub(r'i([aãeẽoõuũ])', r'iy\1', s) ---- doesn't work if trithongs exist that start with 'i
100
- #s = re.sub(r'ĩ([aãeẽoõuũ])', r'ĩy\1', s)
101
-
 
 
 
 
102
  words=s.split(' ')
103
 
104
  for i in range(len(words)):
105
  # trithongs
106
- if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
107
- words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
108
- words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
109
  # dipthongs/glides
110
- elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
111
- words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
112
 
113
  temp_s = ' '.join(words)
114
 
@@ -120,11 +131,20 @@ def convert_script(input_script, output_script, input_text):
120
 
121
  epi = epitran.Epitran(input_script)
122
 
123
- # initial step to account for 'R' in the asterisk step -
124
- # replaces non-word initial 'R's with 'rh' for Epitran processing
125
  if (input_script == 'asterisk'):
 
126
  input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
127
  input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
 
 
 
 
 
 
 
 
 
128
 
129
  #print (input_text)
130
  lines = input_text.split('\n')
@@ -134,11 +154,19 @@ def convert_script(input_script, output_script, input_text):
134
  # store indices for capitalized words (will assume only first letter is capitalized)
135
  words = line.split()
136
  capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
 
 
 
 
 
137
  #print (capital_indices)
 
138
 
 
 
139
  grapheme_text = epi.transliterate(line)
140
 
141
- #print (grapheme_text)
142
 
143
  if output_script == 'rhg-roheng-old':
144
  inter_text = to_roheng_old(grapheme_text)
@@ -154,16 +182,14 @@ def convert_script(input_script, output_script, input_text):
154
  for i in capital_indices:
155
  if i < len(words):
156
  words[i] = words[i].capitalize()
157
-
 
 
 
 
158
  output_line = ' '.join(words)
159
  output_text = output_text + output_line + '\n'
160
 
161
  #print (output_text + '\n##################################################\n')
162
 
163
- return output_text.strip()
164
-
165
- # Issues:
166
- #
167
- # ou
168
- # glides with only one vowel nasalized (i.e is the whole glide always nasalized) (.e.g thiañ/ṭĩya) - need a constant way to deal with glides and nasalization (i.e. which vowel is nasalized?)
169
- # stress
 
18
  s = s.replace('iː', 'ii')
19
  s = s.replace('ɔ̃ː', 'ɔɔ̃')
20
  s = s.replace('ɔː', 'ɔɔ')
21
+ s = s.replace('ũː', '')
22
  s = s.replace('uː', 'uu')
23
 
24
  s = s.replace('ɑ', 'a')
 
26
 
27
  s = s.replace('ɔ̃', 'õ')
28
  s = s.replace('ɔ', 'o')
29
+
30
+ # step to standardize all nasalized vowels as precomposed characters
31
+ s = re.sub('ã', 'ã', s)
32
+ s = re.sub('ẽ', 'ẽ', s)
33
+ s = re.sub('ĩ', 'ĩ', s)
34
+ s = re.sub('õ', 'õ', s)
35
+ s = re.sub('ũ', 'ũ', s)
36
 
37
  return s
38
 
 
90
  s = s.replace('iː', 'ii')
91
  s = s.replace('ɔ̃ː', 'ɔɔ̃')
92
  s = s.replace('ɔː', 'ɔɔ')
93
+ s = s.replace('ũː', '')
94
  s = s.replace('uː', 'uu')
95
 
96
  s = s.replace('ɑ', 'a')
 
102
  """
103
  glides/dipthongs/trithongs
104
  """
105
+
106
+ # step to standardize all nasalized vowels as precomposed characters
107
+ s = re.sub('ã', 'ã', s)
108
+ s = re.sub('ẽ', 'ẽ', s)
109
+ s = re.sub('ĩ', 'ĩ', s)
110
+ s = re.sub('õ', 'õ', s)
111
+ s = re.sub('ũ', 'ũ', s)
112
+
113
  words=s.split(' ')
114
 
115
  for i in range(len(words)):
116
  # trithongs
117
+ #if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
118
+ words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
119
+ words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
120
  # dipthongs/glides
121
+ #elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
122
+ words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
123
 
124
  temp_s = ' '.join(words)
125
 
 
131
 
132
  epi = epitran.Epitran(input_script)
133
 
134
+ # initial steps for asterisk script
 
135
  if (input_script == 'asterisk'):
136
+ # replaces non-word-initial 'R's with 'rh' for Epitran processing
137
  input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
138
  input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
139
+ # non-word-initial/final hyphens and apostrophes/single quotes
140
+ input_text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', input_text)
141
+ # remove word final y/h
142
+ input_text = re.sub(r'[yh]\b', '', input_text)
143
+ # double every single j
144
+ input_text = re.sub('j', 'jj', input_text)
145
+ input_text = re.sub('J', 'Jj', input_text)
146
+ input_text = re.sub('jjjj', 'jj', input_text)
147
+ input_text = re.sub('jjj', 'j', input_text)
148
 
149
  #print (input_text)
150
  lines = input_text.split('\n')
 
154
  # store indices for capitalized words (will assume only first letter is capitalized)
155
  words = line.split()
156
  capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
157
+ capital_quote_indices = [i for i, word in enumerate(words)
158
+ if word and
159
+ word[0] in ('\"', '“', '\'', '’') and
160
+ word[1].isupper()
161
+ ]
162
  #print (capital_indices)
163
+ #print (capital_quotes_)
164
 
165
+ #print (f'Before epitran: {line}')
166
+
167
  grapheme_text = epi.transliterate(line)
168
 
169
+ #print (f'After epitran: {grapheme_text}')
170
 
171
  if output_script == 'rhg-roheng-old':
172
  inter_text = to_roheng_old(grapheme_text)
 
182
  for i in capital_indices:
183
  if i < len(words):
184
  words[i] = words[i].capitalize()
185
+ for i in capital_quote_indices:
186
+ if i < len(words):
187
+ if len(words[i]) > 1:
188
+ words[i] = words[i][0] + words[i][1].upper() + words[i][2:]
189
+
190
  output_line = ' '.join(words)
191
  output_text = output_text + output_line + '\n'
192
 
193
  #print (output_text + '\n##################################################\n')
194
 
195
+ return output_text.strip()