Spaces:
Sleeping
Sleeping
added rules for asterisk script; fixed nasalized vowel combination issues
Browse files
- epitran/data/post/asterisk.txt +15 -15
- epitran/data/post/rhg-lroh.txt +15 -15
- epitran/data/post/rhg-roheng-old.txt +10 -10
- epitran/data/post/rhg-roheng.txt +15 -15
- epitran/data/pre/asterisk.txt +8 -2
- epitran/data/pre/rhg-lroh.txt +2 -6
- epitran/data/pre/rhg-roheng.txt +2 -2
- functions.py +48 -22
epitran/data/post/asterisk.txt
CHANGED
@@ -1,19 +1,19 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
|
5 |
-
|
6 |
-
eẽ -> ẽː / _
|
7 |
-
ee -> eː / _
|
8 |
|
9 |
-
|
10 |
-
iĩ -> ĩː / _
|
11 |
-
ii -> iː / _
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
|
17 |
-
ũu -> ũː / _
|
18 |
-
uũ -> ũː / _
|
19 |
-
uu -> uː / _
|
|
|
1 |
+
%ɑ̃ɑ -> ɑ̃ː / _
|
2 |
+
%ɑɑ̃ -> ɑ̃ː / _
|
3 |
+
%ɑɑ -> ɑː / _
|
4 |
|
5 |
+
%ẽe -> ẽː / _
|
6 |
+
%eẽ -> ẽː / _
|
7 |
+
%ee -> eː / _
|
8 |
|
9 |
+
%ĩi -> ĩː / _
|
10 |
+
%iĩ -> ĩː / _
|
11 |
+
%ii -> iː / _
|
12 |
|
13 |
+
%ɔ̃ɔ -> ɔ̃ː / _
|
14 |
+
%ɔɔ̃ -> ɔ̃ː / _
|
15 |
+
%ɔɔ -> ɔː / _
|
16 |
|
17 |
+
%ũu -> ũː / _
|
18 |
+
%uũ -> ũː / _
|
19 |
+
%uu -> uː / _
|
epitran/data/post/rhg-lroh.txt
CHANGED
@@ -1,19 +1,19 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
|
5 |
-
|
6 |
-
eẽ -> ẽː / _
|
7 |
-
ee -> eː / _
|
8 |
|
9 |
-
|
10 |
-
iĩ -> ĩː / _
|
11 |
-
ii -> iː / _
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
|
17 |
-
ũu -> ũː / _
|
18 |
-
uũ -> ũː / _
|
19 |
-
uu -> uː / _
|
|
|
1 |
+
%ɑ̃ɑ -> ɑ̃ː / _
|
2 |
+
%ɑɑ̃ -> ɑ̃ː / _
|
3 |
+
%ɑɑ -> ɑː / _
|
4 |
|
5 |
+
%ẽe -> ẽː / _
|
6 |
+
%eẽ -> ẽː / _
|
7 |
+
%ee -> eː / _
|
8 |
|
9 |
+
%ĩi -> ĩː / _
|
10 |
+
%iĩ -> ĩː / _
|
11 |
+
%ii -> iː / _
|
12 |
|
13 |
+
%ɔ̃ɔ -> ɔ̃ː / _
|
14 |
+
%ɔɔ̃ -> ɔ̃ː / _
|
15 |
+
%ɔɔ -> ɔː / _
|
16 |
|
17 |
+
%ũu -> ũː / _
|
18 |
+
%uũ -> ũː / _
|
19 |
+
%uu -> uː / _
|
epitran/data/post/rhg-roheng-old.txt
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
-
|
2 |
-
|
3 |
|
4 |
-
eẽ -> ẽː / _
|
5 |
-
ee -> eː / _
|
6 |
|
7 |
-
iĩ -> ĩː / _
|
8 |
-
ii -> iː / _
|
9 |
|
10 |
-
|
11 |
-
oo -> ɔː / _
|
12 |
|
13 |
-
uũ -> ũː / _
|
14 |
-
uu -> uː / _
|
|
|
1 |
+
%ɑɑ̃ -> ɑ̃ː / _
|
2 |
+
%ɑɑ -> ɑː / _
|
3 |
|
4 |
+
%eẽ -> ẽː / _
|
5 |
+
%ee -> eː / _
|
6 |
|
7 |
+
%iĩ -> ĩː / _
|
8 |
+
%ii -> iː / _
|
9 |
|
10 |
+
%ɔɔ̃ -> ɔ̃ː / _
|
11 |
+
%oo -> ɔː / _
|
12 |
|
13 |
+
%uũ -> ũː / _
|
14 |
+
%uu -> uː / _
|
epitran/data/post/rhg-roheng.txt
CHANGED
@@ -1,19 +1,19 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
|
5 |
-
|
6 |
-
eẽ -> ẽː / _
|
7 |
-
ee -> eː / _
|
8 |
|
9 |
-
|
10 |
-
iĩ -> ĩː / _
|
11 |
-
ii -> iː / _
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
|
17 |
-
ũu -> ũː / _
|
18 |
-
uũ -> ũː / _
|
19 |
-
uu -> uː / _
|
|
|
1 |
+
%ɑ̃ɑ -> ɑ̃ː / _
|
2 |
+
%ɑɑ̃ -> ɑ̃ː / _
|
3 |
+
%ɑɑ -> ɑː / _
|
4 |
|
5 |
+
%ẽe -> ẽː / _
|
6 |
+
%eẽ -> ẽː / _
|
7 |
+
%ee -> eː / _
|
8 |
|
9 |
+
%ĩi -> ĩː / _
|
10 |
+
%iĩ -> ĩː / _
|
11 |
+
%ii -> iː / _
|
12 |
|
13 |
+
%ɔ̃ɔ -> ɔ̃ː / _
|
14 |
+
%ɔɔ̃ -> ɔ̃ː / _
|
15 |
+
%ɔɔ -> ɔː / _
|
16 |
|
17 |
+
%ũu -> ũː / _
|
18 |
+
%uũ -> ũː / _
|
19 |
+
%uu -> uː / _
|
epitran/data/pre/asterisk.txt
CHANGED
@@ -7,7 +7,13 @@
|
|
7 |
í -> i / _
|
8 |
ó -> o / _
|
9 |
ú -> u / _
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
% vowel glides
|
13 |
w -> 0 / (u|u\*) _ (a|e|i|o)
|
@@ -18,4 +24,4 @@ a\* -> ɑ̃ / _
|
|
18 |
e\* -> ẽ / _
|
19 |
i\* -> ĩ / _
|
20 |
o\* -> ɔ̃ / _
|
21 |
-
u\* -> ũ / _
|
|
|
7 |
í -> i / _
|
8 |
ó -> o / _
|
9 |
ú -> u / _
|
10 |
+
|
11 |
+
%----moved to function preprocessing
|
12 |
+
%' -> \s / _
|
13 |
+
%’ -> \s / _
|
14 |
+
|
15 |
+
% drop word-final /y/ and /h/ - moved to functions.py because the rule here treated # as string-final, not word-final
|
16 |
+
%(y|h) -> 0 / _ #
|
17 |
|
18 |
% vowel glides
|
19 |
w -> 0 / (u|u\*) _ (a|e|i|o)
|
|
|
24 |
e\* -> ẽ / _
|
25 |
i\* -> ĩ / _
|
26 |
o\* -> ɔ̃ / _
|
27 |
+
u\* -> ũ / _
|
epitran/data/pre/rhg-lroh.txt
CHANGED
@@ -9,9 +9,5 @@
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
-
w -> 0 / (u|ũ) _ (a
|
13 |
-
y -> 0 / (i|ĩ) _ (a
|
14 |
-
|
15 |
-
% long vowels
|
16 |
-
|
17 |
-
% gemination
|
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
+
w -> 0 / (u|ũ) _ (a|ã|e|ẽ|i|ĩ|o|õ)
|
13 |
+
y -> 0 / (i|ĩ) _ (a|ã|e|ẽ|o|õ|u|ũ)
|
|
|
|
|
|
|
|
epitran/data/pre/rhg-roheng.txt
CHANGED
@@ -9,5 +9,5 @@
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
-
w -> 0 / (u|ũ) _ (a
|
13 |
-
y -> 0 / (i|ĩ) _ (a
|
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
+
w -> 0 / (u|ũ) _ (a|ã|e|ẽ|i|ĩ|o|õ)
|
13 |
+
y -> 0 / (i|ĩ) _ (a|ã|e|ẽ|o|õ|u|ũ)
|
functions.py
CHANGED
@@ -18,7 +18,7 @@ def to_lroh(s):
|
|
18 |
s = s.replace('iː', 'ii')
|
19 |
s = s.replace('ɔ̃ː', 'ɔɔ̃')
|
20 |
s = s.replace('ɔː', 'ɔɔ')
|
21 |
-
s = s.replace('ũː', '
|
22 |
s = s.replace('uː', 'uu')
|
23 |
|
24 |
s = s.replace('ɑ', 'a')
|
@@ -26,6 +26,13 @@ def to_lroh(s):
|
|
26 |
|
27 |
s = s.replace('ɔ̃', 'õ')
|
28 |
s = s.replace('ɔ', 'o')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
return s
|
31 |
|
@@ -83,7 +90,7 @@ def to_roheng(s):
|
|
83 |
s = s.replace('iː', 'ii')
|
84 |
s = s.replace('ɔ̃ː', 'ɔɔ̃')
|
85 |
s = s.replace('ɔː', 'ɔɔ')
|
86 |
-
s = s.replace('ũː', '
|
87 |
s = s.replace('uː', 'uu')
|
88 |
|
89 |
s = s.replace('ɑ', 'a')
|
@@ -95,20 +102,24 @@ def to_roheng(s):
|
|
95 |
"""
|
96 |
glides/dipthongs/trithongs
|
97 |
"""
|
98 |
-
|
99 |
-
#
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
102 |
words=s.split(' ')
|
103 |
|
104 |
for i in range(len(words)):
|
105 |
# trithongs
|
106 |
-
if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
|
107 |
-
|
108 |
-
|
109 |
# dipthongs/glides
|
110 |
-
elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
|
111 |
-
|
112 |
|
113 |
temp_s = ' '.join(words)
|
114 |
|
@@ -120,11 +131,20 @@ def convert_script(input_script, output_script, input_text):
|
|
120 |
|
121 |
epi = epitran.Epitran(input_script)
|
122 |
|
123 |
-
# initial
|
124 |
-
# replaces non-word initial 'R's with 'rh' for Epitran processing
|
125 |
if (input_script == 'asterisk'):
|
|
|
126 |
input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
|
127 |
input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
#print (input_text)
|
130 |
lines = input_text.split('\n')
|
@@ -134,11 +154,19 @@ def convert_script(input_script, output_script, input_text):
|
|
134 |
# store indices for capitalized words (will assume only first letter is capitalized)
|
135 |
words = line.split()
|
136 |
capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
|
|
|
|
|
|
|
|
|
|
|
137 |
#print (capital_indices)
|
|
|
138 |
|
|
|
|
|
139 |
grapheme_text = epi.transliterate(line)
|
140 |
|
141 |
-
#print (grapheme_text)
|
142 |
|
143 |
if output_script == 'rhg-roheng-old':
|
144 |
inter_text = to_roheng_old(grapheme_text)
|
@@ -154,16 +182,14 @@ def convert_script(input_script, output_script, input_text):
|
|
154 |
for i in capital_indices:
|
155 |
if i < len(words):
|
156 |
words[i] = words[i].capitalize()
|
157 |
-
|
|
|
|
|
|
|
|
|
158 |
output_line = ' '.join(words)
|
159 |
output_text = output_text + output_line + '\n'
|
160 |
|
161 |
#print (output_text + '\n##################################################\n')
|
162 |
|
163 |
-
return output_text.strip()
|
164 |
-
|
165 |
-
# Issues:
|
166 |
-
#
|
167 |
-
# ou
|
168 |
-
# glides with only one vowel nasalized (i.e., is the whole glide always nasalized?) (e.g., thiañ/ṭĩya) - need a consistent way to deal with glides and nasalization (i.e., which vowel is nasalized?)
|
169 |
-
# stress
|
|
|
18 |
s = s.replace('iː', 'ii')
|
19 |
s = s.replace('ɔ̃ː', 'ɔɔ̃')
|
20 |
s = s.replace('ɔː', 'ɔɔ')
|
21 |
+
s = s.replace('ũː', 'uũ')
|
22 |
s = s.replace('uː', 'uu')
|
23 |
|
24 |
s = s.replace('ɑ', 'a')
|
|
|
26 |
|
27 |
s = s.replace('ɔ̃', 'õ')
|
28 |
s = s.replace('ɔ', 'o')
|
29 |
+
|
30 |
+
# step to standardize all nasalized vowels as precomposed characters
|
31 |
+
s = re.sub('ã', 'ã', s)
|
32 |
+
s = re.sub('ẽ', 'ẽ', s)
|
33 |
+
s = re.sub('ĩ', 'ĩ', s)
|
34 |
+
s = re.sub('õ', 'õ', s)
|
35 |
+
s = re.sub('ũ', 'ũ', s)
|
36 |
|
37 |
return s
|
38 |
|
|
|
90 |
s = s.replace('iː', 'ii')
|
91 |
s = s.replace('ɔ̃ː', 'ɔɔ̃')
|
92 |
s = s.replace('ɔː', 'ɔɔ')
|
93 |
+
s = s.replace('ũː', 'uũ')
|
94 |
s = s.replace('uː', 'uu')
|
95 |
|
96 |
s = s.replace('ɑ', 'a')
|
|
|
102 |
"""
|
103 |
glides/dipthongs/trithongs
|
104 |
"""
|
105 |
+
|
106 |
+
# step to standardize all nasalized vowels as precomposed characters
|
107 |
+
s = re.sub('ã', 'ã', s)
|
108 |
+
s = re.sub('ẽ', 'ẽ', s)
|
109 |
+
s = re.sub('ĩ', 'ĩ', s)
|
110 |
+
s = re.sub('õ', 'õ', s)
|
111 |
+
s = re.sub('ũ', 'ũ', s)
|
112 |
+
|
113 |
words=s.split(' ')
|
114 |
|
115 |
for i in range(len(words)):
|
116 |
# trithongs
|
117 |
+
#if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
|
118 |
+
words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
|
119 |
+
words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
|
120 |
# dipthongs/glides
|
121 |
+
#elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
|
122 |
+
words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
|
123 |
|
124 |
temp_s = ' '.join(words)
|
125 |
|
|
|
131 |
|
132 |
epi = epitran.Epitran(input_script)
|
133 |
|
134 |
+
# initial steps for asterisk script
|
|
|
135 |
if (input_script == 'asterisk'):
|
136 |
+
# replaces non-word-initial 'R's with 'rh' for Epitran processing
|
137 |
input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
|
138 |
input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
|
139 |
+
# non-word-initial/final hyphens and apostrophes/single quotes
|
140 |
+
input_text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', input_text)
|
141 |
+
# remove word final y/h
|
142 |
+
input_text = re.sub(r'[yh]\b', '', input_text)
|
143 |
+
# double every single j
|
144 |
+
input_text = re.sub('j', 'jj', input_text)
|
145 |
+
input_text = re.sub('J', 'Jj', input_text)
|
146 |
+
input_text = re.sub('jjjj', 'jj', input_text)
|
147 |
+
input_text = re.sub('jjj', 'j', input_text)
|
148 |
|
149 |
#print (input_text)
|
150 |
lines = input_text.split('\n')
|
|
|
154 |
# store indices for capitalized words (will assume only first letter is capitalized)
|
155 |
words = line.split()
|
156 |
capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
|
157 |
+
capital_quote_indices = [i for i, word in enumerate(words)
|
158 |
+
if word and
|
159 |
+
word[0] in ('\"', '“', '\'', '’') and
|
160 |
+
word[1].isupper()
|
161 |
+
]
|
162 |
#print (capital_indices)
|
163 |
+
#print (capital_quotes_)
|
164 |
|
165 |
+
#print (f'Before epitran: {line}')
|
166 |
+
|
167 |
grapheme_text = epi.transliterate(line)
|
168 |
|
169 |
+
#print (f'After epitran: {grapheme_text}')
|
170 |
|
171 |
if output_script == 'rhg-roheng-old':
|
172 |
inter_text = to_roheng_old(grapheme_text)
|
|
|
182 |
for i in capital_indices:
|
183 |
if i < len(words):
|
184 |
words[i] = words[i].capitalize()
|
185 |
+
for i in capital_quote_indices:
|
186 |
+
if i < len(words):
|
187 |
+
if len(words[i]) > 1:
|
188 |
+
words[i] = words[i][0] + words[i][1].upper() + words[i][2:]
|
189 |
+
|
190 |
output_line = ' '.join(words)
|
191 |
output_text = output_text + output_line + '\n'
|
192 |
|
193 |
#print (output_text + '\n##################################################\n')
|
194 |
|
195 |
+
return output_text.strip()
|
|
|
|
|
|
|
|
|
|
|
|