Spaces:

ILAD
/

rhg-script-converter-ui

Sleeping

App Files Community

micahg committed on Feb 5

Commit

f11b13a

•

1 Parent(s): 3f61297

added rules for asterisk script; fixed nasalized vowel combination issues

Browse files

Files changed (8) hide show

epitran/data/post/asterisk.txt +15 -15
epitran/data/post/rhg-lroh.txt +15 -15
epitran/data/post/rhg-roheng-old.txt +10 -10
epitran/data/post/rhg-roheng.txt +15 -15
epitran/data/pre/asterisk.txt +8 -2
epitran/data/pre/rhg-lroh.txt +2 -6
epitran/data/pre/rhg-roheng.txt +2 -2
functions.py +48 -22

epitran/data/post/asterisk.txt CHANGED Viewed

@@ -1,19 +1,19 @@
-ɑ̃ɑ -> ɑ̃ː / _
-ɑɑ̃ -> ɑ̃ː / _
-ɑɑ -> ɑː / _
-ẽe -> ẽː / _
-eẽ -> ẽː / _
-ee -> eː / _
-ĩi -> ĩː / _
-iĩ -> ĩː / _
-ii -> iː / _
-ɔ̃ɔ -> ɔ̃ː / _
-ɔɔ̃ -> ɔ̃ː / _
-ɔɔ -> ɔː / _
-ũu -> ũː / _
-uũ -> ũː / _
-uu -> uː / _

+%ɑ̃ɑ -> ɑ̃ː / _
+%ɑɑ̃ -> ɑ̃ː / _
+%ɑɑ -> ɑː / _
+%ẽe -> ẽː / _
+%eẽ -> ẽː / _
+%ee -> eː / _
+%ĩi -> ĩː / _
+%iĩ -> ĩː / _
+%ii -> iː / _
+%ɔ̃ɔ -> ɔ̃ː / _
+%ɔɔ̃ -> ɔ̃ː / _
+%ɔɔ -> ɔː / _
+%ũu -> ũː / _
+%uũ -> ũː / _
+%uu -> uː / _

epitran/data/post/rhg-lroh.txt CHANGED Viewed

@@ -1,19 +1,19 @@
-ɑ̃ɑ -> ɑ̃ː / _
-ɑɑ̃ -> ɑ̃ː / _
-ɑɑ -> ɑː / _
-ẽe -> ẽː / _
-eẽ -> ẽː / _
-ee -> eː / _
-ĩi -> ĩː / _
-iĩ -> ĩː / _
-ii -> iː / _
-ɔ̃ɔ -> ɔ̃ː / _
-ɔɔ̃ -> ɔ̃ː / _
-ɔɔ -> ɔː / _
-ũu -> ũː / _
-uũ -> ũː / _
-uu -> uː / _

+%ɑ̃ɑ -> ɑ̃ː / _
+%ɑɑ̃ -> ɑ̃ː / _
+%ɑɑ -> ɑː / _
+%ẽe -> ẽː / _
+%eẽ -> ẽː / _
+%ee -> eː / _
+%ĩi -> ĩː / _
+%iĩ -> ĩː / _
+%ii -> iː / _
+%ɔ̃ɔ -> ɔ̃ː / _
+%ɔɔ̃ -> ɔ̃ː / _
+%ɔɔ -> ɔː / _
+%ũu -> ũː / _
+%uũ -> ũː / _
+%uu -> uː / _

epitran/data/post/rhg-roheng-old.txt CHANGED Viewed

@@ -1,14 +1,14 @@
-ɑɑ̃ -> ɑ̃ː / _
-ɑɑ -> ɑː / _
-eẽ -> ẽː / _
-ee -> eː / _
-iĩ -> ĩː / _
-ii -> iː / _
-ɔɔ̃ -> ɔ̃ː / _
-oo -> ɔː / _
-uũ -> ũː / _
-uu -> uː / _

+%ɑɑ̃ -> ɑ̃ː / _
+%ɑɑ -> ɑː / _
+%eẽ -> ẽː / _
+%ee -> eː / _
+%iĩ -> ĩː / _
+%ii -> iː / _
+%ɔɔ̃ -> ɔ̃ː / _
+%oo -> ɔː / _
+%uũ -> ũː / _
+%uu -> uː / _

epitran/data/post/rhg-roheng.txt CHANGED Viewed

@@ -1,19 +1,19 @@
-ɑ̃ɑ -> ɑ̃ː / _
-ɑɑ̃ -> ɑ̃ː / _
-ɑɑ -> ɑː / _
-ẽe -> ẽː / _
-eẽ -> ẽː / _
-ee -> eː / _
-ĩi -> ĩː / _
-iĩ -> ĩː / _
-ii -> iː / _
-ɔ̃ɔ -> ɔ̃ː / _
-ɔɔ̃ -> ɔ̃ː / _
-ɔɔ -> ɔː / _
-ũu -> ũː / _
-uũ -> ũː / _
-uu -> uː / _

+%ɑ̃ɑ -> ɑ̃ː / _
+%ɑɑ̃ -> ɑ̃ː / _
+%ɑɑ -> ɑː / _
+%ẽe -> ẽː / _
+%eẽ -> ẽː / _
+%ee -> eː / _
+%ĩi -> ĩː / _
+%iĩ -> ĩː / _
+%ii -> iː / _
+%ɔ̃ɔ -> ɔ̃ː / _
+%ɔɔ̃ -> ɔ̃ː / _
+%ɔɔ -> ɔː / _
+%ũu -> ũː / _
+%uũ -> ũː / _
+%uu -> uː / _

epitran/data/pre/asterisk.txt CHANGED Viewed

@@ -7,7 +7,13 @@
 í -> i / _
 ó -> o / _
 ú -> u / _
-' -> 0 / _
 % vowel glides
 w -> 0 / (u|u\*) _ (a|e|i|o)
@@ -18,4 +24,4 @@ a\* -> ɑ̃ / _
 e\* -> ẽ / _
 i\* -> ĩ / _
 o\* -> ɔ̃ / _
-u\* -> ũ / _

 í -> i / _
 ó -> o / _
 ú -> u / _
+%----moved to function preprocessing
+%' -> \s / _
+%’ -> \s / _
+% drop word-final /y/ and /h/ - moved to functions as it treated # as string final, not word-final
+%(y|h) -> 0 / _ #
 % vowel glides
 w -> 0 / (u|u\*) _ (a|e|i|o)
 e\* -> ẽ / _
 i\* -> ĩ / _
 o\* -> ɔ̃ / _
+u\* -> ũ / _

epitran/data/pre/rhg-lroh.txt CHANGED Viewed

@@ -9,9 +9,5 @@
 ú -> u / _
 % vowel glides
-w -> 0 / (u|ũ) _ (a|e|i|o)
-y -> 0 / (i|ĩ) _ (a|e|o|u)
-% long vowels
-% gemination

 ú -> u / _
 % vowel glides
+w -> 0 / (u|ũ) _ (a|ã|e|ẽ|i|ĩ|o|õ)
+y -> 0 / (i|ĩ) _ (a|ã|e|ẽ|o|õ|u|ũ)

epitran/data/pre/rhg-roheng.txt CHANGED Viewed

@@ -9,5 +9,5 @@
 ú -> u / _
 % vowel glides
-w -> 0 / (u|ũ) _ (a|e|i|o)
-y -> 0 / (i|ĩ) _ (a|e|o|u)

 ú -> u / _
 % vowel glides
+w -> 0 / (u|ũ) _ (a|ã|e|ẽ|i|ĩ|o|õ)
+y -> 0 / (i|ĩ) _ (a|ã|e|ẽ|o|õ|u|ũ)

functions.py CHANGED Viewed

@@ -18,7 +18,7 @@ def to_lroh(s):
     s = s.replace('iː', 'ii')
     s = s.replace('ɔ̃ː', 'ɔɔ̃')
     s = s.replace('ɔː', 'ɔɔ')
-    s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
     s = s.replace('ɑ', 'a')
@@ -26,6 +26,13 @@ def to_lroh(s):
     s = s.replace('ɔ̃', 'õ')
     s = s.replace('ɔ', 'o')
     return s
@@ -83,7 +90,7 @@ def to_roheng(s):
     s = s.replace('iː', 'ii')
     s = s.replace('ɔ̃ː', 'ɔɔ̃')
     s = s.replace('ɔː', 'ɔɔ')
-    s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
     s = s.replace('ɑ', 'a')
@@ -95,20 +102,24 @@ def to_roheng(s):
     """
     glides/dipthongs/trithongs
     """
-    # insert 'y' after i if it is followed by any vowel
-    #s = re.sub(r'i([aãeẽoõuũ])', r'iy\1', s) ---- doesn't work if trithongs exist that start with 'i
-    #s = re.sub(r'ĩ([aãeẽoõuũ])', r'ĩy\1', s)
     words=s.split(' ')
     for i in range(len(words)):
         # trithongs
-        if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
-            words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
-            words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
         # dipthongs/glides
-        elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
-            words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
     temp_s = ' '.join(words)
@@ -120,11 +131,20 @@ def convert_script(input_script, output_script, input_text):
     epi = epitran.Epitran(input_script)
-    # initial step to account for 'R' in the asterisk step -
-    # replaces non-word initial 'R's with 'rh' for Epitran processing
     if (input_script == 'asterisk'):
         input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
         input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
     #print (input_text)
     lines = input_text.split('\n')
@@ -134,11 +154,19 @@ def convert_script(input_script, output_script, input_text):
         # store indices for capitalized words (will assume only first letter is capitalized)
         words = line.split()
         capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
         #print (capital_indices)
         grapheme_text = epi.transliterate(line)
-        #print (grapheme_text)
         if output_script == 'rhg-roheng-old':
             inter_text = to_roheng_old(grapheme_text)
@@ -154,16 +182,14 @@ def convert_script(input_script, output_script, input_text):
         for i in capital_indices:
             if i < len(words):
                 words[i] = words[i].capitalize()
         output_line = ' '.join(words)
         output_text = output_text + output_line + '\n'
     #print (output_text + '\n##################################################\n')
-    return output_text.strip()
-# Issues:
-#
-# ou
-# glides with only one vowel nasalized (i.e is the whole glide always nasalized) (.e.g thiañ/ṭĩya) - need a constant way to deal with glides and nasalization (i.e. which vowel is nasalized?)
-# stress

     s = s.replace('iː', 'ii')
     s = s.replace('ɔ̃ː', 'ɔɔ̃')
     s = s.replace('ɔː', 'ɔɔ')
+    s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
     s = s.replace('ɑ', 'a')
     s = s.replace('ɔ̃', 'õ')
     s = s.replace('ɔ', 'o')
+    # step to standardize all nasalized vowels as precomposed characters
+    s = re.sub('ã', 'ã', s)
+    s = re.sub('ẽ', 'ẽ', s)
+    s = re.sub('ĩ', 'ĩ', s)
+    s = re.sub('õ', 'õ', s)
+    s = re.sub('ũ', 'ũ', s)
     return s
     s = s.replace('iː', 'ii')
     s = s.replace('ɔ̃ː', 'ɔɔ̃')
     s = s.replace('ɔː', 'ɔɔ')
+    s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
     s = s.replace('ɑ', 'a')
     """
     glides/dipthongs/trithongs
     """
+    # step to standardize all nasalized vowels as precomposed characters
+    s = re.sub('ã', 'ã', s)
+    s = re.sub('ẽ', 'ẽ', s)
+    s = re.sub('ĩ', 'ĩ', s)
+    s = re.sub('õ', 'õ', s)
+    s = re.sub('ũ', 'ũ', s)
     words=s.split(' ')
     for i in range(len(words)):
         # trithongs
+        #if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
+        words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
+        words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
         # dipthongs/glides
+        #elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
+        words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
     temp_s = ' '.join(words)
     epi = epitran.Epitran(input_script)
+    # initial steps for asterisk script
     if (input_script == 'asterisk'):
+        # replaces non-word-initial 'R's with 'rh' for Epitran processing
         input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
         input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
+        # non-word-initial/final hyphens and apostrophes/single quotes
+        input_text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', input_text)
+        # remove word final y/h
+        input_text = re.sub(r'[yh]\b', '', input_text)
+        # double every single j
+        input_text = re.sub('j', 'jj', input_text)
+        input_text = re.sub('J', 'Jj', input_text)
+        input_text = re.sub('jjjj', 'jj', input_text)
+        input_text = re.sub('jjj', 'j', input_text)
     #print (input_text)
     lines = input_text.split('\n')
         # store indices for capitalized words (will assume only first letter is capitalized)
         words = line.split()
         capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
+        capital_quote_indices = [i for i, word in enumerate(words)
+            if word and
+            word[0] in ('\"', '“', '\'', '’') and
+            word[1].isupper()
+            ]
         #print (capital_indices)
+        #print (capital_quotes_)
+        #print (f'Before epitran: {line}')
         grapheme_text = epi.transliterate(line)
+        #print (f'After epitran: {grapheme_text}')
         if output_script == 'rhg-roheng-old':
             inter_text = to_roheng_old(grapheme_text)
         for i in capital_indices:
             if i < len(words):
                 words[i] = words[i].capitalize()
+        for i in capital_quote_indices:
+            if i < len(words):
+                if len(words[i]) > 1:
+                    words[i] = words[i][0] + words[i][1].upper() + words[i][2:]
         output_line = ' '.join(words)
         output_text = output_text + output_line + '\n'
     #print (output_text + '\n##################################################\n')
+    return output_text.strip()