# Spaces: Sleeping  (page-header text captured with the source; not part of the program)
import re
import unicodedata

import epitran
def to_lroh(s):
    """Rewrite an IPA string into the 'rhg-lroh' romanization.

    Retroflex and postalveolar consonants become dotted/caron letters,
    IPA ``j`` becomes ``y``, the affricate ``d͡ʒ`` becomes ``j``, long vowels
    are doubled (a long nasal vowel keeps the tilde on the second letter),
    and ``ɑ``/``ɔ`` become ``a``/``o``.

    The input may carry nasal vowels either precomposed (``ẽ``) or as
    base + combining tilde; both are treated identically.  The result is
    returned in NFC form.

    Args:
        s: IPA text.

    Returns:
        The romanized text (NFC-normalized ``str``).
    """
    nasal = '\u0303'   # combining tilde
    length = '\u02d0'  # IPA length mark (ː)

    # Decompose so that every nasal vowel is "base letter + combining tilde";
    # the rules below then match regardless of how the caller encoded them.
    s = unicodedata.normalize('NFD', s)

    # Order matters: IPA j must become y *before* the affricate is turned
    # into the letter j, otherwise that new j would be rewritten as well.
    for ipa, roman in (
        ('\u0256', '\u1e0d'),    # ɖ -> ḍ
        ('\u027e', 'r'),         # ɾ -> r
        ('\u027d', '\u1e5b'),    # ɽ -> ṛ
        ('\u0283', '\u0161'),    # ʃ -> š
        ('\u0288', '\u1e6d'),    # ʈ -> ṭ
        ('j', 'y'),
        ('d\u0361\u0292', 'j'),  # d͡ʒ -> j
    ):
        s = s.replace(ipa, roman)

    # Long vowels are written double; for a long nasal vowel the tilde moves
    # onto the second copy (ẽː -> eẽ).
    for vowel in ('\u0251', 'e', 'i', '\u0254', 'u'):  # ɑ e i ɔ u
        s = s.replace(vowel + nasal + length, vowel + vowel + nasal)
        s = s.replace(vowel + length, vowel + vowel)

    # Any tilde following ɑ/ɔ is left in place, so ɑ̃ -> ã and ɔ̃ -> õ once
    # the string is recomposed below.
    s = s.replace('\u0251', 'a').replace('\u0254', 'o')

    return unicodedata.normalize('NFC', s)
def to_roheng_old(s):
    """Rewrite an IPA string into the older 'rhg-roheng-old' romanization.

    Consonants use digraphs (``ɖ`` -> ``dh``, ``ʈ`` -> ``th``), ``ɽ`` -> ``ç``
    and ``ʃ`` -> ``c``; IPA ``j`` becomes ``y`` and ``d͡ʒ`` becomes ``j``.
    Nasalization is spelled with ``ñ``: a short nasal vowel becomes
    vowel + ``ñ`` and a long nasal vowel becomes vowel + ``ñ`` + vowel.
    Plain long vowels are doubled and ``ɑ``/``ɔ`` become ``a``/``o``.

    Nasal vowels are accepted both precomposed and as base + combining
    tilde, for short and long vowels alike.  A nasal ``o`` that is already
    a Latin ``õ`` in the input is spelled ``oñ`` like the other vowels.
    The result is returned in NFC form.

    Args:
        s: IPA text.

    Returns:
        The romanized text (NFC-normalized ``str``).
    """
    nasal = '\u0303'   # combining tilde
    length = '\u02d0'  # IPA length mark (ː)
    enye = '\u00f1'    # ñ

    # Decompose so each nasal vowel is "base letter + combining tilde" and a
    # single rule covers both encodings of the same character.
    s = unicodedata.normalize('NFD', s)

    # Order matters: IPA j must become y *before* the affricate is turned
    # into the letter j, otherwise that new j would be rewritten as well.
    for ipa, roman in (
        ('\u0256', 'dh'),        # ɖ
        ('\u027e', 'r'),         # ɾ
        ('\u027d', '\u00e7'),    # ɽ -> ç
        ('\u0283', 'c'),         # ʃ
        ('\u0288', 'th'),        # ʈ
        ('j', 'y'),
        ('d\u0361\u0292', 'j'),  # d͡ʒ
    ):
        s = s.replace(ipa, roman)

    # Long vowels: nasal ones become V-ñ-V, plain ones are doubled.  ɑ is
    # written as 'a' straight away; ɔ is converted to 'o' further down.
    for vowel in ('\u0251', 'e', 'i', '\u0254', 'u'):  # ɑ e i ɔ u
        letter = 'a' if vowel == '\u0251' else vowel
        s = s.replace(vowel + nasal + length, letter + enye + letter)
        s = s.replace(vowel + length, letter + letter)

    s = s.replace('\u0251', 'a').replace('\u0254', 'o')

    # Remaining (short) nasal vowels: vowel + tilde -> vowel + ñ.
    for vowel in 'aeiou':
        s = s.replace(vowel + nasal, vowel + enye)

    return unicodedata.normalize('NFC', s)
# Vowel inventories used by the glide rules, spelled with precomposed (NFC)
# code points so that every vowel is exactly one character in a regex class.
_ROHENG_I = 'i\u0129'                    # i ĩ
_ROHENG_U = 'u\u0169'                    # u ũ
_ROHENG_OTHER = 'a\u00e3e\u1ebdo\u00f5'  # a ã e ẽ o õ
_ROHENG_ANY = _ROHENG_OTHER + _ROHENG_I + _ROHENG_U
_ROHENG_NOT_I = _ROHENG_OTHER + _ROHENG_U
_ROHENG_NOT_U = _ROHENG_OTHER + _ROHENG_I

_ROHENG_THREE_VOWELS = re.compile('[' + _ROHENG_ANY + ']{3}')
_ROHENG_TWO_VOWELS = re.compile('[' + _ROHENG_ANY + ']{2}')
_ROHENG_I_BETWEEN = re.compile(
    '([' + _ROHENG_NOT_I + '])([' + _ROHENG_I + '])([' + _ROHENG_NOT_I + '])')
_ROHENG_U_BETWEEN = re.compile(
    '([' + _ROHENG_NOT_U + '])([' + _ROHENG_U + '])([' + _ROHENG_NOT_U + '])')
_ROHENG_I_BEFORE_VOWEL = re.compile(
    '([' + _ROHENG_I + '])([' + _ROHENG_NOT_I + '])')


def _roheng_insert_glides(word):
    """Insert the glide letters y/w into one already-romanized word."""
    # A run of three vowels is handled differently from a run of two:
    # blindly inserting 'y' after every i that precedes a vowel does not
    # work when such a three-vowel run starts with i.
    if _ROHENG_THREE_VOWELS.search(word):
        word = _ROHENG_I_BETWEEN.sub(r'\1\2y\3', word)
        return _ROHENG_U_BETWEEN.sub(r'\1\2w\3', word)
    if _ROHENG_TWO_VOWELS.search(word):
        return _ROHENG_I_BEFORE_VOWEL.sub(r'\1y\2', word)
    return word


def to_roheng(s):
    """Rewrite an IPA string into the 'rhg-roheng' romanization.

    Consonants and vowels are mapped first (``ɖ`` -> ``ḍ``, ``ɽ`` -> ``ṛ``,
    ``ʃ`` -> ``c``, ``ʈ`` -> ``ṭ``, ``j`` -> ``y``, ``d͡ʒ`` -> ``j``, long vowels
    doubled, ``ɑ``/``ɔ`` -> ``a``/``o``).  Then, word by word (words are
    separated by single spaces), glide letters are inserted:

    * in a word containing three vowels in a row, ``y`` is inserted after an
      ``i`` standing between two other vowels and ``w`` after a ``u``
      standing between two other vowels;
    * otherwise, in a word containing two vowels in a row, ``y`` is inserted
      after every ``i`` that is followed by another vowel.

    Nasal vowels are accepted precomposed or as base + combining tilde; the
    string is composed to NFC before the glide rules run so that each nasal
    vowel counts as a single vowel.  The result is returned in NFC form.

    Args:
        s: IPA text.

    Returns:
        The romanized text (NFC-normalized ``str``).
    """
    nasal = '\u0303'   # combining tilde
    length = '\u02d0'  # IPA length mark (ː)

    # Decompose so that every nasal vowel is "base letter + combining tilde";
    # the rules below then match regardless of how the caller encoded them.
    s = unicodedata.normalize('NFD', s)

    # Order matters: IPA j must become y *before* the affricate is turned
    # into the letter j, otherwise that new j would be rewritten as well.
    for ipa, roman in (
        ('\u0256', '\u1e0d'),    # ɖ -> ḍ
        ('\u027e', 'r'),         # ɾ -> r
        ('\u027d', '\u1e5b'),    # ɽ -> ṛ
        ('\u0283', 'c'),         # ʃ -> c
        ('\u0288', '\u1e6d'),    # ʈ -> ṭ
        ('j', 'y'),
        ('d\u0361\u0292', 'j'),  # d͡ʒ -> j
    ):
        s = s.replace(ipa, roman)

    # Long vowels are written double; for a long nasal vowel the tilde moves
    # onto the second copy (ẽː -> eẽ).
    for vowel in ('\u0251', 'e', 'i', '\u0254', 'u'):  # ɑ e i ɔ u
        s = s.replace(vowel + nasal + length, vowel + vowel + nasal)
        s = s.replace(vowel + length, vowel + vowel)

    s = s.replace('\u0251', 'a').replace('\u0254', 'o')

    # Recompose: from here on every nasal vowel is one code point, which is
    # what the glide patterns' character classes rely on.
    s = unicodedata.normalize('NFC', s)

    # Glides / diphthongs / triphthongs, decided per word.  Splitting on a
    # literal space (not on arbitrary whitespace) keeps the spacing intact.
    return ' '.join(_roheng_insert_glides(word) for word in s.split(' '))
def convert_script(input_script, output_script, input_text):
    """Transliterate text through Epitran and romanize it in the chosen scheme.

    Each line of ``input_text`` is transliterated with
    ``epitran.Epitran(input_script)`` and the result is passed to the
    converter selected by ``output_script``.  Words that started with an
    upper-case letter in the input are re-capitalized (by position) in the
    output.  Words in each output line are joined by single spaces, and
    leading/trailing whitespace of the whole result is stripped.

    Args:
        input_script: Code handed to ``epitran.Epitran``.  The value
            ``'asterisk'`` additionally triggers the ``R`` -> ``rh``
            pre-processing described below.
        output_script: One of ``'rhg-roheng-old'``, ``'rhg-lroh'`` or
            ``'rhg-roheng'``.
        input_text: Text to convert; may contain several lines.

    Returns:
        The converted text, one output line per input line.

    Raises:
        ValueError: If ``output_script`` is not one of the supported values.
    """
    converters = {
        'rhg-roheng-old': to_roheng_old,
        'rhg-lroh': to_lroh,
        'rhg-roheng': to_roheng,
    }
    # Fail early and clearly instead of hitting an unbound local variable
    # on the first line of text.
    try:
        from_ipa = converters[output_script]
    except KeyError:
        raise ValueError(
            'unsupported output_script %r; expected one of %s'
            % (output_script, ', '.join(sorted(converters)))
        ) from None

    epi = epitran.Epitran(input_script)

    # Initial step to account for 'R' in the asterisk scheme: every 'R' that
    # is not at the start of a word is replaced with 'rh' before Epitran
    # sees it.
    if input_script == 'asterisk':
        input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
        # '*' counts as a word boundary for the regex above, so an 'R'
        # directly after it has to be handled separately.
        input_text = input_text.replace('*R', '*rh')

    output_lines = []
    for line in input_text.split('\n'):
        # Remember which words were capitalized (only the first letter is
        # assumed to be upper-case) so this can be restored afterwards.
        capital_indices = [
            index for index, word in enumerate(line.split())
            if word[0].isupper()
        ]

        words = from_ipa(epi.transliterate(line)).split()

        # Reapply capitalization; the index check guards against the word
        # count having changed during transliteration.
        for index in capital_indices:
            if index < len(words):
                words[index] = words[index].capitalize()

        output_lines.append(' '.join(words))

    return '\n'.join(output_lines).strip()
# Issues:
#
# ou
# glides with only one vowel nasalized (i.e., is the whole glide always nasalized?), e.g. thiañ/ṭĩya - need a consistent way to deal with glides and nasalization (i.e., which vowel is nasalized?)
# stress