import re

import epitran


# ---------------------------------------------------------------------------
# Replacement tables
#
# Every table is an ORDERED sequence of (source, target) pairs that is applied
# top to bottom with str.replace.  The order is significant:
#   * 'j' -> 'y' must run before 'd͡ʒ' -> 'j', otherwise the freshly produced
#     'j' would be turned into 'y' as well;
#   * the long-vowel rules (source ending in the length mark 'ː') must run
#     before the rules for the bare vowel symbols.
# ---------------------------------------------------------------------------

_LROH_RULES = (
    ('ɖ', 'ḍ'),
    ('ɾ', 'r'),
    ('ɽ', 'ṛ'),
    ('ʃ', 'š'),
    ('ʈ', 'ṭ'),
    ('j', 'y'),
    ('d͡ʒ', 'j'),
    ('ɑ̃ː', 'ɑɑ̃'),
    ('ɑː', 'ɑɑ'),
    ('ẽː', 'eẽ'),
    ('eː', 'ee'),
    ('ĩː', 'iĩ'),
    ('iː', 'ii'),
    ('ɔ̃ː', 'ɔɔ̃'),
    ('ɔː', 'ɔɔ'),
    ('ũː', 'uũ'),
    ('uː', 'uu'),
    ('ɑ', 'a'),
    # NOTE(review): the previous rule has already replaced every 'ɑ', so this
    # rule appears to be unreachable.  Kept as-is to preserve behaviour;
    # confirm the intended order/pattern before changing it.
    ('̃ɑ', 'ã'),
    ('ɔ̃', 'õ'),
    ('ɔ', 'o'),
)

# The 'rhg-roheng' table is the LROH table with a single difference:
# 'ʃ' is written 'c' instead of 'š'.
_ROHENG_RULES = tuple(
    (source, 'c' if source == 'ʃ' else target)
    for source, target in _LROH_RULES
)

_ROHENG_OLD_RULES = (
    ('ɖ', 'dh'),
    ('ɾ', 'r'),
    ('ɽ', 'ç'),
    ('ʃ', 'c'),
    ('ʈ', 'th'),
    ('j', 'y'),
    ('d͡ʒ', 'j'),
    ('ɑ̃ː', 'aña'),
    ('ɑː', 'aa'),
    ('ẽː', 'eñe'),
    ('eː', 'ee'),
    ('ĩː', 'iñi'),
    ('iː', 'ii'),
    ('ɔ̃ː', 'ɔñɔ'),
    ('ɔː', 'ɔɔ'),
    ('ũː', 'uñu'),
    ('uː', 'uu'),
    ('ɑ', 'a'),
    # NOTE(review): appears unreachable for the same reason as in
    # _LROH_RULES (every 'ɑ' is already gone at this point).
    ('̃ɑ', 'añ'),
    ('ɔ̃', 'oñ'),
    ('ɔ', 'o'),
    # Remaining nasal vowels.  Each vowel is listed twice because the same
    # glyph can be encoded as one precomposed code point or as base letter +
    # U+0303 COMBINING TILDE.  Escapes are used so that the two spellings
    # stay distinguishable in the source.
    ('\u00e3', 'añ'),   # ã, precomposed
    ('a\u0303', 'añ'),  # ã, two code points
    ('\u1ebd', 'eñ'),   # ẽ, precomposed
    ('e\u0303', 'eñ'),  # ẽ, two code points
    ('\u0129', 'iñ'),   # ĩ, precomposed
    ('i\u0303', 'iñ'),  # ĩ, two code points
    ('\u0169', 'uñ'),   # ũ, precomposed
    ('u\u0303', 'uñ'),  # ũ, two code points
)

# Patterns used by to_roheng() for glide insertion, compiled once.
_THREE_VOWELS = re.compile(r'[aãeẽiĩoõuũ]{3}')
_TWO_VOWELS = re.compile(r'[aãeẽiĩoõuũ]{2}')
_I_BETWEEN_VOWELS = re.compile(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])')
_U_BETWEEN_VOWELS = re.compile(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])')
_I_BEFORE_VOWEL = re.compile(r'([iĩ])([aãeẽoõuũ])')


def _apply_rules(text, rules):
    """Apply each (source, target) replacement of *rules* to *text*, in order."""
    for source, target in rules:
        text = text.replace(source, target)
    return text


def _insert_glides(word):
    """Insert the glide letters 'y' / 'w' into a single word.

    A word containing a run of three vowel characters gets 'y' after an
    i-vowel standing between two vowels and 'w' after a u-vowel standing
    between two vowels.  Otherwise, a word containing a run of two vowel
    characters gets 'y' after an i-vowel that is followed by another vowel.
    Words with no such run are returned unchanged.

    A blanket "i + vowel -> iy + vowel" substitution is deliberately not
    used: it does not work for three-vowel sequences that start with 'i'.
    """
    if _THREE_VOWELS.search(word):
        word = _I_BETWEEN_VOWELS.sub(r'\1\2y\3', word)
        return _U_BETWEEN_VOWELS.sub(r'\1\2w\3', word)
    if _TWO_VOWELS.search(word):
        return _I_BEFORE_VOWEL.sub(r'\1y\2', word)
    return word


def to_lroh(s):
    """Rewrite the phonetic symbols in *s* using the 'rhg-lroh' spelling.

    Args:
        s: Text as returned by Epitran's ``transliterate``.

    Returns:
        The text with retroflex/postalveolar consonants, long vowels and
        nasal vowels replaced according to ``_LROH_RULES``.
    """
    return _apply_rules(s, _LROH_RULES)


def to_roheng_old(s):
    """Rewrite the phonetic symbols in *s* using the 'rhg-roheng-old' spelling.

    In this spelling nasalisation is written with 'ñ' (e.g. a long nasal
    vowel becomes vowel + 'ñ' + vowel) and the retroflex stops are written
    as the digraphs 'dh' / 'th'.

    Args:
        s: Text as returned by Epitran's ``transliterate``.

    Returns:
        The converted text.
    """
    return _apply_rules(s, _ROHENG_OLD_RULES)


def to_roheng(s):
    """Rewrite the phonetic symbols in *s* using the 'rhg-roheng' spelling.

    The symbol replacements are those of :func:`to_lroh` except that 'ʃ' is
    written 'c'.  Afterwards glide letters ('y', 'w') are inserted word by
    word; see :func:`_insert_glides`.

    Args:
        s: Text as returned by Epitran's ``transliterate``.

    Returns:
        The converted text.  Words are split on single spaces only, so any
        other whitespace in *s* is preserved.
    """
    replaced = _apply_rules(s, _ROHENG_RULES)
    return ' '.join(_insert_glides(word) for word in replaced.split(' '))


# Maps the supported ``output_script`` values to their converter.
_CONVERTERS = {
    'rhg-roheng-old': to_roheng_old,
    'rhg-lroh': to_lroh,
    'rhg-roheng': to_roheng,
}


def convert_script(input_script, output_script, input_text):
    """Convert *input_text* from *input_script* to *output_script*.

    Each line is run through Epitran and then through the converter selected
    by *output_script*.  Words that started with an uppercase letter in the
    input are capitalised again in the output, matched by word position.

    Args:
        input_script: Code passed to ``epitran.Epitran``.  The value
            ``'asterisk'`` additionally triggers a pre-processing step for
            the letter 'R' (see below).
        output_script: One of ``'rhg-roheng-old'``, ``'rhg-lroh'`` or
            ``'rhg-roheng'``.
        input_text: Text to convert; may contain several lines.

    Returns:
        The converted lines joined by newlines, with leading and trailing
        whitespace stripped.  Runs of whitespace inside a line are collapsed
        to single spaces.

    Raises:
        ValueError: If *output_script* is not a supported value.
    """
    try:
        convert = _CONVERTERS[output_script]
    except KeyError:
        supported = ', '.join(sorted(_CONVERTERS))
        raise ValueError(
            f'unsupported output_script {output_script!r}; '
            f'expected one of: {supported}'
        ) from None

    epi = epitran.Epitran(input_script)

    if input_script == 'asterisk':
        # Replace every 'R' that is not word-initial with 'rh' for Epitran
        # processing.  '*' is treated as a word boundary by the regex, so an
        # 'R' directly after '*' needs its own replacement.
        input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
        input_text = input_text.replace('*R', '*rh')

    converted_lines = []
    for line in input_text.split('\n'):
        # Remember which words were capitalised (only the first letter of a
        # word is assumed to be uppercase).
        capital_indices = [
            index
            for index, word in enumerate(line.split())
            if word[0].isupper()
        ]

        words = convert(epi.transliterate(line)).split()

        # Re-apply capitalisation by position; the index guard covers the
        # case where the converted line has fewer words than the input line.
        for index in capital_indices:
            if index < len(words):
                words[index] = words[index].capitalize()

        converted_lines.append(' '.join(words))

    return '\n'.join(converted_lines).strip()


# Open issues:
#
#   * ou
#   * glides with only one vowel nasalized (i.e. is the whole glide always
#     nasalized?) (e.g. thiañ/ṭĩya) - need a consistent way to deal with
#     glides and nasalization (i.e. which vowel is nasalized?)
#   * stress