import re
import string
import unicodedata

import epitran

# COMBINING TILDE (nasalization mark).  It is always written as an escape
# here because "vowel + combining tilde" and the precomposed nasal vowel look
# identical in an editor but are different strings to str.replace()/re.
_TILDE = '\u0303'


def _precomposed(base):
    """Return the single-code-point spelling of *base* + combining tilde."""
    return unicodedata.normalize('NFC', base + _TILDE)


def _split_nasals(s, bases):
    """Rewrite precomposed nasal vowels over *bases* as base + U+0303."""
    for base in bases:
        s = s.replace(_precomposed(base), base + _TILDE)
    return s


def _join_nasals(s, bases):
    """Rewrite base + U+0303 over *bases* as precomposed nasal vowels."""
    for base in bases:
        s = s.replace(base + _TILDE, _precomposed(base))
    return s


def _replace_each(s, pairs):
    """Apply ``str.replace`` for every (old, new) pair, in order."""
    for old, new in pairs:
        s = s.replace(old, new)
    return s


# Consonant respellings.  In every table 'j' -> 'y' must come before
# 'd͡ʒ' -> 'j', otherwise the 'j' produced for the affricate would itself be
# turned into 'y'.
_LROH_CONSONANTS = (
    ('ɖ', 'ḍ'),
    ('ɾ', 'r'),
    ('ɽ', 'ṛ'),
    ('ʃ', 'š'),
    ('ʈ', 'ṭ'),
    ('j', 'y'),
    ('d͡ʒ', 'j'),
)

_ROHENG_OLD_CONSONANTS = (
    ('ɖ', 'dh'),
    ('ɾ', 'r'),
    ('ɽ', 'ç'),
    ('ʃ', 'c'),
    ('ʈ', 'th'),
    ('j', 'y'),
    ('d͡ʒ', 'j'),
)

_ROHENG_CONSONANTS = (
    ('ɖ', 'ḍ'),
    ('ɾ', 'r'),
    ('ɽ', 'ṛ'),
    ('ʃ', 'c'),
    ('ʈ', 'ṭ'),
    ('j', 'y'),
    ('d͡ʒ', 'j'),
)

# Long vowels are written as a doubled vowel; for a nasal long vowel the
# tilde stays on the second copy.  Each nasal rule precedes its oral
# counterpart, vowel by vowel.
_DOUBLED_LONG_VOWELS = tuple(
    pair
    for vowel in 'ɑeiɔu'
    for pair in (
        (vowel + _TILDE + 'ː', vowel + vowel + _TILDE),
        (vowel + 'ː', vowel + vowel),
    )
)

# Same idea for the old Roheng spelling, where nasalization is written with
# an 'ñ' between the two vowel copies.
_ROHENG_OLD_LONG_VOWELS = (
    ('ɑ' + _TILDE + 'ː', 'aña'),
    ('ɑː', 'aa'),
    ('e' + _TILDE + 'ː', 'eñe'),
    ('eː', 'ee'),
    ('i' + _TILDE + 'ː', 'iñi'),
    ('iː', 'ii'),
    ('ɔ' + _TILDE + 'ː', 'ɔñɔ'),
    ('ɔː', 'ɔɔ'),
    ('u' + _TILDE + 'ː', 'uñu'),
    ('uː', 'uu'),
)

# Short vowels of the old Roheng spelling.  'ɔ' + tilde has to be handled
# before the bare 'ɔ' -> 'o' rule; the remaining rules turn a nasal vowel
# into vowel + 'ñ'.
_ROHENG_OLD_SHORT_VOWELS = (
    ('ɑ', 'a'),
    ('ɔ' + _TILDE, 'oñ'),
    ('ɔ', 'o'),
    ('a' + _TILDE, 'añ'),
    ('e' + _TILDE, 'eñ'),
    ('i' + _TILDE, 'iñ'),
    ('u' + _TILDE, 'uñ'),
)


def _vowel_class(bases):
    """Build a regex character class of *bases* plus their nasal forms."""
    return '[' + ''.join(base + _precomposed(base) for base in bases) + ']'


_I = _vowel_class('i')
_U = _vowel_class('u')
_NOT_I = _vowel_class('aeou')
_NOT_U = _vowel_class('aeio')

# vowel + i + vowel  ->  a 'y' glide is inserted after the 'i'
_I_BETWEEN_VOWELS = re.compile(f'({_NOT_I})({_I})({_NOT_I})')
# vowel + u + vowel  ->  a 'w' glide is inserted after the 'u'
_U_BETWEEN_VOWELS = re.compile(f'({_NOT_U})({_U})({_NOT_U})')
# i + vowel          ->  a 'y' glide is inserted between them
_I_BEFORE_VOWEL = re.compile(f'({_I})({_NOT_I})')

# Whole-word respellings, keyed by the word with punctuation removed.
_SPELLING_FIXES = {'in': 'iin', 'hin': 'hiin'}


def _respell_vowels(s):
    """Double long vowels and map 'ɑ'/'ɔ' to 'a'/'o'.

    Precomposed lowercase nasal vowels are decomposed first so that the
    long-vowel rules fire for either Unicode spelling of the input.  The
    result still carries nasalization as base + U+0303; callers recompose.
    """
    s = _split_nasals(s, 'aeiou')
    s = _replace_each(s, _DOUBLED_LONG_VOWELS)
    return s.replace('ɑ', 'a').replace('ɔ', 'o')


def to_lroh(s):
    """Respell *s* for the 'rhg-lroh' output of :func:`convert_script`.

    Retroflex/postalveolar symbols become 'ḍ', 'ṛ', 'ṭ', 'š'; 'j' becomes 'y'
    and 'd͡ʒ' becomes 'j'; long vowels are doubled; 'ɑ'/'ɔ' become 'a'/'o';
    lowercase nasal vowels are returned as precomposed characters.

    Args:
        s: Text to respell (convert_script passes Epitran's transliteration).

    Returns:
        The respelled string.
    """
    s = _replace_each(s, _LROH_CONSONANTS)
    s = _respell_vowels(s)
    return _join_nasals(s, 'aeiou')


def to_roheng_old(s):
    """Respell *s* for the 'rhg-roheng-old' output of :func:`convert_script`.

    Uses digraphs ('dh', 'th') and 'ç'/'c' for consonants and writes
    nasalization with 'ñ' ('añ', 'eñe', ...) instead of a tilde.

    Args:
        s: Text to respell (convert_script passes Epitran's transliteration).

    Returns:
        The respelled string.
    """
    s = _replace_each(s, _ROHENG_OLD_CONSONANTS)
    # Accept both Unicode spellings of nasal a/e/i/u by working on the
    # decomposed one only.
    s = _split_nasals(s, 'aeiu')
    s = _replace_each(s, _ROHENG_OLD_LONG_VOWELS)
    return _replace_each(s, _ROHENG_OLD_SHORT_VOWELS)


def _fix_roheng_word(word):
    """Insert glides into one space-delimited word and fix known spellings."""
    word = _I_BETWEEN_VOWELS.sub(r'\1\2y\3', word)
    word = _U_BETWEEN_VOWELS.sub(r'\1\2w\3', word)
    word = _I_BEFORE_VOWEL.sub(r'\1y\2', word)

    # The lookup ignores punctuation, but the replacement is done on the
    # original word so surrounding punctuation is kept.
    bare = ''.join(ch for ch in word.strip() if ch not in string.punctuation)
    corrected = _SPELLING_FIXES.get(bare)
    if corrected is not None:
        word = word.replace(bare, corrected)
    return word


def to_roheng(s):
    """Respell *s* for the 'rhg-roheng' output of :func:`convert_script`.

    Consonants and vowels are handled as in :func:`to_lroh` except that 'ʃ'
    becomes 'c'.  Afterwards nasal vowels of either case are precomposed,
    glides ('y'/'w') are inserted inside vowel sequences word by word, the
    words listed in ``_SPELLING_FIXES`` are respelled, and 'uai' becomes
    'uwai'.

    Args:
        s: Text to respell (convert_script passes Epitran's transliteration).

    Returns:
        The respelled string; the single-space word separation of *s* is
        preserved.
    """
    s = _replace_each(s, _ROHENG_CONSONANTS)
    s = _respell_vowels(s)
    s = _join_nasals(s, 'AEIOUaeiou')
    s = ' '.join(_fix_roheng_word(word) for word in s.split(' '))
    return s.replace('uai', 'uwai')


# Supported values of convert_script()'s ``output_script``.
_CONVERTERS = {
    'rhg-roheng-old': to_roheng_old,
    'rhg-lroh': to_lroh,
    'rhg-roheng': to_roheng,
}

_OPENING_QUOTES = ('"', '“', "'", '’')


def _prepare_asterisk_text(text):
    """Pre-process 'asterisk' input before it is handed to Epitran."""
    # replace non-word-initial 'R's with 'rh' for Epitran processing
    text = re.sub(r'(?<=\B)R', 'rh', text)
    # additional step for '*' since it is treated as a word boundary
    text = text.replace('*R', '*rh')
    # word-internal hyphens and apostrophes/single quotes become spaces
    text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', text)
    # remove word-final y/h
    text = re.sub(r'[yh]\b', '', text)
    # Double every 'j', then collapse the runs this creates where the input
    # already had a doubled letter: 'jj' -> 'jjjj' -> 'jj', and
    # 'Jj' -> 'Jjjj' -> 'Jj'.
    text = text.replace('j', 'jj')
    text = text.replace('J', 'Jj')
    text = text.replace('jjjj', 'jj')
    text = text.replace('jjj', 'j')
    return text


def _convert_line(epi, convert, line):
    """Transliterate one line and restore the capitalization of its words."""
    # Remember which words were capitalized (only the first letter is
    # assumed to be capitalized), either directly or right after an opening
    # quotation mark.
    source_words = line.split()
    capitalized = [
        i for i, word in enumerate(source_words) if word[0].isupper()
    ]
    quoted_capitalized = [
        i
        for i, word in enumerate(source_words)
        if word[0] in _OPENING_QUOTES and len(word) > 1 and word[1].isupper()
    ]

    words = convert(epi.transliterate(line)).split()

    # Indices are matched positionally; the bounds checks cover the case
    # where conversion produced fewer words than the source line had.
    for i in capitalized:
        if i < len(words):
            words[i] = words[i].capitalize()
    for i in quoted_capitalized:
        if i < len(words) and len(words[i]) > 1:
            word = words[i]
            words[i] = word[0] + word[1].upper() + word[2:]
    return ' '.join(words)


def convert_script(input_script, output_script, input_text):
    """Convert *input_text* from *input_script* to *output_script*.

    The text is echoed to stdout, transliterated line by line with Epitran
    and respelled with the converter selected by *output_script*.  Words that
    started with a capital letter (optionally after an opening quotation
    mark) are re-capitalized in the output.

    Args:
        input_script: Code passed to ``epitran.Epitran``.  The value
            ``'asterisk'`` additionally triggers text pre-processing.
        output_script: One of ``'rhg-roheng-old'``, ``'rhg-lroh'`` or
            ``'rhg-roheng'``.
        input_text: Text to convert; lines are separated by ``'\\n'``.

    Returns:
        The converted lines joined with ``'\\n'``, with leading and trailing
        whitespace stripped.  Whitespace inside each line is collapsed to
        single spaces.

    Raises:
        ValueError: If *output_script* is not a supported name.
    """
    if output_script not in _CONVERTERS:
        supported = ', '.join(sorted(_CONVERTERS))
        raise ValueError(
            f'unsupported output_script {output_script!r}; '
            f'expected one of: {supported}'
        )
    convert = _CONVERTERS[output_script]

    print(input_text)
    epi = epitran.Epitran(input_script)

    if input_script == 'asterisk':
        input_text = _prepare_asterisk_text(input_text)

    output_lines = [
        _convert_line(epi, convert, line) for line in input_text.split('\n')
    ]
    return '\n'.join(output_lines).strip()