# Spaces: Sleeping  (page-status text captured with the file; not part of the program)
import epitran | |
import re | |
import string | |
def to_lroh(s: str) -> str:
    """Rewrite the IPA symbols in *s* using the 'rhg-lroh' spellings.

    The rules are applied in a fixed order:

    1. consonant substitutions (retroflexes, tap, ``ʃ``, ``j`` and ``d͡ʒ``);
    2. long vowels (``Vː``) are doubled, and long nasal vowels (``Ṽː``)
       become a plain vowel followed by the nasal vowel;
    3. ``ɑ`` and ``ɔ`` are written ``a`` and ``o``;
    4. nasal vowels are emitted as single precomposed characters.

    Nasal vowels are accepted both precomposed (``ẽ``) and as a base letter
    followed by U+0303 COMBINING TILDE; the two spellings give the same
    output.

    Args:
        s: Text to convert (``convert_script`` passes Epitran output).

    Returns:
        The converted string; characters not covered by a rule are kept.
    """
    tilde = '\u0303'      # combining tilde (nasalization)
    length = '\u02d0'     # IPA length mark "ː"
    open_a = '\u0251'     # ɑ
    open_o = '\u0254'     # ɔ
    nasal_vowels = (
        ('a', '\u00e3'),  # ã
        ('e', '\u1ebd'),  # ẽ
        ('i', '\u0129'),  # ĩ
        ('o', '\u00f5'),  # õ
        ('u', '\u0169'),  # ũ
    )

    # Work on the decomposed form so that every rule below sees
    # "base + tilde" regardless of how the caller encoded the vowel.
    for base, precomposed in nasal_vowels:
        s = s.replace(precomposed, base + tilde)

    # Order matters: plain "j" must become "y" before the affricate
    # introduces new "j" characters.
    for ipa, roman in (
        ('\u0256', '\u1e0d'),       # ɖ -> ḍ
        ('\u027e', 'r'),            # ɾ -> r
        ('\u027d', '\u1e5b'),       # ɽ -> ṛ
        ('\u0283', '\u0161'),       # ʃ -> š
        ('\u0288', '\u1e6d'),       # ʈ -> ṭ
        ('j', 'y'),
        ('d\u0361\u0292', 'j'),     # d͡ʒ -> j
    ):
        s = s.replace(ipa, roman)

    # Long vowels: "Ṽː" -> "VṼ" and "Vː" -> "VV".
    for vowel in (open_a, 'e', 'i', open_o, 'u'):
        s = s.replace(vowel + tilde + length, vowel + vowel + tilde)
        s = s.replace(vowel + length, vowel + vowel)

    # Any tilde that followed ɑ/ɔ now follows a/o and is composed below.
    s = s.replace(open_a, 'a').replace(open_o, 'o')

    # Standardize all nasalized vowels as precomposed characters.
    for base, precomposed in nasal_vowels:
        s = s.replace(base + tilde, precomposed)
    return s
def to_roheng_old(s: str) -> str:
    """Rewrite the IPA symbols in *s* using the 'rhg-roheng-old' spellings.

    The rules are applied in a fixed order:

    1. consonant substitutions (``ɖ``->``dh``, ``ɽ``->``ç``, ``ʃ``->``c``,
       ``ʈ``->``th``, ``j``->``y``, ``d͡ʒ``->``j``, ...);
    2. long vowels (``Vː``) are doubled, and long nasal vowels (``Ṽː``)
       are written ``VñV``;
    3. ``ɑ`` and ``ɔ`` are written ``a`` and ``o``;
    4. every remaining nasal vowel is written as the vowel followed by
       ``ñ``.

    Nasal vowels are accepted both precomposed (``ẽ``) and as a base letter
    followed by U+0303 COMBINING TILDE. ``õ`` is treated like the other
    nasal vowels and becomes ``oñ``.

    Args:
        s: Text to convert (``convert_script`` passes Epitran output).

    Returns:
        The converted string; characters not covered by a rule are kept.
    """
    tilde = '\u0303'      # combining tilde (nasalization)
    length = '\u02d0'     # IPA length mark "ː"
    open_a = '\u0251'     # ɑ
    open_o = '\u0254'     # ɔ
    enye = '\u00f1'       # ñ
    nasal_vowels = (
        ('a', '\u00e3'),  # ã
        ('e', '\u1ebd'),  # ẽ
        ('i', '\u0129'),  # ĩ
        ('o', '\u00f5'),  # õ
        ('u', '\u0169'),  # ũ
    )

    # Decompose first so one set of rules covers both Unicode spellings.
    for base, precomposed in nasal_vowels:
        s = s.replace(precomposed, base + tilde)

    # Order matters: plain "j" must become "y" before the affricate
    # introduces new "j" characters.
    for ipa, roman in (
        ('\u0256', 'dh'),           # ɖ
        ('\u027e', 'r'),            # ɾ
        ('\u027d', '\u00e7'),       # ɽ -> ç
        ('\u0283', 'c'),            # ʃ
        ('\u0288', 'th'),           # ʈ
        ('j', 'y'),
        ('d\u0361\u0292', 'j'),     # d͡ʒ
    ):
        s = s.replace(ipa, roman)

    # Long vowels: "Ṽː" -> "VñV" and "Vː" -> "VV".
    for vowel in (open_a, 'e', 'i', open_o, 'u'):
        s = s.replace(vowel + tilde + length, vowel + enye + vowel)
        s = s.replace(vowel + length, vowel + vowel)

    # Any tilde that followed ɑ/ɔ now follows a/o and is rewritten below.
    s = s.replace(open_a, 'a').replace(open_o, 'o')

    # Short nasal vowels: the tilde is spelled out as a following "ñ".
    for base, _ in nasal_vowels:
        s = s.replace(base + tilde, base + enye)
    return s
# Glide insertion rules for to_roheng, applied in this order. The character
# classes list plain and precomposed nasal vowels (a ã e ẽ i ĩ o õ u ũ).
_ROHENG_GLIDE_RULES = (
    # vowel + i + vowel  ->  vowel + i + "y" + vowel
    (re.compile('([a\u00e3e\u1ebdo\u00f5u\u0169])([i\u0129])'
                '([a\u00e3e\u1ebdo\u00f5u\u0169])'), r'\1\2y\3'),
    # vowel + u + vowel  ->  vowel + u + "w" + vowel
    (re.compile('([a\u00e3e\u1ebdi\u0129o\u00f5])([u\u0169])'
                '([a\u00e3e\u1ebdi\u0129o\u00f5])'), r'\1\2w\3'),
    # i + vowel  ->  i + "y" + vowel
    (re.compile('([i\u0129])([a\u00e3e\u1ebdo\u00f5u\u0169])'), r'\1y\2'),
)

# Whole-word respellings applied by to_roheng (punctuation is ignored when
# matching the word, and kept in the output).
_ROHENG_SPELLING_FIXES = {'in': 'iin', 'hin': 'hiin'}


def to_roheng(s: str) -> str:
    """Rewrite the IPA symbols in *s* using the 'rhg-roheng' spellings.

    The rules are applied in a fixed order:

    1. consonant substitutions (retroflexes, tap, ``ʃ``->``c``, ``j``->``y``,
       ``d͡ʒ``->``j``);
    2. long vowels (``Vː``) are doubled, and long nasal vowels (``Ṽː``)
       become a plain vowel followed by the nasal vowel;
    3. ``ɑ`` and ``ɔ`` are written ``a`` and ``o``;
    4. upper- and lower-case nasal vowels are emitted as single precomposed
       characters;
    5. glides ``y``/``w`` are inserted inside vowel sequences;
    6. the words ``in`` and ``hin`` are respelled ``iin`` and ``hiin``;
    7. ``uai`` is written ``uwai``.

    Lower-case nasal vowels are accepted both precomposed (``ẽ``) and as a
    base letter followed by U+0303 COMBINING TILDE.

    Args:
        s: Text to convert (``convert_script`` passes Epitran output).

    Returns:
        The converted string. Words are separated on single spaces, so the
        original spacing is preserved.
    """
    tilde = '\u0303'      # combining tilde (nasalization)
    length = '\u02d0'     # IPA length mark "ː"
    open_a = '\u0251'     # ɑ
    open_o = '\u0254'     # ɔ
    lower_nasals = (
        ('a', '\u00e3'),  # ã
        ('e', '\u1ebd'),  # ẽ
        ('i', '\u0129'),  # ĩ
        ('o', '\u00f5'),  # õ
        ('u', '\u0169'),  # ũ
    )
    upper_nasals = (
        ('A', '\u00c3'),  # Ã
        ('E', '\u1ebc'),  # Ẽ
        ('I', '\u0128'),  # Ĩ
        ('O', '\u00d5'),  # Õ
        ('U', '\u0168'),  # Ũ
    )

    # Decompose first so the long-vowel rules see "base + tilde" regardless
    # of how the caller encoded the vowel.
    for base, precomposed in lower_nasals:
        s = s.replace(precomposed, base + tilde)

    # Order matters: plain "j" must become "y" before the affricate
    # introduces new "j" characters.
    for ipa, roman in (
        ('\u0256', '\u1e0d'),       # ɖ -> ḍ
        ('\u027e', 'r'),            # ɾ -> r
        ('\u027d', '\u1e5b'),       # ɽ -> ṛ
        ('\u0283', 'c'),            # ʃ -> c
        ('\u0288', '\u1e6d'),       # ʈ -> ṭ
        ('j', 'y'),
        ('d\u0361\u0292', 'j'),     # d͡ʒ -> j
    ):
        s = s.replace(ipa, roman)

    # Long vowels: "Ṽː" -> "VṼ" and "Vː" -> "VV".
    for vowel in (open_a, 'e', 'i', open_o, 'u'):
        s = s.replace(vowel + tilde + length, vowel + vowel + tilde)
        s = s.replace(vowel + length, vowel + vowel)

    # Any tilde that followed ɑ/ɔ now follows a/o and is composed below.
    s = s.replace(open_a, 'a').replace(open_o, 'o')

    # Standardize all nasalized vowels as precomposed characters so the
    # glide character classes can match them as single characters.
    for base, precomposed in upper_nasals + lower_nasals:
        s = s.replace(base + tilde, precomposed)

    # Glides / diphthongs / triphthongs. The patterns only contain vowels,
    # so a match can never span a space between two words.
    for pattern, replacement in _ROHENG_GLIDE_RULES:
        s = pattern.sub(replacement, s)

    # Spelling fixes for specific whole words.
    words = s.split(' ')
    for index, word in enumerate(words):
        bare = ''.join(ch for ch in word.strip() if ch not in string.punctuation)
        if bare in _ROHENG_SPELLING_FIXES:
            words[index] = word.replace(bare, _ROHENG_SPELLING_FIXES[bare])
    s = ' '.join(words)

    return s.replace('uai', 'uwai')
def convert_script(input_script: str, output_script: str, input_text: str) -> str:
    """Transliterate *input_text* with Epitran and respell it for *output_script*.

    Each line is passed through ``epitran.Epitran(input_script)`` and then
    through ``to_roheng_old``, ``to_lroh`` or ``to_roheng``. Words that
    started with an upper-case letter in the input (optionally after an
    opening quotation mark) are re-capitalized by position afterwards.

    Args:
        input_script: Code handed to ``epitran.Epitran``. The value
            ``'asterisk'`` additionally triggers a text clean-up pass
            before transliteration.
        output_script: One of ``'rhg-roheng-old'``, ``'rhg-lroh'`` or
            ``'rhg-roheng'``.
        input_text: Text to convert; may contain several lines.

    Returns:
        The converted lines joined with newlines, with leading and trailing
        whitespace stripped.

    Raises:
        ValueError: If *output_script* is not one of the supported values.
    """
    print(input_text)

    converters = {
        'rhg-roheng-old': to_roheng_old,
        'rhg-lroh': to_lroh,
        'rhg-roheng': to_roheng,
    }
    # Fail early with a clear message instead of hitting an unbound local
    # variable inside the per-line loop.
    if output_script not in converters:
        raise ValueError(
            f'unsupported output_script {output_script!r}; '
            f'expected one of {sorted(converters)}'
        )
    convert = converters[output_script]

    epi = epitran.Epitran(input_script)

    # initial steps for asterisk script
    if input_script == 'asterisk':
        # replaces non-word-initial 'R's with 'rh' for Epitran processing
        input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
        # additional step for '*' since it is treated as a word boundary
        input_text = input_text.replace('*R', '*rh')
        # non-word-initial/final hyphens and apostrophes/single quotes
        input_text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', input_text)
        # remove word final y/h
        input_text = re.sub(r'[yh]\b', '', input_text)
        # double every single j; the last two steps undo the extra doubling
        # of sequences that were already "jj" / "Jj" in the input
        input_text = re.sub('j', 'jj', input_text)
        input_text = re.sub('J', 'Jj', input_text)
        input_text = re.sub('jjjj', 'jj', input_text)
        input_text = re.sub('jjj', 'j', input_text)

    opening_quotes = ('\"', '“', '\'', '’')
    output_lines = []
    for line in input_text.split('\n'):
        # store indices for capitalized words (will assume only first letter
        # is capitalized but check for punctuation)
        source_words = line.split()
        capital_indices = [
            i for i, word in enumerate(source_words) if word[0].isupper()
        ]
        capital_quote_indices = [
            i for i, word in enumerate(source_words)
            if word[0] in opening_quotes
            and len(word) > 1  # check if quotation mark has a space after it
            and word[1].isupper()
        ]

        grapheme_text = epi.transliterate(line)
        words = convert(grapheme_text).split()

        # reapply capitalization; the bounds checks cover the case where the
        # converted line has fewer words than the input line
        for i in capital_indices:
            if i < len(words):
                words[i] = words[i].capitalize()
        for i in capital_quote_indices:
            if i < len(words) and len(words[i]) > 1:
                words[i] = words[i][0] + words[i][1].upper() + words[i][2:]

        output_lines.append(' '.join(words))

    return '\n'.join(output_lines).strip()