# rhg-script-converter-ui / functions.py
# micahg's picture
# spelling edits
# 72b7374
import epitran
import re
import string
# Ordered (pattern, replacement) rules applied by to_lroh().  Order matters:
# several rules consume or emit characters that other rules also touch.
# Combining marks are written as explicit escapes (U+0303 COMBINING TILDE,
# U+0361 COMBINING DOUBLE INVERTED BREVE) so that the precomposed and the
# decomposed spelling of a nasal vowel, which look identical on screen,
# cannot be confused or silently normalised away by an editor.
_LROH_RULES = (
    # Compose nasalised e/i/u up front so the long-vowel rules below only
    # have to recognise one spelling of each.
    ('e\u0303', '\u1ebd'),
    ('i\u0303', '\u0129'),
    ('u\u0303', '\u0169'),
    # Consonants.
    ('ɖ', 'ḍ'),
    ('ɾ', 'r'),
    ('ɽ', 'ṛ'),
    ('ʃ', 'š'),
    ('ʈ', 'ṭ'),
    # 'j' -> 'y' must run before the affricate rule, which emits a 'j'
    # that has to survive.
    ('j', 'y'),
    ('d\u0361ʒ', 'j'),
    # Long vowels: the length mark becomes a doubled vowel; for nasal
    # vowels only the second copy keeps the tilde.  Nasal rules come
    # before their plain counterparts.
    ('ɑ\u0303ː', 'ɑɑ\u0303'),
    ('ɑː', 'ɑɑ'),
    ('\u1ebdː', 'e\u1ebd'),
    ('eː', 'ee'),
    ('\u0129ː', 'i\u0129'),
    ('iː', 'ii'),
    ('ɔ\u0303ː', 'ɔɔ\u0303'),
    ('ɔː', 'ɔɔ'),
    ('\u0169ː', 'u\u0169'),
    ('uː', 'uu'),
    # IPA vowel letters to plain Latin letters.  A tilde following 'ɑ'
    # is left in place here and composed by the final rules.  (A former
    # tilde+'ɑ' rule placed after this one could never match, because no
    # 'ɑ' is left at that point, and has been dropped.)
    ('ɑ', 'a'),
    ('ɔ\u0303', '\u00f5'),
    ('ɔ', 'o'),
    # Standardise the remaining nasal vowels as precomposed characters.
    ('a\u0303', '\u00e3'),
    ('o\u0303', '\u00f5'),
)


def to_lroh(s: str) -> str:
    """Rewrite the IPA symbols in *s* using the LROH spelling rules.

    The rules in ``_LROH_RULES`` are applied one after another as plain
    substring replacements.  Nasalised a/e/i/o/u in the result are always
    single precomposed characters, whether the input spelled them as one
    code point or as base letter + U+0303.

    Args:
        s: Text to convert (presumably Epitran output; the function only
            relies on the symbols listed in the rule table).

    Returns:
        The converted string; characters not covered by a rule are kept.
    """
    for pattern, replacement in _LROH_RULES:
        s = s.replace(pattern, replacement)
    return s
# Ordered (pattern, replacement) rules applied by to_roheng_old().  Order
# matters: nasal/long patterns must be tried before their plain
# counterparts.  Combining marks in the patterns are written as explicit
# escapes (U+0303 COMBINING TILDE, U+0361 COMBINING DOUBLE INVERTED BREVE)
# and nasal e/i/u are listed in both their precomposed and their
# decomposed spelling, since the two look identical on screen.
_ROHENG_OLD_RULES = (
    # Consonants.
    ('ɖ', 'dh'),
    ('ɾ', 'r'),
    ('ɽ', 'ç'),
    ('ʃ', 'c'),
    ('ʈ', 'th'),
    # 'j' -> 'y' must run before the affricate rule, which emits a 'j'
    # that has to survive.
    ('j', 'y'),
    ('d\u0361ʒ', 'j'),
    # Long vowels: doubled vowel, with 'ñ' between the copies when nasal.
    ('ɑ\u0303ː', 'aña'),
    ('ɑː', 'aa'),
    ('\u1ebdː', 'eñe'),
    ('e\u0303ː', 'eñe'),
    ('eː', 'ee'),
    ('\u0129ː', 'iñi'),
    ('i\u0303ː', 'iñi'),
    ('iː', 'ii'),
    ('ɔ\u0303ː', 'ɔñɔ'),
    ('ɔː', 'ɔɔ'),
    ('\u0169ː', 'uñu'),
    ('u\u0303ː', 'uñu'),
    ('uː', 'uu'),
    # IPA vowel letters to plain Latin letters.  A tilde following 'ɑ'
    # is left in place here and picked up by the 'a' + U+0303 rule below.
    # (A former tilde+'ɑ' rule placed after this one could never match
    # and has been dropped.)
    ('ɑ', 'a'),
    ('ɔ\u0303', 'oñ'),
    ('ɔ', 'o'),
    # Short nasal vowels: vowel followed by 'ñ'.  Each vowel is handled
    # as one precomposed code point and as base letter + U+0303.
    ('\u00e3', 'añ'),
    ('a\u0303', 'añ'),
    ('\u1ebd', 'eñ'),
    ('e\u0303', 'eñ'),
    ('\u0129', 'iñ'),
    ('i\u0303', 'iñ'),
    # 'o' was missing from this group; it follows the same 'oñ' mapping
    # the 'ɔ' + U+0303 rule above uses.
    ('\u00f5', 'oñ'),
    ('o\u0303', 'oñ'),
    ('\u0169', 'uñ'),
    ('u\u0303', 'uñ'),
)


def to_roheng_old(s: str) -> str:
    """Rewrite the IPA symbols in *s* using the older Rohingyalish spelling.

    The rules in ``_ROHENG_OLD_RULES`` are applied one after another as
    plain substring replacements.  Retroflex stops become digraphs
    ('dh', 'th') and nasal vowels are written as vowel + 'ñ'.

    Args:
        s: Text to convert (presumably Epitran output; the function only
            relies on the symbols listed in the rule table).

    Returns:
        The converted string; characters not covered by a rule are kept.
    """
    for pattern, replacement in _ROHENG_OLD_RULES:
        s = s.replace(pattern, replacement)
    return s
# Ordered (pattern, replacement) rules applied first by to_roheng().
# Combining marks are written as explicit escapes (U+0303 COMBINING TILDE,
# U+0361 COMBINING DOUBLE INVERTED BREVE) so that precomposed and
# decomposed nasal vowels, which look identical on screen, cannot be
# confused or silently normalised away by an editor.
_ROHENG_RULES = (
    # Compose nasalised e/i/u up front so the long-vowel rules below only
    # have to recognise one spelling of each.
    ('e\u0303', '\u1ebd'),
    ('i\u0303', '\u0129'),
    ('u\u0303', '\u0169'),
    # Consonants.
    ('ɖ', 'ḍ'),
    ('ɾ', 'r'),
    ('ɽ', 'ṛ'),
    ('ʃ', 'c'),
    ('ʈ', 'ṭ'),
    # 'j' -> 'y' must run before the affricate rule, which emits a 'j'
    # that has to survive.
    ('j', 'y'),
    ('d\u0361ʒ', 'j'),
    # Long vowels: the length mark becomes a doubled vowel; for nasal
    # vowels only the second copy keeps the tilde.
    ('ɑ\u0303ː', 'ɑɑ\u0303'),
    ('ɑː', 'ɑɑ'),
    ('\u1ebdː', 'e\u1ebd'),
    ('eː', 'ee'),
    ('\u0129ː', 'i\u0129'),
    ('iː', 'ii'),
    ('ɔ\u0303ː', 'ɔɔ\u0303'),
    ('ɔː', 'ɔɔ'),
    ('\u0169ː', 'u\u0169'),
    ('uː', 'uu'),
    # IPA vowel letters to plain Latin letters.  (A former tilde+'ɑ' rule
    # placed after 'ɑ' -> 'a' could never match and has been dropped.)
    ('ɑ', 'a'),
    ('ɔ\u0303', '\u00f5'),
    ('ɔ', 'o'),
    # Standardise all remaining nasal vowels, upper and lower case, as
    # precomposed characters so the glide patterns can treat each vowel
    # as exactly one character.
    ('A\u0303', '\u00c3'),
    ('E\u0303', '\u1ebc'),
    ('I\u0303', '\u0128'),
    ('O\u0303', '\u00d5'),
    ('U\u0303', '\u0168'),
    ('a\u0303', '\u00e3'),
    ('o\u0303', '\u00f5'),
)

# Plain + precomposed-nasal spelling of each vowel, used to build the
# character classes of the glide patterns.
_ROHENG_A = 'a\u00e3'
_ROHENG_E = 'e\u1ebd'
_ROHENG_I = 'i\u0129'
_ROHENG_O = 'o\u00f5'
_ROHENG_U = 'u\u0169'

# vowel + i + vowel: a 'y' is inserted after the i.
_ROHENG_Y_BETWEEN = re.compile(
    '([{0}])([{1}])([{0}])'.format(
        _ROHENG_A + _ROHENG_E + _ROHENG_O + _ROHENG_U, _ROHENG_I))
# vowel + u + vowel: a 'w' is inserted after the u.
_ROHENG_W_BETWEEN = re.compile(
    '([{0}])([{1}])([{0}])'.format(
        _ROHENG_A + _ROHENG_E + _ROHENG_I + _ROHENG_O, _ROHENG_U))
# i + any other vowel: a 'y' is inserted between them.
_ROHENG_Y_AFTER_I = re.compile(
    '([{0}])([{1}])'.format(
        _ROHENG_I, _ROHENG_A + _ROHENG_E + _ROHENG_O + _ROHENG_U))

# Whole-word spelling corrections, keyed by the word with ASCII
# punctuation removed.
_ROHENG_SPELLING_FIXES = {
    'in': 'iin',
    'hin': 'hiin',
}


def _roheng_fix_word(word):
    """Insert glides into one space-delimited word and fix its spelling."""
    # Three-vowel sequences first, then the remaining i + vowel pairs.
    word = _ROHENG_Y_BETWEEN.sub(r'\1\2y\3', word)
    word = _ROHENG_W_BETWEEN.sub(r'\1\2w\3', word)
    word = _ROHENG_Y_AFTER_I.sub(r'\1y\2', word)
    # Compare without surrounding whitespace and ASCII punctuation, but
    # patch the original token so quotes/commas stay attached.
    bare = ''.join(ch for ch in word.strip() if ch not in string.punctuation)
    corrected = _ROHENG_SPELLING_FIXES.get(bare)
    if corrected is not None:
        word = word.replace(bare, corrected)
    return word


def to_roheng(s: str) -> str:
    """Rewrite the IPA symbols in *s* using the Rohingyalish spelling.

    Steps, in order:
      1. apply the substring rules in ``_ROHENG_RULES`` (consonants,
         long vowels, IPA vowel letters, nasal-vowel composition);
      2. per space-delimited word, insert 'y'/'w' glides into vowel
         sequences and apply the whole-word spelling fixes;
      3. rewrite any remaining 'uai' as 'uwai'.

    Args:
        s: Text to convert (presumably Epitran output; the function only
            relies on the symbols listed in the rule table).

    Returns:
        The converted string.  Words are split and re-joined on single
        spaces, so spacing is preserved exactly.
    """
    for pattern, replacement in _ROHENG_RULES:
        s = s.replace(pattern, replacement)
    converted = ' '.join(_roheng_fix_word(word) for word in s.split(' '))
    return converted.replace('uai', 'uwai')
def _prepare_asterisk_text(text):
    """Rewrite 'asterisk'-script input into the spelling handed to Epitran."""
    # Non-word-initial 'R' becomes 'rh' for Epitran processing.
    text = re.sub(r'(?<=\B)R', 'rh', text)
    # '*' is treated as a word boundary, so '*R' needs its own rule.
    text = text.replace('*R', '*rh')
    # Hyphens and apostrophes/single quotes that sit between two word
    # characters (or after '*') become spaces.
    text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', text)
    # Remove word-final y/h.
    text = re.sub(r'[yh]\b', '', text)
    # Double every single j.  Every j is doubled first; the last two
    # steps undo the doubling for input that already had 'jj' (now
    # 'jjjj') or 'Jj' (now 'Jjjj').
    text = re.sub('j', 'jj', text)
    text = re.sub('J', 'Jj', text)
    text = re.sub('jjjj', 'jj', text)
    text = re.sub('jjj', 'j', text)
    return text


def _restore_capitals(words, capital_indices, capital_quote_indices):
    """Re-capitalise, in place, the words at the recorded positions."""
    for i in capital_indices:
        if i < len(words):
            words[i] = words[i].capitalize()
    for i in capital_quote_indices:
        # The word starts with a quotation mark: upper-case the character
        # after it and leave the rest untouched.
        if i < len(words) and len(words[i]) > 1:
            words[i] = words[i][0] + words[i][1].upper() + words[i][2:]
    return words


def convert_script(input_script: str, output_script: str, input_text: str) -> str:
    """Convert *input_text* from *input_script* to *output_script*.

    Each line is transliterated with ``epitran.Epitran(input_script)``,
    passed through the converter selected by *output_script*, and then
    has its capitalisation re-applied by word position.

    Args:
        input_script: Code passed to ``epitran.Epitran``; the value
            ``'asterisk'`` additionally triggers a pre-processing pass.
        output_script: One of ``'rhg-roheng-old'``, ``'rhg-lroh'`` or
            ``'rhg-roheng'``.
        input_text: Text to convert; may contain several lines.

    Returns:
        The converted text, one output line per input line, with words
        separated by single spaces and leading/trailing whitespace
        stripped.

    Raises:
        ValueError: If *output_script* is not a supported value
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # Built here rather than at module level so the lookup only needs the
    # converter functions to exist at call time.
    converters = {
        'rhg-roheng-old': to_roheng_old,
        'rhg-lroh': to_lroh,
        'rhg-roheng': to_roheng,
    }
    if output_script not in converters:
        raise ValueError(
            'unsupported output_script {!r}; expected one of {}'.format(
                output_script, ', '.join(sorted(converters))))
    convert = converters[output_script]

    print(input_text)
    epi = epitran.Epitran(input_script)

    # NOTE(review): the original indentation was lost; all pre-processing
    # steps are assumed to belong to the 'asterisk' branch (the hyphen rule
    # explicitly mentions '*') -- confirm against the upstream file.
    if input_script == 'asterisk':
        input_text = _prepare_asterisk_text(input_text)

    output_lines = []
    for line in input_text.split('\n'):
        # Remember which words were capitalised (assumes only the first
        # letter is upper-case), either directly or right after an
        # opening quotation mark.
        source_words = line.split()
        capital_indices = [
            i for i, word in enumerate(source_words)
            if word and word[0].isupper()
        ]
        capital_quote_indices = [
            i for i, word in enumerate(source_words)
            if word
            and word[0] in ('\"', '“', '\'', '’')
            and len(word) > 1  # quotation mark directly followed by text
            and word[1].isupper()
        ]

        converted_words = convert(epi.transliterate(line)).split()
        _restore_capitals(converted_words, capital_indices,
                          capital_quote_indices)
        output_lines.append(' '.join(converted_words))

    # Equivalent to appending line + '\n' repeatedly and stripping at the
    # end, without the repeated string concatenation.
    return '\n'.join(output_lines).strip()