rhg-script-converter-ui / functions.py
micahg's picture
file dnd fix; line output fix; vowel handling
e2e57a3
raw
history blame
5.31 kB
import re
import unicodedata

import epitran
# Ordered IPA -> 'rhg-lroh' rewrite rules.  Every pattern and replacement is
# written fully decomposed (NFD): a nasal vowel is "base letter + U+0303".
# Order matters:
#   * 'j' -> 'y' runs before the affricate rule, which itself emits 'j';
#   * long vowels are expanded before the bare 'ɑ'/'ɔ' rules, because the
#     expansions still contain 'ɑ'/'ɔ'.
_LROH_RULES = (
    # consonants
    ('ɖ', 'd\u0323'),                  # ḍ
    ('ɾ', 'r'),
    ('ɽ', 'r\u0323'),                  # ṛ
    ('ʃ', 's\u030c'),                  # š
    ('ʈ', 't\u0323'),                  # ṭ
    ('j', 'y'),
    ('d\u0361ʒ', 'j'),                 # d͡ʒ
    # long vowels: drop the length mark, double the vowel, and keep any
    # nasalisation on the second copy
    ('ɑ\u0303ː', 'ɑɑ\u0303'),
    ('ɑː', 'ɑɑ'),
    ('e\u0303ː', 'ee\u0303'),
    ('eː', 'ee'),
    ('i\u0303ː', 'ii\u0303'),
    ('iː', 'ii'),
    ('ɔ\u0303ː', 'ɔɔ\u0303'),
    ('ɔː', 'ɔɔ'),
    ('u\u0303ː', 'uu\u0303'),
    ('uː', 'uu'),
    # vowel quality: a following U+0303 stays attached, so the nasal
    # variants need no rule of their own (ɑ̃ -> ã, ɔ̃ -> õ)
    ('ɑ', 'a'),
    ('ɔ', 'o'),
)


def to_lroh(s):
    """Rewrite an IPA string into the 'rhg-lroh' romanisation.

    The input is normalised to NFD first so that nasal vowels match
    whether they arrive precomposed (e.g. U+1EBD) or as base letter plus
    combining tilde.  The previous chain of ``str.replace`` calls only
    matched whichever form its literals happened to be saved in, and its
    dedicated nasal-ɑ rule could never fire because every 'ɑ' had already
    been replaced by the time it ran.

    Args:
        s: IPA text (in this file, the output of ``Epitran.transliterate``).

    Returns:
        The romanised text, normalised to NFC.
    """
    s = unicodedata.normalize('NFD', s)
    for ipa, roman in _LROH_RULES:
        s = s.replace(ipa, roman)
    return unicodedata.normalize('NFC', s)
# Ordered IPA -> 'rhg-roheng-old' rewrite rules, written fully decomposed
# (NFD): a nasal vowel is "base letter + U+0303", and 'ñ' is 'n' + U+0303.
# Order matters:
#   * 'j' -> 'y' runs before the affricate rule, which itself emits 'j';
#   * long vowels are expanded before the short-vowel rules;
#   * 'ɑ' -> 'a' runs before the short nasal rules so that ɑ̃ is picked up
#     by the 'a' + U+0303 rule.
_ROHENG_OLD_RULES = (
    # consonants
    ('ɖ', 'dh'),
    ('ɾ', 'r'),
    ('ɽ', 'c\u0327'),                  # ç
    ('ʃ', 'c'),
    ('ʈ', 'th'),
    ('j', 'y'),
    ('d\u0361ʒ', 'j'),                 # d͡ʒ
    # long vowels: nasal ones become V + ñ + V, plain ones are doubled
    ('ɑ\u0303ː', 'an\u0303a'),
    ('ɑː', 'aa'),
    ('e\u0303ː', 'en\u0303e'),
    ('eː', 'ee'),
    ('i\u0303ː', 'in\u0303i'),
    ('iː', 'ii'),
    ('ɔ\u0303ː', 'ɔn\u0303ɔ'),         # the 'ɔ's become 'o' further down
    ('ɔː', 'ɔɔ'),
    ('u\u0303ː', 'un\u0303u'),
    ('uː', 'uu'),
    # short vowels
    ('ɑ', 'a'),
    ('ɔ\u0303', 'on\u0303'),
    ('ɔ', 'o'),
    # short nasal vowels become V + ñ
    ('a\u0303', 'an\u0303'),
    ('e\u0303', 'en\u0303'),
    ('i\u0303', 'in\u0303'),
    ('u\u0303', 'un\u0303'),
)


def to_roheng_old(s):
    """Rewrite an IPA string into the 'rhg-roheng-old' romanisation.

    The input is normalised to NFD first, so a single rule per nasal
    vowel covers both the precomposed and the "two code point" spelling.
    The previous implementation duplicated the short-vowel rules for the
    two spellings but not the long-vowel ones, so a long nasal vowel in
    the other spelling kept its length mark.  It also carried a nasal-ɑ
    rule that could never match because every 'ɑ' had already been
    replaced.

    Args:
        s: IPA text (in this file, the output of ``Epitran.transliterate``).

    Returns:
        The romanised text, normalised to NFC.
    """
    s = unicodedata.normalize('NFD', s)
    for ipa, roman in _ROHENG_OLD_RULES:
        s = s.replace(ipa, roman)
    return unicodedata.normalize('NFC', s)
# Ordered IPA -> 'rhg-roheng' rewrite rules, written fully decomposed (NFD):
# a nasal vowel is "base letter + U+0303".  Order matters:
#   * 'j' -> 'y' runs before the affricate rule, which itself emits 'j';
#   * long vowels are expanded before the bare 'ɑ'/'ɔ' rules, because the
#     expansions still contain 'ɑ'/'ɔ'.
_ROHENG_RULES = (
    # consonants
    ('ɖ', 'd\u0323'),                  # ḍ
    ('ɾ', 'r'),
    ('ɽ', 'r\u0323'),                  # ṛ
    ('ʃ', 'c'),
    ('ʈ', 't\u0323'),                  # ṭ
    ('j', 'y'),
    ('d\u0361ʒ', 'j'),                 # d͡ʒ
    # long vowels: drop the length mark, double the vowel, and keep any
    # nasalisation on the second copy
    ('ɑ\u0303ː', 'ɑɑ\u0303'),
    ('ɑː', 'ɑɑ'),
    ('e\u0303ː', 'ee\u0303'),
    ('eː', 'ee'),
    ('i\u0303ː', 'ii\u0303'),
    ('iː', 'ii'),
    ('ɔ\u0303ː', 'ɔɔ\u0303'),
    ('ɔː', 'ɔɔ'),
    ('u\u0303ː', 'uu\u0303'),
    ('uː', 'uu'),
    # vowel quality: a following U+0303 stays attached (ɑ̃ -> ã, ɔ̃ -> õ)
    ('ɑ', 'a'),
    ('ɔ', 'o'),
)

# Vowel inventories for glide insertion, as precomposed (NFC) code points so
# that each vowel is exactly one member of a regex character class.
_ROHENG_I = 'i\u0129'                            # i ĩ
_ROHENG_U = 'u\u0169'                            # u ũ
_ROHENG_NOT_I = 'a\u00e3e\u1ebdo\u00f5u\u0169'   # a ã e ẽ o õ u ũ
_ROHENG_NOT_U = 'a\u00e3e\u1ebdi\u0129o\u00f5'   # a ã e ẽ i ĩ o õ
_ROHENG_VOWELS = _ROHENG_NOT_I + _ROHENG_I

_ROHENG_RUN3 = re.compile('[' + _ROHENG_VOWELS + ']{3}')
_ROHENG_RUN2 = re.compile('[' + _ROHENG_VOWELS + ']{2}')
_ROHENG_I_MEDIAL = re.compile(
    '([' + _ROHENG_NOT_I + '])([' + _ROHENG_I + '])([' + _ROHENG_NOT_I + '])')
_ROHENG_U_MEDIAL = re.compile(
    '([' + _ROHENG_NOT_U + '])([' + _ROHENG_U + '])([' + _ROHENG_NOT_U + '])')
_ROHENG_I_ONSET = re.compile(
    '([' + _ROHENG_I + '])([' + _ROHENG_NOT_I + '])')


def _roheng_insert_glides(word):
    """Insert the glide letters 'y'/'w' into one space-delimited word."""
    # Triphthongs: a glide goes after a medial i/ĩ ('y') or u/ũ ('w').
    if _ROHENG_RUN3.search(word):
        word = _ROHENG_I_MEDIAL.sub(r'\1\2y\3', word)
        return _ROHENG_U_MEDIAL.sub(r'\1\2w\3', word)
    # Diphthongs: 'y' goes after i/ĩ when another vowel follows.  This is not
    # applied to words containing a triphthong, because a blanket
    # "i + vowel" rule breaks triphthongs that start with 'i'.
    if _ROHENG_RUN2.search(word):
        return _ROHENG_I_ONSET.sub(r'\1y\2', word)
    return word


def to_roheng(s):
    """Rewrite an IPA string into the 'rhg-roheng' romanisation.

    Two passes are applied:

    1. The ordered segment rules in ``_ROHENG_RULES``, run on NFD text so
       nasal vowels match in either Unicode spelling.
    2. Per-word glide insertion, run on NFC text.  The character classes
       treat each vowel as one code point.  Previously nasal 'a' reached
       this step as 'a' + U+0303, so a word such as 'ãua' was not
       recognised as a three-vowel run.

    Words are delimited by single spaces and re-joined with single
    spaces, so the spacing of the input is preserved.

    Args:
        s: IPA text (in this file, the output of ``Epitran.transliterate``).

    Returns:
        The romanised text, normalised to NFC.
    """
    s = unicodedata.normalize('NFD', s)
    for ipa, roman in _ROHENG_RULES:
        s = s.replace(ipa, roman)
    s = unicodedata.normalize('NFC', s)
    return ' '.join(_roheng_insert_glides(word) for word in s.split(' '))
def convert_script(input_script, output_script, input_text):
    """Transliterate ``input_text`` line by line into ``output_script``.

    Each line is passed through Epitran and then through the converter
    selected by ``output_script``.  Words that started with an upper-case
    letter in the input are re-capitalised in the output by position.

    Args:
        input_script: Code passed to ``epitran.Epitran``.  The value
            'asterisk' additionally triggers the 'R' pre-processing below.
        output_script: One of 'rhg-roheng-old', 'rhg-lroh' or 'rhg-roheng'.
        input_text: Text to convert; may contain several lines.

    Returns:
        The converted lines joined with newlines, with leading and trailing
        whitespace stripped.  Runs of whitespace inside a line collapse to
        single spaces.

    Raises:
        ValueError: If ``output_script`` is not a supported value.
            Previously this surfaced as an UnboundLocalError from inside
            the line loop.
    """
    converters = {
        'rhg-roheng-old': to_roheng_old,
        'rhg-lroh': to_lroh,
        'rhg-roheng': to_roheng,
    }
    try:
        convert = converters[output_script]
    except KeyError:
        raise ValueError(
            'unsupported output_script: %r' % (output_script,)) from None

    epi = epitran.Epitran(input_script)

    # Initial step to account for 'R' in the asterisk script: replace
    # non-word-initial 'R's with 'rh' for Epitran processing.
    if input_script == 'asterisk':
        input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
        # '*' is treated as a word boundary, so handle '*R' separately.
        input_text = input_text.replace('*R', '*rh')

    output_lines = []
    for line in input_text.split('\n'):
        # Remember which words were capitalised (only the first letter is
        # assumed to be upper-case).
        capital_indices = [
            index for index, word in enumerate(line.split())
            if word[0].isupper()
        ]

        words = convert(epi.transliterate(line)).split()

        # Reapply capitalisation by word position.  The bounds check covers
        # the case where transliteration yields fewer words than the input.
        # Note that str.capitalize() also lower-cases the rest of the word.
        for index in capital_indices:
            if index < len(words):
                words[index] = words[index].capitalize()

        output_lines.append(' '.join(words))

    return '\n'.join(output_lines).strip()
# Issues:
#
# ou
# glides with only one vowel nasalized (i.e., is the whole glide always nasalized?) (e.g. thiañ/ṭĩya) - need a consistent way to deal with glides and nasalization (i.e., which vowel is nasalized?)
# stress