Spaces:

ILAD
/

rhg-script-converter-ui

Sleeping

App Files Files Community

rhg-script-converter-ui / functions.py

micahg

spelling edits

72b7374 6 months ago

raw

history blame contribute delete

No virus

7.02 kB

	import epitran
	import re
	import string

	def to_lroh(s):
		"""Respell the IPA string `s` for the 'rhg-lroh' output script.

		The text is run through a fixed, ordered list of literal substitutions:
		consonant symbols, long vowels (marked with 'ː') expanded to doubled
		vowels, 'ɑ'/'ɔ' mapped to 'a'/'o', and a final pass over the nasalized
		vowels. Order matters because later pairs consume the output of earlier
		ones.

		Args:
			s: text to respell.

		Returns:
			The respelled string.
		"""
		substitutions = (
			# consonants -- 'j' -> 'y' has to run before 'd͡ʒ' -> 'j', otherwise
			# the freshly produced 'j' would be turned into 'y' as well
			('ɖ', 'ḍ'),
			('ɾ', 'r'),
			('ɽ', 'ṛ'),
			('ʃ', 'š'),
			('ʈ', 'ṭ'),
			('j', 'y'),
			('d͡ʒ', 'j'),
			# long vowels -- each nasalized form precedes its plain counterpart
			('ɑ̃ː', 'ɑɑ̃'),
			('ɑː', 'ɑɑ'),
			('ẽː', 'eẽ'),
			('eː', 'ee'),
			('ĩː', 'iĩ'),
			('iː', 'ii'),
			('ɔ̃ː', 'ɔɔ̃'),
			('ɔː', 'ɔɔ'),
			('ũː', 'uũ'),
			('uː', 'uu'),
			# open back vowel; the second pair can never match because every
			# 'ɑ' is already gone by then -- kept to mirror the original sequence
			('ɑ', 'a'),
			('̃ɑ', 'ã'),
			# open-mid back vowel, nasalized form first
			('ɔ̃', 'õ'),
			('ɔ', 'o'),
			# step to standardize all nasalized vowels as precomposed characters
			('ã', 'ã'),
			('ẽ', 'ẽ'),
			('ĩ', 'ĩ'),
			('õ', 'õ'),
			('ũ', 'ũ'),
		)
		for old, new in substitutions:
			s = s.replace(old, new)
		return s

	def to_roheng_old(s):
		"""Respell the IPA string `s` for the 'rhg-roheng-old' output script.

		Applies a fixed, ordered list of literal substitutions: consonant
		symbols become ASCII digraphs or single letters, long vowels (marked
		with 'ː') are doubled, and nasalized vowels are written with 'ñ'.
		Order matters because later pairs consume the output of earlier ones.

		Args:
			s: text to respell.

		Returns:
			The respelled string.
		"""
		substitutions = (
			# consonants -- 'j' -> 'y' has to run before 'd͡ʒ' -> 'j', otherwise
			# the freshly produced 'j' would be turned into 'y' as well
			('ɖ', 'dh'),
			('ɾ', 'r'),
			('ɽ', 'ç'),
			('ʃ', 'c'),
			('ʈ', 'th'),
			('j', 'y'),
			('d͡ʒ', 'j'),
			# long vowels -- each nasalized form precedes its plain counterpart
			('ɑ̃ː', 'aña'),
			('ɑː', 'aa'),
			('ẽː', 'eñe'),
			('eː', 'ee'),
			('ĩː', 'iñi'),
			('iː', 'ii'),
			('ɔ̃ː', 'ɔñɔ'),
			('ɔː', 'ɔɔ'),
			('ũː', 'uñu'),
			('uː', 'uu'),
			# open back vowel; the second pair can never match because every
			# 'ɑ' is already gone by then -- kept to mirror the original sequence
			('ɑ', 'a'),
			('̃ɑ', 'añ'),
			# open-mid back vowel, nasalized form first
			('ɔ̃', 'oñ'),
			('ɔ', 'o'),
			# remaining nasalized vowels; each is listed twice, once as a
			# composite character and once as two code points
			# NOTE(review): there is no pair for a nasalized 'o' here -- confirm
			# whether that spelling can reach this function
			('ã', 'añ'),  # composite
			('ã', 'añ'),  # two point codes
			('ẽ', 'eñ'),
			('ẽ', 'eñ'),
			('ĩ', 'iñ'),
			('ĩ', 'iñ'),
			('ũ', 'uñ'),
			('ũ', 'uñ'),
		)
		for old, new in substitutions:
			s = s.replace(old, new)
		return s

	def to_roheng(s):
		"""Respell the IPA string `s` for the 'rhg-roheng' output script.

		Three stages run in order:

		1. literal substitutions for consonants, long vowels and 'ɑ'/'ɔ';
		2. a pass that standardizes nasalized vowels (upper and lower case) as
		   precomposed characters;
		3. per-word glide insertion ('y' after i-vowels, 'w' after u-vowels
		   between vowels) plus two hard-coded spelling fixes, followed by a
		   final 'uai' -> 'uwai' rewrite over the whole string.

		Args:
			s: text to respell; words are taken to be separated by single spaces.

		Returns:
			The respelled string.
		"""
		substitutions = (
			# consonants -- 'j' -> 'y' has to run before 'd͡ʒ' -> 'j', otherwise
			# the freshly produced 'j' would be turned into 'y' as well
			('ɖ', 'ḍ'),
			('ɾ', 'r'),
			('ɽ', 'ṛ'),
			('ʃ', 'c'),
			('ʈ', 'ṭ'),
			('j', 'y'),
			('d͡ʒ', 'j'),
			# long vowels -- each nasalized form precedes its plain counterpart
			('ɑ̃ː', 'ɑɑ̃'),
			('ɑː', 'ɑɑ'),
			('ẽː', 'eẽ'),
			('eː', 'ee'),
			('ĩː', 'iĩ'),
			('iː', 'ii'),
			('ɔ̃ː', 'ɔɔ̃'),
			('ɔː', 'ɔɔ'),
			('ũː', 'uũ'),
			('uː', 'uu'),
			# open back vowel; the second pair can never match because every
			# 'ɑ' is already gone by then -- kept to mirror the original sequence
			('ɑ', 'a'),
			('̃ɑ', 'ã'),
			# open-mid back vowel, nasalized form first
			('ɔ̃', 'õ'),
			('ɔ', 'o'),
			# step to standardize all nasalized vowels as precomposed characters
			('Ã', 'Ã'),
			('Ẽ', 'Ẽ'),
			('Ĩ', 'Ĩ'),
			('Õ', 'Õ'),
			('Ũ', 'Ũ'),
			('ã', 'ã'),
			('ẽ', 'ẽ'),
			('ĩ', 'ĩ'),
			('õ', 'õ'),
			('ũ', 'ũ'),
		)
		for old, new in substitutions:
			s = s.replace(old, new)

		def respell_word(word):
			# glides/diphthongs/triphthongs
			# triphthongs: vowel + i-vowel + vowel gets a 'y', vowel + u-vowel +
			# vowel gets a 'w'
			word = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', word)
			word = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', word)
			# diphthongs/glides: i-vowel directly followed by another vowel
			word = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', word)

			# spelling errors
			# TODO: replace with dictionary to map
			bare = ''.join(ch for ch in word.strip() if ch not in string.punctuation)
			if bare == 'in':
				return word.replace('in', 'iin')
			if bare == 'hin':
				return word.replace('hin', 'hiin')
			return word

		respelled = ' '.join(respell_word(word) for word in s.split(' '))
		return respelled.replace('uai', 'uwai')

	def convert_script(input_script, output_script, input_text):
		"""Transliterate `input_text` from `input_script` into `output_script`.

		Each line is passed through Epitran (constructed with `input_script`) and
		the result is respelled by the converter selected by `output_script`.
		Words that started with an uppercase letter, optionally preceded by a
		quotation mark, are re-capitalized afterwards by word position.

		Args:
			input_script: code handed to `epitran.Epitran`; the value 'asterisk'
				additionally triggers a text clean-up pass before transliteration.
			output_script: one of 'rhg-roheng-old', 'rhg-lroh' or 'rhg-roheng'.
			input_text: text to convert; may contain several lines.

		Returns:
			The converted lines joined with newlines, with leading and trailing
			whitespace stripped.

		Raises:
			ValueError: if `output_script` is not one of the supported values.
		"""
		# echoes the raw input to stdout, as the original implementation did
		print(input_text)

		# Pick the respelling function once, up front. Without the final branch
		# an unsupported value only surfaced inside the loop as an unbound
		# local variable, after Epitran had already been built and run.
		if output_script == 'rhg-roheng-old':
			respell = to_roheng_old
		elif output_script == 'rhg-lroh':
			respell = to_lroh
		elif output_script == 'rhg-roheng':
			respell = to_roheng
		else:
			raise ValueError(
				f"unsupported output_script {output_script!r}; expected "
				"'rhg-roheng-old', 'rhg-lroh' or 'rhg-roheng'"
			)

		epi = epitran.Epitran(input_script)

		# initial steps for asterisk script
		if input_script == 'asterisk':
			# replaces non-word-initial 'R's with 'rh' for Epitran processing
			input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
			# additional step for '*' since it is treated as a word boundary
			# NOTE(review): this rewrites every remaining 'R', word-initial ones
			# included, so such words no longer count as capitalized below --
			# confirm this is intended
			input_text = input_text.replace('R', 'rh')
			# non-word-initial/final hyphens and apostrophes/single quotes
			input_text = re.sub(r'(?<=[\w*])[\’\'-](?=\w)', ' ', input_text)
			# remove word final y/h
			input_text = re.sub(r'[yh]\b', '', input_text)
			# double every single j; the last two passes collapse the runs that
			# came from an already doubled 'jj' / 'Jj' back to 'jj' / 'Jj'
			input_text = re.sub('j', 'jj', input_text)
			input_text = re.sub('J', 'Jj', input_text)
			input_text = re.sub('jjjj', 'jj', input_text)
			input_text = re.sub('jjj', 'j', input_text)

		quote_marks = ('\"', '“', '\'', '’')
		output_lines = []

		for line in input_text.split('\n'):
			# store indices for capitalized words (will assume only first letter
			# is capitalized but check for punctuation)
			source_words = line.split()
			capital_indices = [
				i for i, word in enumerate(source_words)
				if word and word[0].isupper()
			]
			capital_quote_indices = [
				i for i, word in enumerate(source_words)
				if word
				and word[0] in quote_marks
				and len(word) > 1  # check if quotation mark has a space after it
				and word[1].isupper()
			]

			words = respell(epi.transliterate(line)).split()

			# reapply capitalization; indices are only trusted while they still
			# fall inside the converted word list
			for i in capital_indices:
				if i < len(words):
					words[i] = words[i].capitalize()
			for i in capital_quote_indices:
				if i < len(words) and len(words[i]) > 1:
					words[i] = words[i][0] + words[i][1].upper() + words[i][2:]

			output_lines.append(' '.join(words))

		return '\n'.join(output_lines).strip()