Spaces:

ILAD
/

rhg-script-converter-ui

Running

App Files Community

micahg committed on Dec 14, 2023

Commit

96021c2

•

1 Parent(s): c07120f

Added script that uses asterisks for nasalization, etc.

Browse files

Files changed (7) hide show

app.py +6 -6
config.py +9 -2
epitran/data/map/asterisk.csv +29 -0
epitran/data/post/asterisk.txt +19 -0
epitran/data/pre/asterisk.txt +21 -0
functions.py +11 -4
output.txt +1 -1

app.py CHANGED Viewed

@@ -1,15 +1,15 @@
 import gradio as gr
 from functions import convert_script
-from config import scripts
-DEFAULT_INPUT_SCRIPT = list(scripts.keys())[0]
-DEFAULT_OUTPUT_SCRIPT = list(scripts.keys())[1]
 def process_text(input_script, output_script, input_text, uploaded_file=None):
     if uploaded_file is not None:
         input_text = uploaded_file.decode("utf-8")
-    output_text = convert_script(scripts[input_script], scripts[output_script], input_text)
     output_filename = "output.txt"
     with open(output_filename, "w") as file:
@@ -20,8 +20,8 @@ def process_text(input_script, output_script, input_text, uploaded_file=None):
 with gr.Blocks(title="Rohingya Script Converter") as page:
     gr.Markdown("## Rohingya Script Converter")
     with gr.Row():
-        input_script = gr.Dropdown(label="Choose the input script:", choices=scripts.keys(), value=DEFAULT_INPUT_SCRIPT)
-        output_script = gr.Dropdown(label="Choose the output script:", choices=scripts.keys(), value=DEFAULT_OUTPUT_SCRIPT)
     with gr.Row():
         input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
         output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)

 import gradio as gr
 from functions import convert_script
+from config import input_scripts, output_scripts
+DEFAULT_INPUT_SCRIPT = list(input_scripts.keys())[0]
+DEFAULT_OUTPUT_SCRIPT = list(output_scripts.keys())[1]
 def process_text(input_script, output_script, input_text, uploaded_file=None):
     if uploaded_file is not None:
         input_text = uploaded_file.decode("utf-8")
+    output_text = convert_script(input_scripts[input_script], output_scripts[output_script], input_text)
     output_filename = "output.txt"
     with open(output_filename, "w") as file:
 with gr.Blocks(title="Rohingya Script Converter") as page:
     gr.Markdown("## Rohingya Script Converter")
     with gr.Row():
+        input_script = gr.Dropdown(label="Choose the input script:", choices=input_scripts.keys(), value=DEFAULT_INPUT_SCRIPT)
+        output_script = gr.Dropdown(label="Choose the output script:", choices=output_scripts.keys(), value=DEFAULT_OUTPUT_SCRIPT)
     with gr.Row():
         input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
         output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)

config.py CHANGED Viewed

@@ -1,5 +1,12 @@
-scripts = {
 	'LearnRohingya':'rhg-lroh',
 	'Rohingyalish':'rhg-roheng',
-	'Rohingyalish (old)':'rhg-roheng-old'
 	}

+input_scripts = {
 	'LearnRohingya':'rhg-lroh',
 	'Rohingyalish':'rhg-roheng',
+	'Rohingyalish (old)':'rhg-roheng-old',
+	'Asterisk':'asterisk'
 	}
+output_scripts = {
+	'LearnRohingya':'rhg-lroh',
+	'Rohingyalish':'rhg-roheng',
+	'Rohingyalish (old)':'rhg-roheng-old'
+	}

epitran/data/map/asterisk.csv ADDED Viewed

	@@ -0,0 +1,29 @@

+Orth,Phon
+b,b
+d,d
+f,f
+g,g
+h,h
+j,d͡ʒ
+k,k
+l,l
+m,m
+n,n
+p,p
+q,q
+r,ɾ
+s,s
+t,t
+v,v
+w,w
+y,j
+z,z
+dh,ɖ
+th,ʈ
+sh,ʃ
+rh,ɽ
+a,ɑ
+e,e
+i,i
+o,ɔ
+u,u

epitran/data/post/asterisk.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+ɑ̃ɑ -> ɑ̃ː / _
+ɑɑ̃ -> ɑ̃ː / _
+ɑɑ -> ɑː / _
+ẽe -> ẽː / _
+eẽ -> ẽː / _
+ee -> eː / _
+ĩi -> ĩː / _
+iĩ -> ĩː / _
+ii -> iː / _
+ɔ̃ɔ -> ɔ̃ː / _
+ɔɔ̃ -> ɔ̃ː / _
+ɔɔ -> ɔː / _
+ũu -> ũː / _
+uũ -> ũː / _
+uu -> uː / _

epitran/data/pre/asterisk.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+::vowel:: = a|e|i|o|u
+::consonant:: = b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|y|z
+% remove stress marks
+á -> a / _
+é -> e / _
+í -> i / _
+ó -> o / _
+ú -> u / _
+' -> 0 / _
+% vowel glides
+w -> 0 / (u|u\*) _ (a|o|e)
+y -> 0 / (i|i\*) _ (a|e|o|u)
+% nasalization
+a\* -> ɑ̃ / _
+e\* -> ẽ / _
+i\* -> ĩ / _
+o\* -> ɔ̃ / _
+u\* -> ũ / _

functions.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import epitran
 def to_lroh(s):
     s = s.replace('ɖ', 'ḍ')
@@ -9,13 +10,13 @@ def to_lroh(s):
     s = s.replace('j', 'y')
     s = s.replace('d͡ʒ', 'j')
-    s = s.replace('ɑ̃ː', 'ɑ̃ɑ')
     s = s.replace('ɑː', 'ɑɑ')
     s = s.replace('ẽː', 'eẽ')
     s = s.replace('eː', 'ee')
     s = s.replace('ĩː', 'iĩ')
     s = s.replace('iː', 'ii')
-    s = s.replace('ɔ̃ː', 'ɔ̃ɔ')
     s = s.replace('ɔː', 'ɔɔ')
     s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
@@ -74,13 +75,13 @@ def to_roheng(s):
     s = s.replace('j', 'y')
     s = s.replace('d͡ʒ', 'j')
-    s = s.replace('ɑ̃ː', 'ɑ̃ɑ')
     s = s.replace('ɑː', 'ɑɑ')
     s = s.replace('ẽː', 'eẽ')
     s = s.replace('eː', 'ee')
     s = s.replace('ĩː', 'iĩ')
     s = s.replace('iː', 'ii')
-    s = s.replace('ɔ̃ː', 'ɔ̃ɔ')
     s = s.replace('ɔː', 'ɔɔ')
     s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
@@ -96,6 +97,12 @@ def to_roheng(s):
 def convert_script(input_script, output_script, input_text):
     epi = epitran.Epitran(input_script)
     # store indices for capitalized words (will assume only first letter is capitalized)
     words = input_text.split()
     capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]

 import epitran
+import re
 def to_lroh(s):
     s = s.replace('ɖ', 'ḍ')
     s = s.replace('j', 'y')
     s = s.replace('d͡ʒ', 'j')
+    s = s.replace('ɑ̃ː', 'ɑɑ̃')
     s = s.replace('ɑː', 'ɑɑ')
     s = s.replace('ẽː', 'eẽ')
     s = s.replace('eː', 'ee')
     s = s.replace('ĩː', 'iĩ')
     s = s.replace('iː', 'ii')
+    s = s.replace('ɔ̃ː', 'ɔɔ̃')
     s = s.replace('ɔː', 'ɔɔ')
     s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
     s = s.replace('j', 'y')
     s = s.replace('d͡ʒ', 'j')
+    s = s.replace('ɑ̃ː', 'ɑɑ̃')
     s = s.replace('ɑː', 'ɑɑ')
     s = s.replace('ẽː', 'eẽ')
     s = s.replace('eː', 'ee')
     s = s.replace('ĩː', 'iĩ')
     s = s.replace('iː', 'ii')
+    s = s.replace('ɔ̃ː', 'ɔɔ̃')
     s = s.replace('ɔː', 'ɔɔ')
     s = s.replace('ũː', 'uũ')
     s = s.replace('uː', 'uu')
 def convert_script(input_script, output_script, input_text):
     epi = epitran.Epitran(input_script)
+    # Initial step to account for 'R' in the asterisk script:
+    # replaces non-word-initial 'R's with 'rh' for Epitran processing.
+    if (input_script == 'asterisk'):
+        input_text = re.sub(r'(?<=\B)R', 'rh', input_text, flags=re.IGNORECASE)
+        input_text = input_text.replace('*R', '*rh')
     # store indices for capitalized words (will assume only first letter is capitalized)
     words = input_text.split()
     capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]

output.txt CHANGED Viewed

@@ -1 +1 @@

- ~~Tandil~~ ~~hodeyan~~ ~~fukorzoria~~ ~~odeyan~~ ~~tuañra~~ ~~hoi~~ ~~faro~~ ~~ne?~~ ~~Tandilor~~ ~~fuk~~ ~~okkol bicci coñrode etolla boli suke daha no zar~~. ~~Fuk~~ ~~iin~~ ~~dase~~ ~~nakortun~~ ~~mukortun~~ ~~galortun~~ ~~goli~~ ~~ore~~ ~~gā~~ ~~buture~~ ~~bari~~ ~~fecwamaze~~ ~~sai~~ ~~zagyoi.~~ E ~~fuk~~ ~~iin~~ ~~ekzonortu~~ ar ~~ekzonor~~ ~~hañse~~ ~~fara.~~ ~~Ekbar~~ ~~hacile~~ ~~ekbar añcaile tuañrar galortu nakortu kuti kuti fuk okkol bairo~~. ~~Zehon~~ ~~fua~~ ~~waye~~ ~~acaierar~~ ~~fuk~~ ~~iin~~ ~~dase~~ ~~mayafua~~ ~~mukotmade~~ ~~sai~~ ~~foje.~~ ~~Mayafuar~~ ~~mukotun~~ ~~fuk~~ ~~iin~~ ~~nakbai~~ ~~galbai~~ ~~goli~~ ~~bari~~ ~~feshwa~~ ~~maze fuk iin saizargoi. Raito maze fuk iin bariore bicci oigyoi etolla~~ boli ~~mayafua~~ ~~biaramya~~ ~~oigyoi~~ ~~ebala.~~ ~~Toile fuk iin fara mesal ibar mukormaze fuk ase bafe dojerar mayafuar mukortun fuk iin faraye bafor atot~~. ~~Toi~~ ~~fuk~~ ~~hode~~ ~~iin~~ jinic ~~okkolortuno~~ ~~fara~~. ~~Fun~~ ~~maye~~ ~~loie~~ ~~martuno~~ ~~fuk~~ ~~faragyoi. Mayafuatun fuk bariore bafmar erio faraye. Hodinbade gororgucitiañtun biaram oigioi. Sefotuno fuk~~ iin ~~fara.~~ Ho ~~sor~~ ~~hombol~~ ~~okkolotuno fuk fara~~. ~~Endoila~~ ~~gori~~ ~~fuk~~ ~~fara.~~ ~~Gororgucitiañ~~ ~~okkolotun~~ ~~fuk~~ ~~faraiore ar biaram oigyoi~~. ~~Itara~~ ~~oinor~~ ~~zonorio~~ ~~biaramyañ~~ ~~gori~~ ~~falai~~ ~~fare.~~ ~~Fukor~~ ~~babute~~ ~~zani~~ ~~fari bade itarar solaifira adot hasolot boduli falaye~~. ~~Maye~~ ~~gura~~ ~~baica~~ ~~muk~~ ~~fuñsibolla~~ ~~alada sap hor estamal gore~~. ~~Ibaye~~ ~~sabondiore~~ ho ~~sor~~ ~~hombol okkol due~~. ~~Roitormare~~ ~~fuadde~~ ~~belor~~ ~~gorome~~ ~~fuk~~ ~~more.~~ Ya ~~bafio~~ ~~sef~~ ~~falaite~~ ~~sai~~ si ~~ti fala~~. ~~Maince~~ ~~zere~~ ni ki ~~aça~~ ~~uça~~ no ~~gore~~ ~~eçe~~ ~~sef~~ ~~falar.~~ ~~Fukor~~ ~~babute~~ ~~zani~~ ~~fari~~ at ~~duio~~ ~~falaibade~~ ~~muk~~ ~~fuñse~~. Etola funormare fuk no fara ar funormare fuk nai. 
Bafe biaramyañ zerfuar muk dori bade atore sabondiore dui fala nizor muk doribar age. Itara ehon begune gonno gonno tarar hatore due. Etolla yanor zoria biaramwa bicci hom oizargyoi itaratun. Biaramyañ fuawayo fukor bafote zani fari bade ebola ite añcaile hacile nakor mukor guri de. Etola mayafua eçe fuk no fara. Etola mayafuartun gorormanco eçe fuk no fara. Itarar solarfira adot hasilot boduli falaye. Itara zodi etorika mozin solile ar manile biaram ciaram no oibo: hacile ar añcaile nakore mukore guri dio! Hato kolore sabondiore bicci bicci gori duio! Añça uça no gorede zagamare sef falayo! Cattat made homot ekbar ho sor hombol okkolore duio! Hamica hombol okkolore roitormare fua dio. Iinore manile fuk okkol no faraibo ar hono biaram ciaram no oibo.

+ Pedṛottun ekkan moṭosaiṛkol ase aaṛ hite iian sola beci foson goṛe. Uggwa insaanoṛ motoṛ kuci goṛi bolla Ameṛicar maincoṛ ḍoõr uggwa ḍai White Houseoṛ baṛkule en goṛi zoma oie, zenḍilla hitaṛa Football kela zitile kuci goitto. Aãr aãr ḍen ḍakoṛ ãṛuot nofuṛede henḍilla kissu kiṛim lagaiṛ. Aãi nosinide maya-fuaindoṛe zetobaṛ appa di hetobaṛ aãttu ṭandi-zoṛ oito boli, aãttu maya-fuaindoṛ hono baiggo notakito. Aãr ham oilde jinic in zanon. Soledde ṭaimoṛo owṛe, haṛhana hode iin ekkan ocantiṛ zagaṛ cundoijja mesal. Gaṛi gane gonḍyae ebbeṛe beci fãs kilomiṭaṛ baingil. Zaṛa soyi muhabbot goṛoia ase hitaṛa hitaṛaṛ kuciṛ hota hono-din noleke. Ceila loi aãi loi fuṛan fuãjja. Maya-fuain gaṛi soloia lagataṛ hotoṛa. Hibaie aãre kissu mozaṛ hota hoil aaṛ mozaṛ gana gail ugga biaṛaimmaṛ ḍakottu. Zodi tũi zõlot maze ballukoṛ fãzot foṛi zogoi, tuãr sobse behetoṛ toṛika oilde meṛit neṛi moṛi gio goi fan goṛon.