micahg committed on
Commit
96021c2
1 Parent(s): c07120f

Added script that uses asterisks for nasalization, etc.

Browse files
app.py CHANGED
@@ -1,15 +1,15 @@
1
  import gradio as gr
2
  from functions import convert_script
3
- from config import scripts
4
 
5
- DEFAULT_INPUT_SCRIPT = list(scripts.keys())[0]
6
- DEFAULT_OUTPUT_SCRIPT = list(scripts.keys())[1]
7
 
8
  def process_text(input_script, output_script, input_text, uploaded_file=None):
9
  if uploaded_file is not None:
10
  input_text = uploaded_file.decode("utf-8")
11
 
12
- output_text = convert_script(scripts[input_script], scripts[output_script], input_text)
13
 
14
  output_filename = "output.txt"
15
  with open(output_filename, "w") as file:
@@ -20,8 +20,8 @@ def process_text(input_script, output_script, input_text, uploaded_file=None):
20
  with gr.Blocks(title="Rohingya Script Converter") as page:
21
  gr.Markdown("## Rohingya Script Converter")
22
  with gr.Row():
23
- input_script = gr.Dropdown(label="Choose the input script:", choices=scripts.keys(), value=DEFAULT_INPUT_SCRIPT)
24
- output_script = gr.Dropdown(label="Choose the output script:", choices=scripts.keys(), value=DEFAULT_OUTPUT_SCRIPT)
25
  with gr.Row():
26
  input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
27
  output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
 
1
  import gradio as gr
2
  from functions import convert_script
3
+ from config import input_scripts, output_scripts
4
 
5
+ DEFAULT_INPUT_SCRIPT = list(input_scripts.keys())[0]
6
+ DEFAULT_OUTPUT_SCRIPT = list(output_scripts.keys())[1]
7
 
8
  def process_text(input_script, output_script, input_text, uploaded_file=None):
9
  if uploaded_file is not None:
10
  input_text = uploaded_file.decode("utf-8")
11
 
12
+ output_text = convert_script(input_scripts[input_script], output_scripts[output_script], input_text)
13
 
14
  output_filename = "output.txt"
15
  with open(output_filename, "w") as file:
 
20
  with gr.Blocks(title="Rohingya Script Converter") as page:
21
  gr.Markdown("## Rohingya Script Converter")
22
  with gr.Row():
23
+ input_script = gr.Dropdown(label="Choose the input script:", choices=input_scripts.keys(), value=DEFAULT_INPUT_SCRIPT)
24
+ output_script = gr.Dropdown(label="Choose the output script:", choices=output_scripts.keys(), value=DEFAULT_OUTPUT_SCRIPT)
25
  with gr.Row():
26
  input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
27
  output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
config.py CHANGED
@@ -1,5 +1,12 @@
1
- scripts = {
2
  'LearnRohingya':'rhg-lroh',
3
  'Rohingyalish':'rhg-roheng',
4
- 'Rohingyalish (old)':'rhg-roheng-old'
 
5
  }
 
 
 
 
 
 
 
1
+ input_scripts = {
2
  'LearnRohingya':'rhg-lroh',
3
  'Rohingyalish':'rhg-roheng',
4
+ 'Rohingyalish (old)':'rhg-roheng-old',
5
+ 'Asterisk':'asterisk'
6
  }
7
+
8
+ output_scripts = {
9
+ 'LearnRohingya':'rhg-lroh',
10
+ 'Rohingyalish':'rhg-roheng',
11
+ 'Rohingyalish (old)':'rhg-roheng-old'
12
+ }
epitran/data/map/asterisk.csv ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Orth,Phon
2
+ b,b
3
+ d,d
4
+ f,f
5
+ g,g
6
+ h,h
7
+ j,d͡ʒ
8
+ k,k
9
+ l,l
10
+ m,m
11
+ n,n
12
+ p,p
13
+ q,q
14
+ r,ɾ
15
+ s,s
16
+ t,t
17
+ v,v
18
+ w,w
19
+ y,j
20
+ z,z
21
+ dh,ɖ
22
+ th,ʈ
23
+ sh,ʃ
24
+ rh,ɽ
25
+ a,ɑ
26
+ e,e
27
+ i,i
28
+ o,ɔ
29
+ u,u
epitran/data/post/asterisk.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ɑ̃ɑ -> ɑ̃ː / _
2
+ ɑɑ̃ -> ɑ̃ː / _
3
+ ɑɑ -> ɑː / _
4
+
5
+ ẽe -> ẽː / _
6
+ eẽ -> ẽː / _
7
+ ee -> eː / _
8
+
9
+ ĩi -> ĩː / _
10
+ iĩ -> ĩː / _
11
+ ii -> iː / _
12
+
13
+ ɔ̃ɔ -> ɔ̃ː / _
14
+ ɔɔ̃ -> ɔ̃ː / _
15
+ ɔɔ -> ɔː / _
16
+
17
+ ũu -> ũː / _
18
+ uũ -> ũː / _
19
+ uu -> uː / _
epitran/data/pre/asterisk.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::vowel:: = a|e|i|o|u
2
+ ::consonant:: = b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|y|z
3
+
4
+ % remove stress marks
5
+ á -> a / _
6
+ é -> e / _
7
+ í -> i / _
8
+ ó -> o / _
9
+ ú -> u / _
10
+ ' -> 0 / _
11
+
12
+ % vowel glides
13
+ w -> 0 / (u|u\*) _ (a|o|e)
14
+ y -> 0 / (i|i\*) _ (a|e|o|u)
15
+
16
+ % nasalization
17
+ a\* -> ɑ̃ / _
18
+ e\* -> ẽ / _
19
+ i\* -> ĩ / _
20
+ o\* -> ɔ̃ / _
21
+ u\* -> ũ / _
functions.py CHANGED
@@ -1,4 +1,5 @@
1
  import epitran
 
2
 
3
  def to_lroh(s):
4
  s = s.replace('ɖ', 'ḍ')
@@ -9,13 +10,13 @@ def to_lroh(s):
9
  s = s.replace('j', 'y')
10
  s = s.replace('d͡ʒ', 'j')
11
 
12
- s = s.replace('ɑ̃ː', 'ɑ̃ɑ')
13
  s = s.replace('ɑː', 'ɑɑ')
14
  s = s.replace('ẽː', 'eẽ')
15
  s = s.replace('eː', 'ee')
16
  s = s.replace('ĩː', 'iĩ')
17
  s = s.replace('iː', 'ii')
18
- s = s.replace('ɔ̃ː', 'ɔ̃ɔ')
19
  s = s.replace('ɔː', 'ɔɔ')
20
  s = s.replace('ũː', 'uũ')
21
  s = s.replace('uː', 'uu')
@@ -74,13 +75,13 @@ def to_roheng(s):
74
  s = s.replace('j', 'y')
75
  s = s.replace('d͡ʒ', 'j')
76
 
77
- s = s.replace('ɑ̃ː', 'ɑ̃ɑ')
78
  s = s.replace('ɑː', 'ɑɑ')
79
  s = s.replace('ẽː', 'eẽ')
80
  s = s.replace('eː', 'ee')
81
  s = s.replace('ĩː', 'iĩ')
82
  s = s.replace('iː', 'ii')
83
- s = s.replace('ɔ̃ː', 'ɔ̃ɔ')
84
  s = s.replace('ɔː', 'ɔɔ')
85
  s = s.replace('ũː', 'uũ')
86
  s = s.replace('uː', 'uu')
@@ -96,6 +97,12 @@ def to_roheng(s):
96
  def convert_script(input_script, output_script, input_text):
97
  epi = epitran.Epitran(input_script)
98
 
 
 
 
 
 
 
99
  # store indices for capitalized words (will assume only first letter is capitalized)
100
  words = input_text.split()
101
  capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]
 
1
  import epitran
2
+ import re
3
 
4
  def to_lroh(s):
5
  s = s.replace('ɖ', 'ḍ')
 
10
  s = s.replace('j', 'y')
11
  s = s.replace('d͡ʒ', 'j')
12
 
13
+ s = s.replace('ɑ̃ː', 'ɑɑ̃')
14
  s = s.replace('ɑː', 'ɑɑ')
15
  s = s.replace('ẽː', 'eẽ')
16
  s = s.replace('eː', 'ee')
17
  s = s.replace('ĩː', 'iĩ')
18
  s = s.replace('iː', 'ii')
19
+ s = s.replace('ɔ̃ː', 'ɔɔ̃')
20
  s = s.replace('ɔː', 'ɔɔ')
21
  s = s.replace('ũː', 'uũ')
22
  s = s.replace('uː', 'uu')
 
75
  s = s.replace('j', 'y')
76
  s = s.replace('d͡ʒ', 'j')
77
 
78
+ s = s.replace('ɑ̃ː', 'ɑɑ̃')
79
  s = s.replace('ɑː', 'ɑɑ')
80
  s = s.replace('ẽː', 'eẽ')
81
  s = s.replace('eː', 'ee')
82
  s = s.replace('ĩː', 'iĩ')
83
  s = s.replace('iː', 'ii')
84
+ s = s.replace('ɔ̃ː', 'ɔɔ̃')
85
  s = s.replace('ɔː', 'ɔɔ')
86
  s = s.replace('ũː', 'uũ')
87
  s = s.replace('uː', 'uu')
 
97
  def convert_script(input_script, output_script, input_text):
98
  epi = epitran.Epitran(input_script)
99
 
100
+ # initial step to account for 'R' in the asterisk script -
101
+ # replaces non-word-initial 'R's with 'rh' for Epitran processing
102
+ if (input_script == 'asterisk'):
103
+ input_text = re.sub(r'(?<=\B)R', 'rh', input_text, flags=re.IGNORECASE)
104
+ input_text = input_text.replace('*R', '*rh')
105
+
106
  # store indices for capitalized words (will assume only first letter is capitalized)
107
  words = input_text.split()
108
  capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]
output.txt CHANGED
@@ -1 +1 @@
1
- Tandil hodeyan fukorzoria odeyan tuañra hoi faro ne? Tandilor fuk okkol bicci coñrode etolla boli suke daha no zar. Fuk iin dase nakortun mukortun galortun goli ore gā buture bari fecwamaze sai zagyoi. E fuk iin ekzonortu ar ekzonor hañse fara. Ekbar hacile ekbar añcaile tuañrar galortu nakortu kuti kuti fuk okkol bairo. Zehon fua waye acaierar fuk iin dase mayafua mukotmade sai foje. Mayafuar mukotun fuk iin nakbai galbai goli bari feshwa maze fuk iin saizargoi. Raito maze fuk iin bariore bicci oigyoi etolla boli mayafua biaramya oigyoi ebala. Toile fuk iin fara mesal ibar mukormaze fuk ase bafe dojerar mayafuar mukortun fuk iin faraye bafor atot. Toi fuk hode iin jinic okkolortuno fara. Fun maye loie martuno fuk faragyoi. Mayafuatun fuk bariore bafmar erio faraye. Hodinbade gororgucitiañtun biaram oigioi. Sefotuno fuk iin fara. Ho sor hombol okkolotuno fuk fara. Endoila gori fuk fara. Gororgucitiañ okkolotun fuk faraiore ar biaram oigyoi. Itara oinor zonorio biaramyañ gori falai fare. Fukor babute zani fari bade itarar solaifira adot hasolot boduli falaye. Maye gura baica muk fuñsibolla alada sap hor estamal gore. Ibaye sabondiore ho sor hombol okkol due. Roitormare fuadde belor gorome fuk more. Ya bafio sef falaite sai si ti fala. Maince zere ni ki aça uça no gore eçe sef falar. Fukor babute zani fari at duio falaibade muk fuñse. Etola funormare fuk no fara ar funormare fuk nai. Bafe biaramyañ zerfuar muk dori bade atore sabondiore dui fala nizor muk doribar age. Itara ehon begune gonno gonno tarar hatore due. Etolla yanor zoria biaramwa bicci hom oizargyoi itaratun. Biaramyañ fuawayo fukor bafote zani fari bade ebola ite añcaile hacile nakor mukor guri de. Etola mayafua eçe fuk no fara. Etola mayafuartun gorormanco eçe fuk no fara. Itarar solarfira adot hasilot boduli falaye. Itara zodi etorika mozin solile ar manile biaram ciaram no oibo: hacile ar añcaile nakore mukore guri dio! Hato kolore sabondiore bicci bicci gori duio! 
Añça uça no gorede zagamare sef falayo! Cattat made homot ekbar ho sor hombol okkolore duio! Hamica hombol okkolore roitormare fua dio. Iinore manile fuk okkol no faraibo ar hono biaram ciaram no oibo.
 
1
+ Pedṛottun ekkan moṭosaiṛkol ase aaṛ hite iian sola beci foson goṛe. Uggwa insaanoṛ motoṛ kuci goṛi bolla Ameṛicar maincoṛ ḍoõr uggwa ḍai White Houseoṛ baṛkule en goṛi zoma oie, zenḍilla hitaṛa Football kela zitile kuci goitto. Aãr aãr ḍen ḍakoṛ ãṛuot nofuṛede henḍilla kissu kiṛim lagaiṛ. Aãi nosinide maya-fuaindoṛe zetobaṛ appa di hetobaṛ aãttu ṭandi-zoṛ oito boli, aãttu maya-fuaindoṛ hono baiggo notakito. Aãr ham oilde jinic in zanon. Soledde ṭaimoṛo owṛe, haṛhana hode iin ekkan ocantiṛ zagaṛ cundoijja mesal. Gaṛi gane gonḍyae ebbeṛe beci fãs kilomiṭaṛ baingil. Zaṛa soyi muhabbot goṛoia ase hitaṛa hitaṛaṛ kuciṛ hota hono-din noleke. Ceila loi aãi loi fuṛan fuãjja. Maya-fuain gaṛi soloia lagataṛ hotoṛa. Hibaie aãre kissu mozaṛ hota hoil aaṛ mozaṛ gana gail ugga biaṛaimmaṛ ḍakottu. Zodi tũi zõlot maze ballukoṛ fãzot foṛi zogoi, tuãr sobse behetoṛ toṛika oilde meṛit neṛi moṛi gio goi fan goṛon.