Spaces:
Running
Running
Added script that uses asterisks for nasalization, etc.
Browse files
- app.py +6 -6
- config.py +9 -2
- epitran/data/map/asterisk.csv +29 -0
- epitran/data/post/asterisk.txt +19 -0
- epitran/data/pre/asterisk.txt +21 -0
- functions.py +11 -4
- output.txt +1 -1
app.py
CHANGED
@@ -1,15 +1,15 @@
|
|
1 |
import gradio as gr
|
2 |
from functions import convert_script
|
3 |
-
from config import
|
4 |
|
5 |
-
DEFAULT_INPUT_SCRIPT = list(
|
6 |
-
DEFAULT_OUTPUT_SCRIPT = list(
|
7 |
|
8 |
def process_text(input_script, output_script, input_text, uploaded_file=None):
|
9 |
if uploaded_file is not None:
|
10 |
input_text = uploaded_file.decode("utf-8")
|
11 |
|
12 |
-
output_text = convert_script(
|
13 |
|
14 |
output_filename = "output.txt"
|
15 |
with open(output_filename, "w") as file:
|
@@ -20,8 +20,8 @@ def process_text(input_script, output_script, input_text, uploaded_file=None):
|
|
20 |
with gr.Blocks(title="Rohingya Script Converter") as page:
|
21 |
gr.Markdown("## Rohingya Script Converter")
|
22 |
with gr.Row():
|
23 |
-
input_script = gr.Dropdown(label="Choose the input script:", choices=
|
24 |
-
output_script = gr.Dropdown(label="Choose the output script:", choices=
|
25 |
with gr.Row():
|
26 |
input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
|
27 |
output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
|
|
|
1 |
import gradio as gr
|
2 |
from functions import convert_script
|
3 |
+
from config import input_scripts, output_scripts
|
4 |
|
5 |
+
DEFAULT_INPUT_SCRIPT = list(input_scripts.keys())[0]
|
6 |
+
DEFAULT_OUTPUT_SCRIPT = list(output_scripts.keys())[1]
|
7 |
|
8 |
def process_text(input_script, output_script, input_text, uploaded_file=None):
|
9 |
if uploaded_file is not None:
|
10 |
input_text = uploaded_file.decode("utf-8")
|
11 |
|
12 |
+
output_text = convert_script(input_scripts[input_script], output_scripts[output_script], input_text)
|
13 |
|
14 |
output_filename = "output.txt"
|
15 |
with open(output_filename, "w") as file:
|
|
|
20 |
with gr.Blocks(title="Rohingya Script Converter") as page:
|
21 |
gr.Markdown("## Rohingya Script Converter")
|
22 |
with gr.Row():
|
23 |
+
input_script = gr.Dropdown(label="Choose the input script:", choices=input_scripts.keys(), value=DEFAULT_INPUT_SCRIPT)
|
24 |
+
output_script = gr.Dropdown(label="Choose the output script:", choices=output_scripts.keys(), value=DEFAULT_OUTPUT_SCRIPT)
|
25 |
with gr.Row():
|
26 |
input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
|
27 |
output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
|
config.py
CHANGED
@@ -1,5 +1,12 @@
|
|
1 |
-
|
2 |
'LearnRohingya':'rhg-lroh',
|
3 |
'Rohingyalish':'rhg-roheng',
|
4 |
-
'Rohingyalish (old)':'rhg-roheng-old'
|
|
|
5 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_scripts = {
|
2 |
'LearnRohingya':'rhg-lroh',
|
3 |
'Rohingyalish':'rhg-roheng',
|
4 |
+
'Rohingyalish (old)':'rhg-roheng-old',
|
5 |
+
'Asterisk':'asterisk'
|
6 |
}
|
7 |
+
|
8 |
+
output_scripts = {
|
9 |
+
'LearnRohingya':'rhg-lroh',
|
10 |
+
'Rohingyalish':'rhg-roheng',
|
11 |
+
'Rohingyalish (old)':'rhg-roheng-old'
|
12 |
+
}
|
epitran/data/map/asterisk.csv
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Orth,Phon
|
2 |
+
b,b
|
3 |
+
d,d
|
4 |
+
f,f
|
5 |
+
g,g
|
6 |
+
h,h
|
7 |
+
j,d͡ʒ
|
8 |
+
k,k
|
9 |
+
l,l
|
10 |
+
m,m
|
11 |
+
n,n
|
12 |
+
p,p
|
13 |
+
q,q
|
14 |
+
r,ɾ
|
15 |
+
s,s
|
16 |
+
t,t
|
17 |
+
v,v
|
18 |
+
w,w
|
19 |
+
y,j
|
20 |
+
z,z
|
21 |
+
dh,ɖ
|
22 |
+
th,ʈ
|
23 |
+
sh,ʃ
|
24 |
+
rh,ɽ
|
25 |
+
a,ɑ
|
26 |
+
e,e
|
27 |
+
i,i
|
28 |
+
o,ɔ
|
29 |
+
u,u
|
epitran/data/post/asterisk.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ɑ̃ɑ -> ɑ̃ː / _
|
2 |
+
ɑɑ̃ -> ɑ̃ː / _
|
3 |
+
ɑɑ -> ɑː / _
|
4 |
+
|
5 |
+
ẽe -> ẽː / _
|
6 |
+
eẽ -> ẽː / _
|
7 |
+
ee -> eː / _
|
8 |
+
|
9 |
+
ĩi -> ĩː / _
|
10 |
+
iĩ -> ĩː / _
|
11 |
+
ii -> iː / _
|
12 |
+
|
13 |
+
ɔ̃ɔ -> ɔ̃ː / _
|
14 |
+
ɔɔ̃ -> ɔ̃ː / _
|
15 |
+
ɔɔ -> ɔː / _
|
16 |
+
|
17 |
+
ũu -> ũː / _
|
18 |
+
uũ -> ũː / _
|
19 |
+
uu -> uː / _
|
epitran/data/pre/asterisk.txt
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::vowel:: = a|e|i|o|u
|
2 |
+
::consonant:: = b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|y|z
|
3 |
+
|
4 |
+
% remove stress marks
|
5 |
+
á -> a / _
|
6 |
+
é -> e / _
|
7 |
+
í -> i / _
|
8 |
+
ó -> o / _
|
9 |
+
ú -> u / _
|
10 |
+
' -> 0 / _
|
11 |
+
|
12 |
+
% vowel glides
|
13 |
+
w -> 0 / (u|u\*) _ (a|o|e)
|
14 |
+
y -> 0 / (i|i\*) _ (a|e|o|u)
|
15 |
+
|
16 |
+
% nasalization
|
17 |
+
a\* -> ɑ̃ / _
|
18 |
+
e\* -> ẽ / _
|
19 |
+
i\* -> ĩ / _
|
20 |
+
o\* -> ɔ̃ / _
|
21 |
+
u\* -> ũ / _
|
functions.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import epitran
|
|
|
2 |
|
3 |
def to_lroh(s):
|
4 |
s = s.replace('ɖ', 'ḍ')
|
@@ -9,13 +10,13 @@ def to_lroh(s):
|
|
9 |
s = s.replace('j', 'y')
|
10 |
s = s.replace('d͡ʒ', 'j')
|
11 |
|
12 |
-
s = s.replace('ɑ̃ː', '
|
13 |
s = s.replace('ɑː', 'ɑɑ')
|
14 |
s = s.replace('ẽː', 'eẽ')
|
15 |
s = s.replace('eː', 'ee')
|
16 |
s = s.replace('ĩː', 'iĩ')
|
17 |
s = s.replace('iː', 'ii')
|
18 |
-
s = s.replace('ɔ̃ː', '
|
19 |
s = s.replace('ɔː', 'ɔɔ')
|
20 |
s = s.replace('ũː', 'uũ')
|
21 |
s = s.replace('uː', 'uu')
|
@@ -74,13 +75,13 @@ def to_roheng(s):
|
|
74 |
s = s.replace('j', 'y')
|
75 |
s = s.replace('d͡ʒ', 'j')
|
76 |
|
77 |
-
s = s.replace('ɑ̃ː', '
|
78 |
s = s.replace('ɑː', 'ɑɑ')
|
79 |
s = s.replace('ẽː', 'eẽ')
|
80 |
s = s.replace('eː', 'ee')
|
81 |
s = s.replace('ĩː', 'iĩ')
|
82 |
s = s.replace('iː', 'ii')
|
83 |
-
s = s.replace('ɔ̃ː', '
|
84 |
s = s.replace('ɔː', 'ɔɔ')
|
85 |
s = s.replace('ũː', 'uũ')
|
86 |
s = s.replace('uː', 'uu')
|
@@ -96,6 +97,12 @@ def to_roheng(s):
|
|
96 |
def convert_script(input_script, output_script, input_text):
|
97 |
epi = epitran.Epitran(input_script)
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
# store indices for capitalized words (will assume only first letter is capitalized)
|
100 |
words = input_text.split()
|
101 |
capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]
|
|
|
1 |
import epitran
|
2 |
+
import re
|
3 |
|
4 |
def to_lroh(s):
|
5 |
s = s.replace('ɖ', 'ḍ')
|
|
|
10 |
s = s.replace('j', 'y')
|
11 |
s = s.replace('d͡ʒ', 'j')
|
12 |
|
13 |
+
s = s.replace('ɑ̃ː', 'ɑɑ̃')
|
14 |
s = s.replace('ɑː', 'ɑɑ')
|
15 |
s = s.replace('ẽː', 'eẽ')
|
16 |
s = s.replace('eː', 'ee')
|
17 |
s = s.replace('ĩː', 'iĩ')
|
18 |
s = s.replace('iː', 'ii')
|
19 |
+
s = s.replace('ɔ̃ː', 'ɔɔ̃')
|
20 |
s = s.replace('ɔː', 'ɔɔ')
|
21 |
s = s.replace('ũː', 'uũ')
|
22 |
s = s.replace('uː', 'uu')
|
|
|
75 |
s = s.replace('j', 'y')
|
76 |
s = s.replace('d͡ʒ', 'j')
|
77 |
|
78 |
+
s = s.replace('ɑ̃ː', 'ɑɑ̃')
|
79 |
s = s.replace('ɑː', 'ɑɑ')
|
80 |
s = s.replace('ẽː', 'eẽ')
|
81 |
s = s.replace('eː', 'ee')
|
82 |
s = s.replace('ĩː', 'iĩ')
|
83 |
s = s.replace('iː', 'ii')
|
84 |
+
s = s.replace('ɔ̃ː', 'ɔɔ̃')
|
85 |
s = s.replace('ɔː', 'ɔɔ')
|
86 |
s = s.replace('ũː', 'uũ')
|
87 |
s = s.replace('uː', 'uu')
|
|
|
97 |
def convert_script(input_script, output_script, input_text):
|
98 |
epi = epitran.Epitran(input_script)
|
99 |
|
100 |
+
# initial step to account for 'R' in the asterisk step -
|
101 |
+
# replaces non-word-initial 'R's (matched case-insensitively, so lowercase 'r' too) with 'rh' for Epitran processing
|
102 |
+
if (input_script == 'asterisk'):
|
103 |
+
input_text = re.sub(r'(?<=\B)R', 'rh', input_text, flags=re.IGNORECASE)
|
104 |
+
input_text = input_text.replace('*R', '*rh')
|
105 |
+
|
106 |
# store indices for capitalized words (will assume only first letter is capitalized)
|
107 |
words = input_text.split()
|
108 |
capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]
|
output.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
|
|
|
1 |
+
Pedṛottun ekkan moṭosaiṛkol ase aaṛ hite iian sola beci foson goṛe. Uggwa insaanoṛ motoṛ kuci goṛi bolla Ameṛicar maincoṛ ḍoõr uggwa ḍai White Houseoṛ baṛkule en goṛi zoma oie, zenḍilla hitaṛa Football kela zitile kuci goitto. Aãr aãr ḍen ḍakoṛ ãṛuot nofuṛede henḍilla kissu kiṛim lagaiṛ. Aãi nosinide maya-fuaindoṛe zetobaṛ appa di hetobaṛ aãttu ṭandi-zoṛ oito boli, aãttu maya-fuaindoṛ hono baiggo notakito. Aãr ham oilde jinic in zanon. Soledde ṭaimoṛo owṛe, haṛhana hode iin ekkan ocantiṛ zagaṛ cundoijja mesal. Gaṛi gane gonḍyae ebbeṛe beci fãs kilomiṭaṛ baingil. Zaṛa soyi muhabbot goṛoia ase hitaṛa hitaṛaṛ kuciṛ hota hono-din noleke. Ceila loi aãi loi fuṛan fuãjja. Maya-fuain gaṛi soloia lagataṛ hotoṛa. Hibaie aãre kissu mozaṛ hota hoil aaṛ mozaṛ gana gail ugga biaṛaimmaṛ ḍakottu. Zodi tũi zõlot maze ballukoṛ fãzot foṛi zogoi, tuãr sobse behetoṛ toṛika oilde meṛit neṛi moṛi gio goi fan goṛon.
|