Spaces:
Sleeping
Sleeping
file dnd fix; line output fix; vowel handling
Browse files- .gitignore +2 -0
- epitran/data/pre/asterisk.txt +1 -1
- epitran/data/pre/rhg-lroh.txt +1 -1
- epitran/data/pre/rhg-roheng-old.txt +1 -1
- epitran/data/pre/rhg-roheng.txt +1 -1
- functions.py +54 -28
- requirements.txt +2 -2
- vowels.py +68 -0
.gitignore
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
transliterate.py
|
2 |
+
output.txt
|
3 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
4 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
5 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
epitran/data/pre/asterisk.txt
CHANGED
@@ -10,7 +10,7 @@
|
|
10 |
' -> 0 / _
|
11 |
|
12 |
% vowel glides
|
13 |
-
w -> 0 / (u|u\*) _ (a|o
|
14 |
y -> 0 / (i|i\*) _ (a|e|o|u)
|
15 |
|
16 |
% nasalization
|
|
|
10 |
' -> 0 / _
|
11 |
|
12 |
% vowel glides
|
13 |
+
w -> 0 / (u|u\*) _ (a|e|i|o)
|
14 |
y -> 0 / (i|i\*) _ (a|e|o|u)
|
15 |
|
16 |
% nasalization
|
epitran/data/pre/rhg-lroh.txt
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
-
w -> 0 / (u|ũ) _ (a|o
|
13 |
y -> 0 / (i|ĩ) _ (a|e|o|u)
|
14 |
|
15 |
% long vowels
|
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
+
w -> 0 / (u|ũ) _ (a|e|i|o)
|
13 |
y -> 0 / (i|ĩ) _ (a|e|o|u)
|
14 |
|
15 |
% long vowels
|
epitran/data/pre/rhg-roheng-old.txt
CHANGED
@@ -9,5 +9,5 @@
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
-
w -> 0 / (u|uñ) _ (a|o
|
13 |
y -> 0 / (i|iñ) _ (a|e|o|u)
|
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
+
w -> 0 / (u|uñ) _ (a|e|i|o)
|
13 |
y -> 0 / (i|iñ) _ (a|e|o|u)
|
epitran/data/pre/rhg-roheng.txt
CHANGED
@@ -9,5 +9,5 @@
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
-
w -> 0 / (u|ũ) _ (a|o
|
13 |
y -> 0 / (i|ĩ) _ (a|e|o|u)
|
|
|
9 |
ú -> u / _
|
10 |
|
11 |
% vowel glides
|
12 |
+
w -> 0 / (u|ũ) _ (a|e|i|o)
|
13 |
y -> 0 / (i|ĩ) _ (a|e|o|u)
|
functions.py
CHANGED
@@ -91,8 +91,28 @@ def to_roheng(s):
|
|
91 |
|
92 |
s = s.replace('ɔ̃', 'õ')
|
93 |
s = s.replace('ɔ', 'o')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
def convert_script(input_script, output_script, input_text):
|
98 |
|
@@ -101,40 +121,46 @@ def convert_script(input_script, output_script, input_text):
|
|
101 |
epi = epitran.Epitran(input_script)
|
102 |
|
103 |
# initial step to account for 'R' in the asterisk step -
|
104 |
-
#replaces non-word initial 'R's with 'rh' for Epitran processing
|
105 |
if (input_script == 'asterisk'):
|
106 |
input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
|
107 |
-
input_text = input_text.replace('*R', '*rh') # * is treated as a word boundary
|
108 |
|
109 |
#print (input_text)
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
#print (output_text + '\n##################################################\n')
|
136 |
|
137 |
-
return output_text
|
138 |
|
139 |
# Issues:
|
140 |
#
|
|
|
91 |
|
92 |
s = s.replace('ɔ̃', 'õ')
|
93 |
s = s.replace('ɔ', 'o')
|
94 |
+
|
95 |
+
"""
|
96 |
+
glides/diphthongs/triphthongs
|
97 |
+
"""
|
98 |
+
# insert 'y' after i if it is followed by any vowel
|
99 |
+
#s = re.sub(r'i([aãeẽoõuũ])', r'iy\1', s) ---- doesn't work if trithongs exist that start with 'i
|
100 |
+
#s = re.sub(r'ĩ([aãeẽoõuũ])', r'ĩy\1', s)
|
101 |
+
|
102 |
+
words=s.split(' ')
|
103 |
|
104 |
+
for i in range(len(words)):
|
105 |
+
# triphthongs
|
106 |
+
if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
|
107 |
+
words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
|
108 |
+
words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
|
109 |
+
# diphthongs/glides
|
110 |
+
elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
|
111 |
+
words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
|
112 |
+
|
113 |
+
temp_s = ' '.join(words)
|
114 |
+
|
115 |
+
return temp_s
|
116 |
|
117 |
def convert_script(input_script, output_script, input_text):
|
118 |
|
|
|
121 |
epi = epitran.Epitran(input_script)
|
122 |
|
123 |
# initial step to account for 'R' in the asterisk step -
|
124 |
+
# replaces non-word initial 'R's with 'rh' for Epitran processing
|
125 |
if (input_script == 'asterisk'):
|
126 |
input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
|
127 |
+
input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
|
128 |
|
129 |
#print (input_text)
|
130 |
+
lines = input_text.split('\n')
|
131 |
+
output_text = ''
|
132 |
+
|
133 |
+
for line in lines:
|
134 |
+
# store indices for capitalized words (will assume only first letter is capitalized)
|
135 |
+
words = line.split()
|
136 |
+
capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
|
137 |
+
#print (capital_indices)
|
138 |
+
|
139 |
+
grapheme_text = epi.transliterate(line)
|
140 |
+
|
141 |
+
#print (grapheme_text)
|
142 |
+
|
143 |
+
if output_script == 'rhg-roheng-old':
|
144 |
+
inter_text = to_roheng_old(grapheme_text)
|
145 |
+
elif output_script == 'rhg-lroh':
|
146 |
+
inter_text = to_lroh(grapheme_text)
|
147 |
+
elif output_script == 'rhg-roheng':
|
148 |
+
inter_text = to_roheng(grapheme_text)
|
149 |
+
|
150 |
+
#print (inter_text)
|
151 |
+
|
152 |
+
# reapply capitalization
|
153 |
+
words = inter_text.split()
|
154 |
+
for i in capital_indices:
|
155 |
+
if i < len(words):
|
156 |
+
words[i] = words[i].capitalize()
|
157 |
+
|
158 |
+
output_line = ' '.join(words)
|
159 |
+
output_text = output_text + output_line + '\n'
|
160 |
|
161 |
#print (output_text + '\n##################################################\n')
|
162 |
|
163 |
+
return output_text.strip()
|
164 |
|
165 |
# Issues:
|
166 |
#
|
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
gradio
|
2 |
panphon
|
3 |
setuptools
|
4 |
regex
|
5 |
marisa-trie
|
6 |
-
requests
|
|
|
1 |
+
gradio==4.14.0
|
2 |
panphon
|
3 |
setuptools
|
4 |
regex
|
5 |
marisa-trie
|
6 |
+
requests
|
vowels.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
#from docx import Document
|
3 |
+
import re
|
4 |
+
|
5 |
+
def find_sequential_vowels(text, vowels='aeiou'):
    """Find words containing two different vowels in a row.

    A word is a maximal run of regex word characters. Each word yields at
    most one result: because the leading part of the pattern is greedy, the
    pair reported is the LAST pair of adjacent, non-identical vowels in the
    word. Matching is case-insensitive, so "Aa" counts as a repeated vowel
    and is not reported.

    Args:
        text: The text to scan.
        vowels: Characters to treat as vowels. Defaults to the five plain
            vowels, which reproduces the original behaviour; pass e.g.
            'aeiouãẽĩõũ' to include precomposed nasalized vowels as well.

    Returns:
        A list of ``(first_vowel, second_vowel)`` tuples, one per matching
        word, in order of appearance and in the case found in the text.
    """
    # An empty character class is not a valid regex; nothing can match anyway.
    if not vowels:
        return []

    # Escape so characters such as '-', '^' or ']' stay literal in the class.
    vowel_class = '[' + re.escape(vowels) + ']'
    # (?!\1) rejects a second vowel identical to the first (e.g. "oo", "ee").
    pattern = r'\b\w*(' + vowel_class + r')(?!\1)(' + vowel_class + r')\w*\b'
    return re.findall(pattern, text, re.IGNORECASE)
|
12 |
+
|
13 |
+
def create_docx_and_html(text, docx_path, html_path):
    """Write *text* to a .docx file and an .html file, bolding vowel-pair words.

    A whitespace-separated word is emitted in bold when it contains any of
    the adjacent-vowel pairs reported by ``find_sequential_vowels(text)``.

    Args:
        text: The input text; it is split on whitespace, so original line
            breaks and spacing are not preserved.
        docx_path: Destination path for the Word document.
        html_path: Destination path for the HTML document.

    Returns:
        The tuple ``(docx_path, html_path)``.

    Raises:
        ImportError: If the ``docx`` module (python-docx) is not installed.
    """
    # Imported locally: the module-level ``from docx import Document`` is
    # commented out, so referencing ``Document`` here used to raise NameError.
    import html
    from docx import Document

    vowel_pairs = {first + second for first, second in find_sequential_vowels(text)}

    doc = Document()
    paragraph = doc.add_paragraph()

    # The file is written as UTF-8 below, so declare that charset to readers.
    html_parts = ['<html><head><meta charset="utf-8"></head><body><p>']

    for word in text.split():
        # Escape so characters like '<' or '&' in the input cannot break
        # (or inject into) the generated markup.
        safe_word = html.escape(word)
        if any(pair in word for pair in vowel_pairs):
            paragraph.add_run(word + " ").bold = True
            html_parts.append("<b>" + safe_word + "</b> ")
        else:
            paragraph.add_run(word + " ")
            html_parts.append(safe_word + " ")

    html_parts.append("</p></body></html>")

    doc.save(docx_path)

    # Explicit encoding: the text may contain non-ASCII letters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write("".join(html_parts))

    return docx_path, html_path
|
39 |
+
|
40 |
+
|
41 |
+
def format_text(text):
    """Return *text* as an HTML fragment with vowel-pair words in bold.

    A whitespace-separated word is wrapped in ``<b>…</b>`` when it contains
    any of the adjacent-vowel pairs reported by
    ``find_sequential_vowels(text)``. Every word is followed by one space,
    so a non-empty result ends with a trailing space.

    Args:
        text: The raw input text; ``None`` or an empty string yields "".

    Returns:
        The HTML fragment as a string, with each word HTML-escaped.
    """
    import html

    if not text:
        return ""

    vowel_pairs = {first + second for first, second in find_sequential_vowels(text)}

    pieces = []
    for word in text.split():
        # The result is rendered as HTML, so the user-supplied word must be
        # escaped; the pair check itself runs on the unescaped word.
        safe_word = html.escape(word)
        if any(pair in word for pair in vowel_pairs):
            pieces.append(f"<b>{safe_word}</b> ")
        else:
            pieces.append(f"{safe_word} ")

    return "".join(pieces)
|
54 |
+
|
55 |
+
# Gradio UI: a textbox and a button, plus an HTML area that shows the
# result of format_text() for the entered text.
# NOTE(review): indentation was lost in the extracted source; the nesting
# below (output_html outside the Row, click() inside the Blocks context)
# is the presumed original layout - confirm against the repository.
with gr.Blocks() as app:
    gr.Markdown("## Sequential Vowels Highlighter")
    with gr.Row():
        text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
        submit_button = gr.Button("Put words with sequential vowels in bold")
    output_html = gr.HTML()

    # Clicking the button passes the textbox value to format_text and
    # sends the returned markup to the HTML component.
    submit_button.click(
        fn=format_text,
        inputs=text_input,
        outputs=output_html
    )

# Runs at module level, so the app is launched whenever this file is
# executed or imported.
app.launch()
|