micahg committed on
Commit
e2e57a3
1 Parent(s): a061e11

file dnd fix; line output fix; vowel handling

Browse files
.gitignore CHANGED
@@ -1,3 +1,5 @@
 
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
1
+ transliterate.py
2
+ output.txt
3
  *.7z filter=lfs diff=lfs merge=lfs -text
4
  *.arrow filter=lfs diff=lfs merge=lfs -text
5
  *.bin filter=lfs diff=lfs merge=lfs -text
epitran/data/pre/asterisk.txt CHANGED
@@ -10,7 +10,7 @@
10
  ' -> 0 / _
11
 
12
  % vowel glides
13
- w -> 0 / (u|u\*) _ (a|o|e)
14
  y -> 0 / (i|i\*) _ (a|e|o|u)
15
 
16
  % nasalization
 
10
  ' -> 0 / _
11
 
12
  % vowel glides
13
+ w -> 0 / (u|u\*) _ (a|e|i|o)
14
  y -> 0 / (i|i\*) _ (a|e|o|u)
15
 
16
  % nasalization
epitran/data/pre/rhg-lroh.txt CHANGED
@@ -9,7 +9,7 @@
9
  ú -> u / _
10
 
11
  % vowel glides
12
- w -> 0 / (u|ũ) _ (a|o|e)
13
  y -> 0 / (i|ĩ) _ (a|e|o|u)
14
 
15
  % long vowels
 
9
  ú -> u / _
10
 
11
  % vowel glides
12
+ w -> 0 / (u|ũ) _ (a|e|i|o)
13
  y -> 0 / (i|ĩ) _ (a|e|o|u)
14
 
15
  % long vowels
epitran/data/pre/rhg-roheng-old.txt CHANGED
@@ -9,5 +9,5 @@
9
  ú -> u / _
10
 
11
  % vowel glides
12
- w -> 0 / (u|uñ) _ (a|o|e)
13
  y -> 0 / (i|iñ) _ (a|e|o|u)
 
9
  ú -> u / _
10
 
11
  % vowel glides
12
+ w -> 0 / (u|uñ) _ (a|e|i|o)
13
  y -> 0 / (i|iñ) _ (a|e|o|u)
epitran/data/pre/rhg-roheng.txt CHANGED
@@ -9,5 +9,5 @@
9
  ú -> u / _
10
 
11
  % vowel glides
12
- w -> 0 / (u|ũ) _ (a|o|e)
13
  y -> 0 / (i|ĩ) _ (a|e|o|u)
 
9
  ú -> u / _
10
 
11
  % vowel glides
12
+ w -> 0 / (u|ũ) _ (a|e|i|o)
13
  y -> 0 / (i|ĩ) _ (a|e|o|u)
functions.py CHANGED
@@ -91,8 +91,28 @@ def to_roheng(s):
91
 
92
  s = s.replace('ɔ̃', 'õ')
93
  s = s.replace('ɔ', 'o')
 
 
 
 
 
 
 
 
 
94
 
95
- return s
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  def convert_script(input_script, output_script, input_text):
98
 
@@ -101,40 +121,46 @@ def convert_script(input_script, output_script, input_text):
101
  epi = epitran.Epitran(input_script)
102
 
103
  # initial step to account for 'R' in the asterisk step -
104
- #replaces non-word initial 'R's with 'rh' for Epitran processing
105
  if (input_script == 'asterisk'):
106
  input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
107
- input_text = input_text.replace('*R', '*rh') # * is treated as a word boundary
108
 
109
  #print (input_text)
110
-
111
- # store indices for capitalized words (will assume only first letter is capitalized)
112
- words = input_text.split()
113
- capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]
114
-
115
- grapheme_text = epi.transliterate(input_text)
116
-
117
- #print (grapheme_text)
118
-
119
- if output_script == 'rhg-roheng-old':
120
- inter_text = to_roheng_old(grapheme_text)
121
- elif output_script == 'rhg-lroh':
122
- inter_text = to_lroh(grapheme_text)
123
- elif output_script == 'rhg-roheng':
124
- inter_text = to_roheng(grapheme_text)
125
-
126
- #print (inter_text)
127
-
128
- # reapply capitalization
129
- words = inter_text.split()
130
- for i in capital_indices:
131
- if i < len(words):
132
- words[i] = words[i].capitalize()
133
- output_text = ' '.join(words)
 
 
 
 
 
 
134
 
135
  #print (output_text + '\n##################################################\n')
136
 
137
- return output_text
138
 
139
  # Issues:
140
  #
 
91
 
92
  s = s.replace('ɔ̃', 'õ')
93
  s = s.replace('ɔ', 'o')
94
+
95
+ """
96
+ glides/dipthongs/trithongs
97
+ """
98
+ # insert 'y' after i if it is followed by any vowel
99
+ #s = re.sub(r'i([aãeẽoõuũ])', r'iy\1', s) ---- doesn't work if trithongs exist that start with 'i
100
+ #s = re.sub(r'ĩ([aãeẽoõuũ])', r'ĩy\1', s)
101
+
102
+ words=s.split(' ')
103
 
104
+ for i in range(len(words)):
105
+ # trithongs
106
+ if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
107
+ words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
108
+ words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
109
+ # dipthongs/glides
110
+ elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
111
+ words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
112
+
113
+ temp_s = ' '.join(words)
114
+
115
+ return temp_s
116
 
117
  def convert_script(input_script, output_script, input_text):
118
 
 
121
  epi = epitran.Epitran(input_script)
122
 
123
  # initial step to account for 'R' in the asterisk step -
124
+ # replaces non-word initial 'R's with 'rh' for Epitran processing
125
  if (input_script == 'asterisk'):
126
  input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
127
+ input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
128
 
129
  #print (input_text)
130
+ lines = input_text.split('\n')
131
+ output_text = ''
132
+
133
+ for line in lines:
134
+ # store indices for capitalized words (will assume only first letter is capitalized)
135
+ words = line.split()
136
+ capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
137
+ #print (capital_indices)
138
+
139
+ grapheme_text = epi.transliterate(line)
140
+
141
+ #print (grapheme_text)
142
+
143
+ if output_script == 'rhg-roheng-old':
144
+ inter_text = to_roheng_old(grapheme_text)
145
+ elif output_script == 'rhg-lroh':
146
+ inter_text = to_lroh(grapheme_text)
147
+ elif output_script == 'rhg-roheng':
148
+ inter_text = to_roheng(grapheme_text)
149
+
150
+ #print (inter_text)
151
+
152
+ # reapply capitalization
153
+ words = inter_text.split()
154
+ for i in capital_indices:
155
+ if i < len(words):
156
+ words[i] = words[i].capitalize()
157
+
158
+ output_line = ' '.join(words)
159
+ output_text = output_text + output_line + '\n'
160
 
161
  #print (output_text + '\n##################################################\n')
162
 
163
+ return output_text.strip()
164
 
165
  # Issues:
166
  #
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
- gradio
2
  panphon
3
  setuptools
4
  regex
5
  marisa-trie
6
- requests
 
1
+ gradio==4.14.0
2
  panphon
3
  setuptools
4
  regex
5
  marisa-trie
6
+ requests
vowels.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ #from docx import Document
3
+ import re
4
+
5
def find_sequential_vowels(text):
    """Find words that contain two different vowels directly next to each other.

    Both the plain vowels (a, e, i, o, u) and their precomposed nasalized
    forms (ã, ẽ, ĩ, õ, ũ) count as vowels, and matching ignores case.
    A doubled vowel such as "oo" or "Aa" is not reported because the two
    letters must differ.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(first_vowel, second_vowel)`` tuples, one per matching
        run of word characters, in order of appearance.  The leading
        word-character quantifier is greedy, so when a word has several
        qualifying pairs the last one is the pair that is reported.
    """
    # Written with escapes so the nasalized vowels are unambiguously the
    # precomposed code points: ã ẽ ĩ õ ũ.
    vowels = "aeiou\u00e3\u1ebd\u0129\u00f5\u0169"
    # (?!\1) rejects a second vowel identical to the first (case-insensitively).
    pattern = rf'\b\w*([{vowels}])(?!\1)([{vowels}])\w*\b'
    return re.findall(pattern, text, re.IGNORECASE)
12
+
13
def create_docx_and_html(text, docx_path, html_path):
    """Write *text* to a .docx file and an HTML file with flagged words in bold.

    A whitespace-separated word is flagged when it contains any of the vowel
    pairs reported by ``find_sequential_vowels`` for the whole text.

    Args:
        text: The text to export.
        docx_path: Destination path handed to ``doc.save``.
        html_path: Destination path of the HTML file.

    Returns:
        The tuple ``(docx_path, html_path)``.
    """
    import html  # stdlib; imported locally so the block carries its own dependency

    # A set removes duplicate pairs so each word is checked against each
    # distinct pair only once.
    vowel_pairs = {first + second for first, second in find_sequential_vowels(text)}

    # NOTE(review): ``Document`` is not imported in the visible part of this
    # file (the docx import line is commented out), so this call raises
    # NameError until that import is restored -- confirm the intended source.
    doc = Document()
    paragraph = doc.add_paragraph()

    html_parts = ["<html><body><p>"]

    for word in text.split():
        # Match on the raw word, but escape what is written into the markup so
        # characters such as '<' or '&' in the text cannot break the HTML.
        safe_word = html.escape(word)
        if any(pair in word for pair in vowel_pairs):
            paragraph.add_run(word + " ").bold = True
            html_parts.append("<b>" + safe_word + "</b> ")
        else:
            paragraph.add_run(word + " ")
            html_parts.append(safe_word + " ")

    html_parts.append("</p></body></html>")

    doc.save(docx_path)

    # Explicit UTF-8 so non-ASCII letters are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write("".join(html_parts))

    return docx_path, html_path
39
+
40
+
41
def format_text(text):
    """Return *text* as an HTML fragment with flagged words wrapped in ``<b>``.

    A whitespace-separated word is flagged when it contains any of the vowel
    pairs reported by ``find_sequential_vowels`` for the whole text.

    Args:
        text: The raw text to format.

    Returns:
        A string in which every word is followed by a single space (so the
        result ends with a trailing space when there is at least one word),
        with flagged words wrapped in ``<b>...</b>``.  Empty or
        whitespace-only input yields an empty string.
    """
    import html  # stdlib; imported locally so the block carries its own dependency

    # A set removes duplicate pairs so each word is checked against each
    # distinct pair only once.
    vowel_pairs = {first + second for first, second in find_sequential_vowels(text)}

    pieces = []
    for word in text.split():
        # Match on the raw word, but escape what goes into the markup: the
        # text is user-entered and the result is rendered as HTML.
        safe_word = html.escape(word)
        if any(pair in word for pair in vowel_pairs):
            pieces.append(f"<b>{safe_word}</b> ")
        else:
            pieces.append(f"{safe_word} ")

    return "".join(pieces)
54
+
55
# Gradio UI wiring: clicking submit_button calls format_text with the value of
# text_input and sends the result to output_html; app.launch() is then called
# at module level.
# NOTE(review): indentation is lost in this diff view, so which components sit
# inside gr.Row() cannot be determined from here -- confirm against the file.
+ with gr.Blocks() as app:
56
+ gr.Markdown("## Sequential Vowels Highlighter")
57
+ with gr.Row():
58
+ text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
59
+ submit_button = gr.Button("Put words with sequential vowels in bold")
60
+ output_html = gr.HTML()
61
+
62
+ submit_button.click(
63
+ fn=format_text,
64
+ inputs=text_input,
65
+ outputs=output_html
66
+ )
67
+
68
+ app.launch()