Spaces:

ILAD
/

rhg-script-converter-ui

Sleeping

App Files Files Community

micahg committed on Jan 26

Commit

e2e57a3

•

1 Parent(s): a061e11

file dnd fix; line output fix; vowel handling

Browse files

Files changed (8) hide show

.gitignore +2 -0
epitran/data/pre/asterisk.txt +1 -1
epitran/data/pre/rhg-lroh.txt +1 -1
epitran/data/pre/rhg-roheng-old.txt +1 -1
epitran/data/pre/rhg-roheng.txt +1 -1
functions.py +54 -28
requirements.txt +2 -2
vowels.py +68 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,5 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text

+transliterate.py
+output.txt
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text

epitran/data/pre/asterisk.txt CHANGED Viewed

@@ -10,7 +10,7 @@
 ' -> 0 / _
 % vowel glides
-w -> 0 / (u|u\*) _ (a|o|e)
 y -> 0 / (i|i\*) _ (a|e|o|u)
 % nasalization

 ' -> 0 / _
 % vowel glides
+w -> 0 / (u|u\*) _ (a|e|i|o)
 y -> 0 / (i|i\*) _ (a|e|o|u)
 % nasalization

epitran/data/pre/rhg-lroh.txt CHANGED Viewed

@@ -9,7 +9,7 @@
 ú -> u / _
 % vowel glides
-w -> 0 / (u|ũ) _ (a|o|e)
 y -> 0 / (i|ĩ) _ (a|e|o|u)
 % long vowels

 ú -> u / _
 % vowel glides
+w -> 0 / (u|ũ) _ (a|e|i|o)
 y -> 0 / (i|ĩ) _ (a|e|o|u)
 % long vowels

epitran/data/pre/rhg-roheng-old.txt CHANGED Viewed

@@ -9,5 +9,5 @@
 ú -> u / _
 % vowel glides
-w -> 0 / (u|uñ) _ (a|o|e)
 y -> 0 / (i|iñ) _ (a|e|o|u)

 ú -> u / _
 % vowel glides
+w -> 0 / (u|uñ) _ (a|e|i|o)
 y -> 0 / (i|iñ) _ (a|e|o|u)

epitran/data/pre/rhg-roheng.txt CHANGED Viewed

@@ -9,5 +9,5 @@
 ú -> u / _
 % vowel glides
-w -> 0 / (u|ũ) _ (a|o|e)
 y -> 0 / (i|ĩ) _ (a|e|o|u)

 ú -> u / _
 % vowel glides
+w -> 0 / (u|ũ) _ (a|e|i|o)
 y -> 0 / (i|ĩ) _ (a|e|o|u)

functions.py CHANGED Viewed

@@ -91,8 +91,28 @@ def to_roheng(s):
     s = s.replace('ɔ̃', 'õ')
     s = s.replace('ɔ', 'o')
-    return s
 def convert_script(input_script, output_script, input_text):
@@ -101,40 +121,46 @@ def convert_script(input_script, output_script, input_text):
     epi = epitran.Epitran(input_script)
     # initial step to account for 'R' in the asterisk step -
-    #replaces non-word initial 'R's with 'rh' for Epitran processing
     if (input_script == 'asterisk'):
         input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
-        input_text = input_text.replace('*R', '*rh') # * is treated as a word boundary
     #print (input_text)
-    # store indices for capitalized words (will assume only first letter is capitalized)
-    words = input_text.split()
-    capital_indices = [i for i, word in enumerate(words) if word[0].isupper()]
-    grapheme_text = epi.transliterate(input_text)
-    #print (grapheme_text)
-    if output_script == 'rhg-roheng-old':
-        inter_text = to_roheng_old(grapheme_text)
-    elif output_script == 'rhg-lroh':
-        inter_text = to_lroh(grapheme_text)
-    elif output_script == 'rhg-roheng':
-        inter_text = to_roheng(grapheme_text)
-    #print (inter_text)
-    # reapply capitalization
-    words = inter_text.split()
-    for i in capital_indices:
-        if i < len(words):
-            words[i] = words[i].capitalize()
-    output_text = ' '.join(words)
     #print (output_text + '\n##################################################\n')
-    return output_text
 # Issues:
 #

     s = s.replace('ɔ̃', 'õ')
     s = s.replace('ɔ', 'o')
+    """
+    glides/diphthongs/triphthongs
+    """
+    # insert 'y' after i if it is followed by any vowel
+    #s = re.sub(r'i([aãeẽoõuũ])', r'iy\1', s) ---- doesn't work if triphthongs exist that start with 'i'
+    #s = re.sub(r'ĩ([aãeẽoõuũ])', r'ĩy\1', s)
+    words=s.split(' ')
+    for i in range(len(words)):
+        # triphthongs
+        if re.search(r'[aãeẽiĩoõuũ]{3}', words[i]):
+            words[i] = re.sub(r'([aãeẽoõuũ])([iĩ])([aãeẽoõuũ])', r'\1\2y\3', words[i])
+            words[i] = re.sub(r'([aãeẽiĩoõ])([uũ])([aãeẽiĩoõ])', r'\1\2w\3', words[i])
+        # diphthongs/glides
+        elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
+            words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
+    temp_s = ' '.join(words)
+    return temp_s
 def convert_script(input_script, output_script, input_text):
     epi = epitran.Epitran(input_script)
     # initial step to account for 'R' in the asterisk step -
+    # replaces non-word initial 'R's with 'rh' for Epitran processing
     if (input_script == 'asterisk'):
         input_text = re.sub(r'(?<=\B)R', 'rh', input_text)
+        input_text = input_text.replace('*R', '*rh') # additional step for '*' since it is treated as a word boundary
     #print (input_text)
+    lines = input_text.split('\n')
+    output_text = ''
+    for line in lines:
+        # store indices for capitalized words (will assume only first letter is capitalized)
+        words = line.split()
+        capital_indices = [i for i, word in enumerate(words) if word and word[0].isupper()]
+        #print (capital_indices)
+        grapheme_text = epi.transliterate(line)
+        #print (grapheme_text)
+        if output_script == 'rhg-roheng-old':
+            inter_text = to_roheng_old(grapheme_text)
+        elif output_script == 'rhg-lroh':
+            inter_text = to_lroh(grapheme_text)
+        elif output_script == 'rhg-roheng':
+            inter_text = to_roheng(grapheme_text)
+        #print (inter_text)
+        # reapply capitalization
+        words = inter_text.split()
+        for i in capital_indices:
+            if i < len(words):
+                words[i] = words[i].capitalize()
+        output_line = ' '.join(words)
+        output_text = output_text + output_line + '\n'
     #print (output_text + '\n##################################################\n')
+    return output_text.strip()
 # Issues:
 #

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
-gradio
 panphon
 setuptools
 regex
 marisa-trie
-requests

+gradio==4.14.0
 panphon
 setuptools
 regex
 marisa-trie
+requests

vowels.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import gradio as gr
+#from docx import Document
+import re
+def find_sequential_vowels (text):
+    """
+    TODO: add nasalized vowels
+    """
+    pattern = r'\b\w*([aeiou])(?!\1)([aeiou])\w*\b'
+    return re.findall(pattern, text, re.IGNORECASE)
+def create_docx_and_html(text, docx_path, html_path):
+    words_to_bold = [word[0]+word[1] for word in find_sequential_vowels(text)]
+    doc = Document()
+    paragraph = doc.add_paragraph()
+    html_content = "<html><body><p>"
+    words = text.split()
+   for word in words:
+       if any(bold_word in word for bold_word in words_to_bold):
+            paragraph.add_run(word + " ").bold = True
+            html_content += "<b>" + word + "</b> "
+        else:
+            paragraph.add_run(word + " ")
+            html_content += word + " "
+    html_content += "</p></body></html>"
+    doc.save(docx_path)
+    with open(html_path, 'w') as html_file:
+        html_file.write(html_content)
+    return docx_path, html_path
+def format_text(text):
+    words_to_bold = [word[0]+word[1] for word in find_sequential_vowels(text)]
+    words = text.split()
+    formatted_text = ""
+    for word in words:
+        if any(bold_word in word for bold_word in words_to_bold):
+            formatted_text += f"<b>{word}</b> "
+        else:
+            formatted_text += f"{word} "
+    return formatted_text
+with gr.Blocks() as app:
+    gr.Markdown("## Sequential Vowels Highlighter")
+    with gr.Row():
+        text_input = gr.Textbox(lines=2, placeholder="Enter text here...")
+        submit_button = gr.Button("Put words with sequential vowels in bold")
+    output_html = gr.HTML()
+    submit_button.click(
+        fn=format_text,
+        inputs=text_input,
+        outputs=output_html
+    )
+app.launch()