Spaces:
Running
Running
spelling edits
Browse files- .gitignore +2 -0
- functions.py +15 -0
.gitignore
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
transliterate.py
|
2 |
output.txt
|
|
|
|
|
3 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
4 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
5 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
transliterate.py
|
2 |
output.txt
|
3 |
+
process_folder.py
|
4 |
+
process_all_folders.py
|
5 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
6 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
7 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
functions.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import epitran
|
2 |
import re
|
|
|
3 |
|
4 |
def to_lroh(s):
|
5 |
s = s.replace('ɖ', 'ḍ')
|
@@ -104,6 +105,11 @@ def to_roheng(s):
|
|
104 |
"""
|
105 |
|
106 |
# step to standardize all nasalized vowels as precomposed characters
|
|
|
|
|
|
|
|
|
|
|
107 |
s = re.sub('ã', 'ã', s)
|
108 |
s = re.sub('ẽ', 'ẽ', s)
|
109 |
s = re.sub('ĩ', 'ĩ', s)
|
@@ -120,6 +126,15 @@ def to_roheng(s):
|
|
120 |
# diphthongs/glides
|
121 |
#elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
|
122 |
words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
s = ' '.join(words)
|
125 |
|
|
|
1 |
import epitran
|
2 |
import re
|
3 |
+
import string
|
4 |
|
5 |
def to_lroh(s):
|
6 |
s = s.replace('ɖ', 'ḍ')
|
|
|
105 |
"""
|
106 |
|
107 |
# step to standardize all nasalized vowels as precomposed characters
|
108 |
+
s = re.sub('Ã', 'Ã', s)
|
109 |
+
s = re.sub('Ẽ', 'Ẽ', s)
|
110 |
+
s = re.sub('Ĩ', 'Ĩ', s)
|
111 |
+
s = re.sub('Õ', 'Õ', s)
|
112 |
+
s = re.sub('Ũ', 'Ũ', s)
|
113 |
s = re.sub('ã', 'ã', s)
|
114 |
s = re.sub('ẽ', 'ẽ', s)
|
115 |
s = re.sub('ĩ', 'ĩ', s)
|
|
|
126 |
# diphthongs/glides
|
127 |
#elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
|
128 |
words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
|
129 |
+
|
130 |
+
# spelling errors
|
131 |
+
"""
|
132 |
+
TODO: replace this if/elif chain with a dictionary mapping each misspelling to its correction
|
133 |
+
"""
|
134 |
+
if ''.join(char for char in words[i].strip() if char not in string.punctuation) == 'in':
|
135 |
+
words[i] = words[i].replace('in', 'iin')
|
136 |
+
elif ''.join(char for char in words[i].strip() if char not in string.punctuation) == 'hin':
|
137 |
+
words[i] = words[i].replace('hin', 'hiin')
|
138 |
|
139 |
s = ' '.join(words)
|
140 |
|