micahg committed on
Commit
72b7374
1 Parent(s): b4a732f

spelling edits

Browse files
Files changed (2) hide show
  1. .gitignore +2 -0
  2. functions.py +15 -0
.gitignore CHANGED
@@ -1,5 +1,7 @@
1
  transliterate.py
2
  output.txt
 
 
3
  *.7z filter=lfs diff=lfs merge=lfs -text
4
  *.arrow filter=lfs diff=lfs merge=lfs -text
5
  *.bin filter=lfs diff=lfs merge=lfs -text
 
1
  transliterate.py
2
  output.txt
3
+ process_folder.py
4
+ process_all_folders.py
5
  *.7z filter=lfs diff=lfs merge=lfs -text
6
  *.arrow filter=lfs diff=lfs merge=lfs -text
7
  *.bin filter=lfs diff=lfs merge=lfs -text
functions.py CHANGED
@@ -1,5 +1,6 @@
1
  import epitran
2
  import re
 
3
 
4
  def to_lroh(s):
5
  s = s.replace('ɖ', 'ḍ')
@@ -104,6 +105,11 @@ def to_roheng(s):
104
  """
105
 
106
  # step to standardize all nasalized vowels as precomposed characters
 
 
 
 
 
107
  s = re.sub('ã', 'ã', s)
108
  s = re.sub('ẽ', 'ẽ', s)
109
  s = re.sub('ĩ', 'ĩ', s)
@@ -120,6 +126,15 @@ def to_roheng(s):
120
  # diphthongs/glides
121
  #elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
122
  words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
 
 
 
 
 
 
 
 
 
123
 
124
  s = ' '.join(words)
125
 
 
1
  import epitran
2
  import re
3
+ import string
4
 
5
  def to_lroh(s):
6
  s = s.replace('ɖ', 'ḍ')
 
105
  """
106
 
107
  # step to standardize all nasalized vowels as precomposed characters
108
+ s = re.sub('Ã', 'Ã', s)
109
+ s = re.sub('Ẽ', 'Ẽ', s)
110
+ s = re.sub('Ĩ', 'Ĩ', s)
111
+ s = re.sub('Õ', 'Õ', s)
112
+ s = re.sub('Ũ', 'Ũ', s)
113
  s = re.sub('ã', 'ã', s)
114
  s = re.sub('ẽ', 'ẽ', s)
115
  s = re.sub('ĩ', 'ĩ', s)
 
126
  # diphthongs/glides
127
  #elif re.search(r'[aãeẽiĩoõuũ]{2}', words[i]):
128
  words[i] = re.sub(r'([iĩ])([aãeẽoõuũ])', r'\1y\2', words[i])
129
+
130
+ # spelling errors
131
+ """
132
+ TODO: replace with dictionary to map
133
+ """
134
+ if ''.join(char for char in words[i].strip() if char not in string.punctuation) == 'in':
135
+ words[i] = words[i].replace('in', 'iin')
136
+ elif ''.join(char for char in words[i].strip() if char not in string.punctuation) == 'hin':
137
+ words[i] = words[i].replace('hin', 'hiin')
138
 
139
  s = ' '.join(words)
140