Spaces:
Sleeping
Sleeping
DWizard
committed on
Commit
•
1de6702
1
Parent(s):
d438792
optimize spell check
Browse files
Former-commit-id: 26b78cae5e1d9aaf72a21fd13900a9fdb5f381d4
SRT.py
CHANGED
@@ -178,16 +178,20 @@ class SRT_script():
|
|
178 |
# TODO: variety of translation
|
179 |
|
180 |
# load term dictionary
|
181 |
-
|
182 |
-
|
183 |
-
# term_enzh_dict = {rows[0]:rows[1] for rows in csv_reader}
|
184 |
|
185 |
# change term
|
186 |
for seg in self.segments:
|
187 |
ready_words = seg.source_text.split(" ")
|
188 |
for i in range(len(ready_words)):
|
189 |
word = ready_words[i]
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
191 |
# if word[-2:] == ".\n":
|
192 |
# if word[:-2].lower() in term_enzh_dict:
|
193 |
# new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
|
@@ -206,15 +210,22 @@ class SRT_script():
|
|
206 |
## known bug: I've will be replaced because i've is not in the dict
|
207 |
|
208 |
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
|
213 |
for seg in self.segments:
|
214 |
ready_words = seg.source_text.split(" ")
|
215 |
for i in range(len(ready_words)):
|
216 |
word = ready_words[i]
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
# if word[-2:] == ".\n":
|
219 |
# real_word = word[:-2]
|
220 |
# if not dict.check(real_word.lower()):
|
@@ -265,4 +276,16 @@ class SRT_script():
|
|
265 |
if not dict.check(real_word):
|
266 |
if term_spellDict.suggest(real_word): # relax spell check
|
267 |
new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
|
268 |
-
return new_word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
# TODO: variety of translation
|
179 |
|
180 |
# load term dictionary
|
181 |
+
with open("dict_enzh.csv",'r', encoding='utf-8') as f:
|
182 |
+
term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
|
|
|
183 |
|
184 |
# change term
|
185 |
for seg in self.segments:
|
186 |
ready_words = seg.source_text.split(" ")
|
187 |
for i in range(len(ready_words)):
|
188 |
word = ready_words[i]
|
189 |
+
[real_word, pos] = self.get_real_word(word)
|
190 |
+
if real_word in term_enzh_dict:
|
191 |
+
new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
|
192 |
+
else:
|
193 |
+
new_word = word
|
194 |
+
ready_words[i] = new_word
|
195 |
# if word[-2:] == ".\n":
|
196 |
# if word[:-2].lower() in term_enzh_dict:
|
197 |
# new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
|
|
|
210 |
## known bug: I've will be replaced because i've is not in the dict
|
211 |
|
212 |
|
213 |
+
import enchant
|
214 |
+
dict = enchant.Dict('en_US')
|
215 |
+
term_spellDict = enchant.PyPWL('finetune_data/dict_freq.txt')
|
216 |
|
217 |
for seg in self.segments:
|
218 |
ready_words = seg.source_text.split(" ")
|
219 |
for i in range(len(ready_words)):
|
220 |
word = ready_words[i]
|
221 |
+
[real_word, pos] = self.get_real_word(word)
|
222 |
+
if not dict.check(real_word):
|
223 |
+
suggest = term_spellDict.suggest(real_word)
|
224 |
+
if suggest: # relax spell check
|
225 |
+
new_word = word.replace(word[:pos],suggest[0])
|
226 |
+
else:
|
227 |
+
new_word = word
|
228 |
+
ready_words[i] = new_word
|
229 |
# if word[-2:] == ".\n":
|
230 |
# real_word = word[:-2]
|
231 |
# if not dict.check(real_word.lower()):
|
|
|
276 |
if not dict.check(real_word):
|
277 |
if term_spellDict.suggest(real_word): # relax spell check
|
278 |
new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
|
279 |
+
return new_word
|
280 |
+
|
281 |
+
def get_real_word(self, word: str):
    """Split a whitespace-delimited token into its lookup form and cut index.

    The token may carry a trailing newline and/or one trailing punctuation
    mark (".", ",", "!" or "?"). Both are excluded from the lookup form so
    that dictionary checks see the bare word.

    Args:
        word: A single token, e.g. one element of ``text.split(" ")``.

    Returns:
        A ``(real_word, pos)`` tuple where ``real_word`` is the lower-cased
        token without the trailing newline/punctuation and ``pos`` is the
        index such that ``word[:pos]`` is the (original-case) part that was
        kept. Callers use ``word[:pos]`` as the substring to replace.
    """
    end = len(word)

    # Drop at most one trailing newline.
    if word.endswith("\n"):
        end -= 1

    # Drop at most one punctuation mark. Applying this after the newline
    # check means ",\n", "!\n" and "?\n" are handled like ".\n" instead of
    # leaving the punctuation glued to the word.
    if word[:end].endswith((".", ",", "!", "?")):
        end -= 1

    return word[:end].lower(), end
|