DWizard committed on
Commit
1de6702
1 Parent(s): d438792

optimize spell check

Browse files

Former-commit-id: 26b78cae5e1d9aaf72a21fd13900a9fdb5f381d4

Files changed (1) hide show
  1. SRT.py +32 -9
SRT.py CHANGED
@@ -178,16 +178,20 @@ class SRT_script():
178
  # TODO: variety of translation
179
 
180
  # load term dictionary
181
- # with open("dict_enzh.csv",'r', encoding='utf-8') as f:
182
- # csv_reader = reader(f)
183
- # term_enzh_dict = {rows[0]:rows[1] for rows in csv_reader}
184
 
185
  # change term
186
  for seg in self.segments:
187
  ready_words = seg.source_text.split(" ")
188
  for i in range(len(ready_words)):
189
  word = ready_words[i]
190
- ready_words[i] = self.spell_correction(word, 0)
 
 
 
 
 
191
  # if word[-2:] == ".\n":
192
  # if word[:-2].lower() in term_enzh_dict:
193
  # new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
@@ -206,15 +210,22 @@ class SRT_script():
206
  ## known bug: I've will be replaced because i've is not in the dict
207
 
208
 
209
- # import enchant
210
- # dict = enchant.Dict('en_US')
211
- # term_spellDict = enchant.PyPWL('project-t/finetune_data/dict_freq.txt')
212
 
213
  for seg in self.segments:
214
  ready_words = seg.source_text.split(" ")
215
  for i in range(len(ready_words)):
216
  word = ready_words[i]
217
- ready_words[i] = self.spell_correction(word, 1)
 
 
 
 
 
 
 
218
  # if word[-2:] == ".\n":
219
  # real_word = word[:-2]
220
  # if not dict.check(real_word.lower()):
@@ -265,4 +276,16 @@ class SRT_script():
265
  if not dict.check(real_word):
266
  if term_spellDict.suggest(real_word): # relax spell check
267
  new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
268
- return new_word
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  # TODO: variety of translation
179
 
180
  # load term dictionary
181
+ with open("dict_enzh.csv",'r', encoding='utf-8') as f:
182
+ term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
 
183
 
184
  # change term
185
  for seg in self.segments:
186
  ready_words = seg.source_text.split(" ")
187
  for i in range(len(ready_words)):
188
  word = ready_words[i]
189
+ [real_word, pos] = self.get_real_word(word)
190
+ if real_word in term_enzh_dict:
191
+ new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
192
+ else:
193
+ new_word = word
194
+ ready_words[i] = new_word
195
  # if word[-2:] == ".\n":
196
  # if word[:-2].lower() in term_enzh_dict:
197
  # new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
 
210
  ## known bug: I've will be replaced because i've is not in the dict
211
 
212
 
213
+ import enchant
214
+ dict = enchant.Dict('en_US')
215
+ term_spellDict = enchant.PyPWL('finetune_data/dict_freq.txt')
216
 
217
  for seg in self.segments:
218
  ready_words = seg.source_text.split(" ")
219
  for i in range(len(ready_words)):
220
  word = ready_words[i]
221
+ [real_word, pos] = self.get_real_word(word)
222
+ if not dict.check(real_word):
223
+ suggest = term_spellDict.suggest(real_word)
224
+ if suggest: # relax spell check
225
+ new_word = word.replace(word[:pos],suggest[0])
226
+ else:
227
+ new_word = word
228
+ ready_words[i] = new_word
229
  # if word[-2:] == ".\n":
230
  # real_word = word[:-2]
231
  # if not dict.check(real_word.lower()):
 
276
  if not dict.check(real_word):
277
  if term_spellDict.suggest(real_word): # relax spell check
278
  new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
279
+ return new_word
280
+
281
def get_real_word(self, word: str):
    """Split a whitespace-delimited token into its core word and suffix offset.

    Trailing sentence punctuation and newlines (any run of ``.``, ``,``,
    ``!``, ``?`` and ``"\\n"``) are removed from the end of *word*; the
    remaining text is lower-cased for dictionary lookups.

    Args:
        word: A single token, possibly ending in punctuation and/or a
            newline (e.g. ``"Hello.\\n"`` or ``"Really?\\n"``).

    Returns:
        A ``(real_word, pos)`` tuple where ``real_word`` is the lower-cased
        token without its trailing punctuation and ``pos`` is the index in
        *word* at which that trailing run starts, so ``word[:pos]`` is the
        original-case core and ``word[pos:]`` is the stripped suffix.
    """
    # Strip the whole trailing run rather than only ".\n" or one character:
    # tokens such as "Really?\n" or "wait..." would otherwise keep
    # punctuation inside real_word and never match a dictionary entry.
    trailing_chars = ".\n,!?"
    core = word.rstrip(trailing_chars)
    return core.lower(), len(core)