yichenl5 committed on
Commit
c9578de
2 Parent(s): 48f0069 bf542d7

Merge branch 'eason/main' into SRT_cleanup

Browse files

Former-commit-id: a9d17c97f798adbdbe29c5cf413d14ff8848236d

Files changed (2) hide show
  1. SRT.py +16 -5
  2. pipeline.py +3 -3
SRT.py CHANGED
@@ -146,7 +146,7 @@ class SRT_script():
146
  merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
147
  sentence = []
148
  for i, seg in enumerate(self.segments):
149
- if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10:
150
  sentence.append(i)
151
  merge_list.append(sentence)
152
  sentence = []
@@ -157,7 +157,7 @@ class SRT_script():
157
  for idx_list in merge_list:
158
  segments.append(self.merge_segs(idx_list))
159
 
160
- self.segments = segments # need memory release?
161
 
162
  def remove_trans_punctuation(self):
163
  """
@@ -417,9 +417,20 @@ class SRT_script():
417
  [real_word, pos] = self.get_real_word(word)
418
  if not dict.check(word[:pos]):
419
  suggest = term_spellDict.suggest(real_word)
420
- if suggest: # relax spell check
421
- new_word = word.replace(word[:pos], suggest[0])
422
- else:
 
 
 
 
 
 
 
 
 
 
 
423
  new_word = word
424
  ready_words[i] = new_word
425
  seg.source_text = " ".join(ready_words)
 
146
  merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
147
  sentence = []
148
  for i, seg in enumerate(self.segments):
149
+ if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
150
  sentence.append(i)
151
  merge_list.append(sentence)
152
  sentence = []
 
157
  for idx_list in merge_list:
158
  segments.append(self.merge_segs(idx_list))
159
 
160
+ self.segments = segments
161
 
162
  def remove_trans_punctuation(self):
163
  """
 
417
  [real_word, pos] = self.get_real_word(word)
418
  if not dict.check(word[:pos]):
419
  suggest = term_spellDict.suggest(real_word)
420
+
421
+ if suggest and enchant.utils.levenshtein(word, suggest[0]) < (len(word)+len(suggest[0]))/4: # relax spell check
422
+
423
+ #with open("dislog.log","a") as log:
424
+ # if not os.path.exists("dislog.log"):
425
+ # log.write("word \t suggest \t levenshtein \n")
426
+ # log.write(word + "\t" + suggest[0] + "\t" + str(enchant.utils.levenshtein(word, suggest[0]))+'\n')
427
+ print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
428
+
429
+
430
+ new_word = word.replace(word[:pos],suggest[0])
431
+ else:
432
+ new_word = word
433
+ else:
434
  new_word = word
435
  ready_words[i] = new_word
436
  seg.source_text = " ".join(ready_words)
pipeline.py CHANGED
@@ -80,7 +80,7 @@ def get_sources(args, download_path, result_path, video_name):
80
  audio_path = args.audio_file
81
  pass
82
 
83
- return audio_path, audio_file, video_path
84
 
85
  def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'base', method = "stable"):
86
  # Instead of using the script_en variable directly, we'll use script_input
@@ -249,13 +249,13 @@ def main():
249
  else:
250
  VIDEO_NAME = args.video_name
251
 
252
- audio_path, audio_file, video_path = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
253
 
254
  srt_file_en, srt = get_srt_class(args.srt_file, RESULT_PATH, VIDEO_NAME, audio_path, audio_file)
255
 
256
  # SRT class preprocess
257
  srt.form_whole_sentence()
258
- # srt.spell_check_term()
259
  srt.correct_with_force_term()
260
  srt.write_srt_file_src(srt_file_en)
261
  script_input = srt.get_source_only()
 
80
  audio_path = args.audio_file
81
  pass
82
 
83
+ return audio_path, audio_file, video_path, video_name
84
 
85
  def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'base', method = "stable"):
86
  # Instead of using the script_en variable directly, we'll use script_input
 
249
  else:
250
  VIDEO_NAME = args.video_name
251
 
252
+ audio_path, audio_file, video_path, VIDEO_NAME = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
253
 
254
  srt_file_en, srt = get_srt_class(args.srt_file, RESULT_PATH, VIDEO_NAME, audio_path, audio_file)
255
 
256
  # SRT class preprocess
257
  srt.form_whole_sentence()
258
+ srt.spell_check_term()
259
  srt.correct_with_force_term()
260
  srt.write_srt_file_src(srt_file_en)
261
  script_input = srt.get_source_only()