Spaces:
Sleeping
Sleeping
Merge branch 'eason/main' into SRT_cleanup
Browse files
Former-commit-id: a9d17c97f798adbdbe29c5cf413d14ff8848236d
- SRT.py +16 -5
- pipeline.py +3 -3
SRT.py
CHANGED
@@ -146,7 +146,7 @@ class SRT_script():
|
|
146 |
merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
|
147 |
sentence = []
|
148 |
for i, seg in enumerate(self.segments):
|
149 |
-
if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10:
|
150 |
sentence.append(i)
|
151 |
merge_list.append(sentence)
|
152 |
sentence = []
|
@@ -157,7 +157,7 @@ class SRT_script():
|
|
157 |
for idx_list in merge_list:
|
158 |
segments.append(self.merge_segs(idx_list))
|
159 |
|
160 |
-
self.segments = segments
|
161 |
|
162 |
def remove_trans_punctuation(self):
|
163 |
"""
|
@@ -417,9 +417,20 @@ class SRT_script():
|
|
417 |
[real_word, pos] = self.get_real_word(word)
|
418 |
if not dict.check(word[:pos]):
|
419 |
suggest = term_spellDict.suggest(real_word)
|
420 |
-
|
421 |
-
|
422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
new_word = word
|
424 |
ready_words[i] = new_word
|
425 |
seg.source_text = " ".join(ready_words)
|
|
|
146 |
merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
|
147 |
sentence = []
|
148 |
for i, seg in enumerate(self.segments):
|
149 |
+
if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
|
150 |
sentence.append(i)
|
151 |
merge_list.append(sentence)
|
152 |
sentence = []
|
|
|
157 |
for idx_list in merge_list:
|
158 |
segments.append(self.merge_segs(idx_list))
|
159 |
|
160 |
+
self.segments = segments
|
161 |
|
162 |
def remove_trans_punctuation(self):
|
163 |
"""
|
|
|
417 |
[real_word, pos] = self.get_real_word(word)
|
418 |
if not dict.check(word[:pos]):
|
419 |
suggest = term_spellDict.suggest(real_word)
|
420 |
+
|
421 |
+
if suggest and enchant.utils.levenshtein(word, suggest[0]) < (len(word)+len(suggest[0]))/4: # relax spell check
|
422 |
+
|
423 |
+
#with open("dislog.log","a") as log:
|
424 |
+
# if not os.path.exists("dislog.log"):
|
425 |
+
# log.write("word \t suggest \t levenshtein \n")
|
426 |
+
# log.write(word + "\t" + suggest[0] + "\t" + str(enchant.utils.levenshtein(word, suggest[0]))+'\n')
|
427 |
+
print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
|
428 |
+
|
429 |
+
|
430 |
+
new_word = word.replace(word[:pos],suggest[0])
|
431 |
+
else:
|
432 |
+
new_word = word
|
433 |
+
else:
|
434 |
new_word = word
|
435 |
ready_words[i] = new_word
|
436 |
seg.source_text = " ".join(ready_words)
|
pipeline.py
CHANGED
@@ -80,7 +80,7 @@ def get_sources(args, download_path, result_path, video_name):
|
|
80 |
audio_path = args.audio_file
|
81 |
pass
|
82 |
|
83 |
-
return audio_path, audio_file, video_path
|
84 |
|
85 |
def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'base', method = "stable"):
|
86 |
# Instead of using the script_en variable directly, we'll use script_input
|
@@ -249,13 +249,13 @@ def main():
|
|
249 |
else:
|
250 |
VIDEO_NAME = args.video_name
|
251 |
|
252 |
-
audio_path, audio_file, video_path = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
|
253 |
|
254 |
srt_file_en, srt = get_srt_class(args.srt_file, RESULT_PATH, VIDEO_NAME, audio_path, audio_file)
|
255 |
|
256 |
# SRT class preprocess
|
257 |
srt.form_whole_sentence()
|
258 |
-
|
259 |
srt.correct_with_force_term()
|
260 |
srt.write_srt_file_src(srt_file_en)
|
261 |
script_input = srt.get_source_only()
|
|
|
80 |
audio_path = args.audio_file
|
81 |
pass
|
82 |
|
83 |
+
return audio_path, audio_file, video_path, video_name
|
84 |
|
85 |
def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'base', method = "stable"):
|
86 |
# Instead of using the script_en variable directly, we'll use script_input
|
|
|
249 |
else:
|
250 |
VIDEO_NAME = args.video_name
|
251 |
|
252 |
+
audio_path, audio_file, video_path, VIDEO_NAME = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
|
253 |
|
254 |
srt_file_en, srt = get_srt_class(args.srt_file, RESULT_PATH, VIDEO_NAME, audio_path, audio_file)
|
255 |
|
256 |
# SRT class preprocess
|
257 |
srt.form_whole_sentence()
|
258 |
+
srt.spell_check_term()
|
259 |
srt.correct_with_force_term()
|
260 |
srt.write_srt_file_src(srt_file_en)
|
261 |
script_input = srt.get_source_only()
|