Spaces:
Sleeping
Sleeping
Fix split bug (TODO: bilingual issue)
Browse files
Former-commit-id: ca894904a230ea50f15ead328500ffb8a3ae6452
- SRT.py +21 -8
- pipeline.py +7 -7
SRT.py
CHANGED
@@ -193,8 +193,8 @@ class SRT_script():
|
|
193 |
|
194 |
def split_seg(self, seg, threshold):
|
195 |
# evenly split seg to 2 parts and add new seg into self.segments
|
196 |
-
if seg.source_text[
|
197 |
-
seg.source_text = seg.source_text[
|
198 |
if seg.translation[0] == ',':
|
199 |
seg.translation = seg.translation[1:]
|
200 |
source_text = seg.source_text
|
@@ -205,7 +205,10 @@ class SRT_script():
|
|
205 |
src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
|
206 |
else:
|
207 |
src_space = [m.start() for m in re.finditer(' ', source_text)]
|
208 |
-
|
|
|
|
|
|
|
209 |
|
210 |
if len(trans_commas) != 0:
|
211 |
trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
|
@@ -248,7 +251,8 @@ class SRT_script():
|
|
248 |
|
249 |
|
250 |
def check_len_and_split(self, threshold=30):
|
251 |
-
#
|
|
|
252 |
segments = []
|
253 |
for seg in self.segments:
|
254 |
if len(seg.translation) > threshold:
|
@@ -262,20 +266,22 @@ class SRT_script():
|
|
262 |
pass
|
263 |
|
264 |
def check_len_and_split_range(self, range, threshold=30):
|
265 |
-
#
|
266 |
start_seg_id = range[0]
|
267 |
end_seg_id = range[1]
|
|
|
268 |
segments = []
|
269 |
for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
270 |
if len(seg.translation) > threshold:
|
271 |
seg_list = self.split_seg(seg, threshold)
|
272 |
segments += seg_list
|
|
|
273 |
else:
|
274 |
segments.append(seg)
|
275 |
|
276 |
self.segments[start_seg_id-1:end_seg_id] = segments
|
277 |
|
278 |
-
return
|
279 |
|
280 |
def get_source_only(self):
|
281 |
# return a string with pure source text
|
@@ -419,7 +425,12 @@ class SRT_script():
|
|
419 |
start_seg_id = range[0]
|
420 |
end_seg_id = range[1]
|
421 |
with open(path, "a", encoding='utf-8') as f:
|
422 |
-
for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id+length]):
|
|
|
|
|
|
|
|
|
|
|
423 |
f.write(f'{i+idx}\n')
|
424 |
f.write(seg.get_trans_str())
|
425 |
pass
|
@@ -428,7 +439,9 @@ class SRT_script():
|
|
428 |
start_seg_id = range[0]
|
429 |
end_seg_id = range[1]
|
430 |
with open(path, "a", encoding='utf-8') as f:
|
431 |
-
for i, seg in enumerate(self.segments
|
|
|
|
|
432 |
f.write(f'{i+idx}\n')
|
433 |
f.write(seg.get_bilingual_str())
|
434 |
pass
|
|
|
193 |
|
194 |
def split_seg(self, seg, threshold):
|
195 |
# evenly split seg to 2 parts and add new seg into self.segments
|
196 |
+
if seg.source_text[:2] == ', ':
|
197 |
+
seg.source_text = seg.source_text[2:]
|
198 |
if seg.translation[0] == ',':
|
199 |
seg.translation = seg.translation[1:]
|
200 |
source_text = seg.source_text
|
|
|
205 |
src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
|
206 |
else:
|
207 |
src_space = [m.start() for m in re.finditer(' ', source_text)]
|
208 |
+
if len(src_space) > 0:
|
209 |
+
src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
|
210 |
+
else:
|
211 |
+
src_split_idx = 0
|
212 |
|
213 |
if len(trans_commas) != 0:
|
214 |
trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
|
|
|
251 |
|
252 |
|
253 |
def check_len_and_split(self, threshold=30):
|
254 |
+
# DEPRECATED
|
255 |
+
# if the translation is longer than threshold, split this segment into two
|
256 |
segments = []
|
257 |
for seg in self.segments:
|
258 |
if len(seg.translation) > threshold:
|
|
|
266 |
pass
|
267 |
|
268 |
def check_len_and_split_range(self, range, threshold=30):
|
269 |
+
# if the translation is longer than threshold, split this segment into two
|
270 |
start_seg_id = range[0]
|
271 |
end_seg_id = range[1]
|
272 |
+
extra_len = 0
|
273 |
segments = []
|
274 |
for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
275 |
if len(seg.translation) > threshold:
|
276 |
seg_list = self.split_seg(seg, threshold)
|
277 |
segments += seg_list
|
278 |
+
extra_len += len(seg_list) - 1
|
279 |
else:
|
280 |
segments.append(seg)
|
281 |
|
282 |
self.segments[start_seg_id-1:end_seg_id] = segments
|
283 |
|
284 |
+
return extra_len
|
285 |
|
286 |
def get_source_only(self):
|
287 |
# return a string with pure source text
|
|
|
425 |
start_seg_id = range[0]
|
426 |
end_seg_id = range[1]
|
427 |
with open(path, "a", encoding='utf-8') as f:
|
428 |
+
# for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id+length]):
|
429 |
+
# f.write(f'{i+idx}\n')
|
430 |
+
# f.write(seg.get_trans_str())
|
431 |
+
for i, seg in enumerate(self.segments):
|
432 |
+
if i<range[0]-1: continue
|
433 |
+
if i>=range[1] + length:break
|
434 |
f.write(f'{i+idx}\n')
|
435 |
f.write(seg.get_trans_str())
|
436 |
pass
|
|
|
439 |
start_seg_id = range[0]
|
440 |
end_seg_id = range[1]
|
441 |
with open(path, "a", encoding='utf-8') as f:
|
442 |
+
for i, seg in enumerate(self.segments):
|
443 |
+
if i<range[0]-1: continue
|
444 |
+
if i>=range[1] + length:break
|
445 |
f.write(f'{i+idx}\n')
|
446 |
f.write(seg.get_bilingual_str())
|
447 |
pass
|
pipeline.py
CHANGED
@@ -47,8 +47,8 @@ if args.video_name == 'placeholder' :
|
|
47 |
VIDEO_NAME = args.audio_file.split('/')[-1].split('.')[0]
|
48 |
elif args.srt_file is not None:
|
49 |
VIDEO_NAME = args.srt_file.split('/')[-1].split('.')[0]
|
50 |
-
else:
|
51 |
-
|
52 |
|
53 |
model_name = args.model_name
|
54 |
|
@@ -260,12 +260,12 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
|
|
260 |
flag = True
|
261 |
# add real-time output back and modify the post-processing to use one batch as a unit.
|
262 |
srt.set_translation(translate, range, model_name)
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
|
267 |
-
srt.check_len_and_split()
|
268 |
-
srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
|
269 |
# srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
|
270 |
|
271 |
if not args.only_srt:
|
|
|
47 |
VIDEO_NAME = args.audio_file.split('/')[-1].split('.')[0]
|
48 |
elif args.srt_file is not None:
|
49 |
VIDEO_NAME = args.srt_file.split('/')[-1].split('.')[0]
|
50 |
+
else:
|
51 |
+
VIDEO_NAME = args.video_name
|
52 |
|
53 |
model_name = args.model_name
|
54 |
|
|
|
260 |
flag = True
|
261 |
# add real-time output back and modify the post-processing to use one batch as a unit.
|
262 |
srt.set_translation(translate, range, model_name)
|
263 |
+
add_length = srt.check_len_and_split_range(range)
|
264 |
+
srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
|
265 |
+
srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
|
266 |
|
267 |
+
# srt.check_len_and_split()
|
268 |
+
# srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
|
269 |
# srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
|
270 |
|
271 |
if not args.only_srt:
|