Eason Lu committed on
Commit
f2c3799
2 Parent(s): e75254e 915e02d

solving split bug; TODO: bilingual issue

Browse files

Former-commit-id: ca894904a230ea50f15ead328500ffb8a3ae6452

Files changed (2) hide show
  1. SRT.py +21 -8
  2. pipeline.py +7 -7
SRT.py CHANGED
@@ -193,8 +193,8 @@ class SRT_script():
193
 
194
  def split_seg(self, seg, threshold):
195
  # evenly split seg to 2 parts and add new seg into self.segments
196
- if seg.source_text[0] == ',':
197
- seg.source_text = seg.source_text[1:]
198
  if seg.translation[0] == ',':
199
  seg.translation = seg.translation[1:]
200
  source_text = seg.source_text
@@ -205,7 +205,10 @@ class SRT_script():
205
  src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
206
  else:
207
  src_space = [m.start() for m in re.finditer(' ', source_text)]
208
- src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
 
 
 
209
 
210
  if len(trans_commas) != 0:
211
  trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
@@ -248,7 +251,8 @@ class SRT_script():
248
 
249
 
250
  def check_len_and_split(self, threshold=30):
251
- # TODO: if sentence length >= threshold, split this segments to two
 
252
  segments = []
253
  for seg in self.segments:
254
  if len(seg.translation) > threshold:
@@ -262,20 +266,22 @@ class SRT_script():
262
  pass
263
 
264
  def check_len_and_split_range(self, range, threshold=30):
265
- # TODO: if sentence length >= threshold, split this segments to two
266
  start_seg_id = range[0]
267
  end_seg_id = range[1]
 
268
  segments = []
269
  for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
270
  if len(seg.translation) > threshold:
271
  seg_list = self.split_seg(seg, threshold)
272
  segments += seg_list
 
273
  else:
274
  segments.append(seg)
275
 
276
  self.segments[start_seg_id-1:end_seg_id] = segments
277
 
278
- return len(segments)
279
 
280
  def get_source_only(self):
281
  # return a string with pure source text
@@ -419,7 +425,12 @@ class SRT_script():
419
  start_seg_id = range[0]
420
  end_seg_id = range[1]
421
  with open(path, "a", encoding='utf-8') as f:
422
- for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id+length]):
 
 
 
 
 
423
  f.write(f'{i+idx}\n')
424
  f.write(seg.get_trans_str())
425
  pass
@@ -428,7 +439,9 @@ class SRT_script():
428
  start_seg_id = range[0]
429
  end_seg_id = range[1]
430
  with open(path, "a", encoding='utf-8') as f:
431
- for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id+length]):
 
 
432
  f.write(f'{i+idx}\n')
433
  f.write(seg.get_bilingual_str())
434
  pass
 
193
 
194
  def split_seg(self, seg, threshold):
195
  # evenly split seg to 2 parts and add new seg into self.segments
196
+ if seg.source_text[:2] == ', ':
197
+ seg.source_text = seg.source_text[2:]
198
  if seg.translation[0] == ',':
199
  seg.translation = seg.translation[1:]
200
  source_text = seg.source_text
 
205
  src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
206
  else:
207
  src_space = [m.start() for m in re.finditer(' ', source_text)]
208
+ if len(src_space) > 0:
209
+ src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
210
+ else:
211
+ src_split_idx = 0
212
 
213
  if len(trans_commas) != 0:
214
  trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
 
251
 
252
 
253
  def check_len_and_split(self, threshold=30):
254
+ # DEPRECATED
255
+ # if sentence length >= threshold, split this segments to two
256
  segments = []
257
  for seg in self.segments:
258
  if len(seg.translation) > threshold:
 
266
  pass
267
 
268
  def check_len_and_split_range(self, range, threshold=30):
269
+ # if sentence length >= threshold, split this segments to two
270
  start_seg_id = range[0]
271
  end_seg_id = range[1]
272
+ extra_len = 0
273
  segments = []
274
  for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
275
  if len(seg.translation) > threshold:
276
  seg_list = self.split_seg(seg, threshold)
277
  segments += seg_list
278
+ extra_len += len(seg_list) - 1
279
  else:
280
  segments.append(seg)
281
 
282
  self.segments[start_seg_id-1:end_seg_id] = segments
283
 
284
+ return extra_len
285
 
286
  def get_source_only(self):
287
  # return a string with pure source text
 
425
  start_seg_id = range[0]
426
  end_seg_id = range[1]
427
  with open(path, "a", encoding='utf-8') as f:
428
+ # for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id+length]):
429
+ # f.write(f'{i+idx}\n')
430
+ # f.write(seg.get_trans_str())
431
+ for i, seg in enumerate(self.segments):
432
+ if i<range[0]-1: continue
433
+ if i>=range[1] + length:break
434
  f.write(f'{i+idx}\n')
435
  f.write(seg.get_trans_str())
436
  pass
 
439
  start_seg_id = range[0]
440
  end_seg_id = range[1]
441
  with open(path, "a", encoding='utf-8') as f:
442
+ for i, seg in enumerate(self.segments):
443
+ if i<range[0]-1: continue
444
+ if i>=range[1] + length:break
445
  f.write(f'{i+idx}\n')
446
  f.write(seg.get_bilingual_str())
447
  pass
pipeline.py CHANGED
@@ -47,8 +47,8 @@ if args.video_name == 'placeholder' :
47
  VIDEO_NAME = args.audio_file.split('/')[-1].split('.')[0]
48
  elif args.srt_file is not None:
49
  VIDEO_NAME = args.srt_file.split('/')[-1].split('.')[0]
50
- else:
51
- VIDEO_NAME = args.video_name
52
 
53
  model_name = args.model_name
54
 
@@ -260,12 +260,12 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
260
  flag = True
261
  # add read-time output back and modify the post-processing by using one batch as an unit.
262
  srt.set_translation(translate, range, model_name)
263
- # add_length = srt.check_len_and_split_range(range)
264
- # srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
265
- # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
266
 
267
- srt.check_len_and_split()
268
- srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
269
  # srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
270
 
271
  if not args.only_srt:
 
47
  VIDEO_NAME = args.audio_file.split('/')[-1].split('.')[0]
48
  elif args.srt_file is not None:
49
  VIDEO_NAME = args.srt_file.split('/')[-1].split('.')[0]
50
+ else:
51
+ VIDEO_NAME = args.video_name
52
 
53
  model_name = args.model_name
54
 
 
260
  flag = True
261
  # add read-time output back and modify the post-processing by using one batch as an unit.
262
  srt.set_translation(translate, range, model_name)
263
+ add_length = srt.check_len_and_split_range(range)
264
+ srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
265
+ srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
266
 
267
+ # srt.check_len_and_split()
268
+ # srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
269
  # srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
270
 
271
  if not args.only_srt: