Eason Lu committed on
Commit
3cc60a3
1 Parent(s): ea307e6

solving conflict

Browse files

Former-commit-id: f1806577d7055d0cc9b0f3a06cebc11d99a400f9

Files changed (2) hide show
  1. SRT.py +7 -5
  2. pipeline.py +9 -31
SRT.py CHANGED
@@ -109,7 +109,6 @@ class SRT_script():
109
 
110
  self.segments = segments # need memory release?
111
 
112
-
113
 
114
  def set_translation(self, translate:str, id_range:tuple, model, video_name, video_link=None):
115
  start_seg_id = id_range[0]
@@ -196,15 +195,18 @@ class SRT_script():
196
  seg.translation = lines[i].split(":" or ":")[1]
197
  except:
198
  seg.translation = lines[i]
199
- #print(lines[i])
200
- pass
201
 
202
  def split_seg(self, seg, threshold):
203
  # evenly split seg to 2 parts and add new seg into self.segments
204
- if seg.source_text[:2] == ', ':
205
- seg.source_text = seg.source_text[2:]
 
 
 
206
  if seg.translation[0] == ',':
207
  seg.translation = seg.translation[1:]
 
208
  source_text = seg.source_text
209
  translation = seg.translation
210
  src_commas = [m.start() for m in re.finditer(',', source_text)]
 
109
 
110
  self.segments = segments # need memory release?
111
 
 
112
 
113
  def set_translation(self, translate:str, id_range:tuple, model, video_name, video_link=None):
114
  start_seg_id = id_range[0]
 
195
  seg.translation = lines[i].split(":" or ":")[1]
196
  except:
197
  seg.translation = lines[i]
198
+
 
199
 
200
  def split_seg(self, seg, threshold):
201
  # evenly split seg to 2 parts and add new seg into self.segments
202
+
203
+ # ignore the initial comma to solve the recursion problem
204
+ if len(seg.source_text) > 2:
205
+ if seg.source_text[:2] == ', ':
206
+ seg.source_text = seg.source_text[2:]
207
  if seg.translation[0] == ',':
208
  seg.translation = seg.translation[1:]
209
+
210
  source_text = seg.source_text
211
  translation = seg.translation
212
  src_commas = [m.start() for m in re.finditer(',', source_text)]
pipeline.py CHANGED
@@ -133,24 +133,17 @@ else:
133
 
134
  # use stable-whisper
135
  model = stable_whisper.load_model('base')
136
- transcript = model.transcribe(audio_path, regroup = False)
137
- (
138
- transcript
139
- .split_by_punctuation(['.', '。', '?'])
140
- .merge_by_gap(.15, max_words=3)
141
- .merge_by_punctuation([' '])
142
- .split_by_punctuation(['.', '。', '?'])
143
- )
144
- # transcript.to_srt_vtt(srt_file_en)
145
  transcript = transcript.to_dict()
146
  srt = SRT_script(transcript['segments']) # read segments to SRT class
147
 
148
- #Write SRT file
149
-
150
- # from whisper.utils import WriteSRT
151
- # with open(srt_file_en, 'w', encoding="utf-8") as f:
152
- # writer = WriteSRT(RESULT_PATH)
153
- # writer.write_result(transcript, f)
154
  else:
155
  srt = SRT_script.parse_from_srt_file(srt_file_en)
156
 
@@ -241,21 +234,6 @@ def get_response(model_name, sentence):
241
  )
242
 
243
  return response['choices'][0]['message']['content'].strip()
244
-
245
- # if model_name == "text-davinci-003":
246
- # prompt = f"Please help me translate this into Chinese:\n\n{s}\n\n"
247
- # # print(prompt)
248
- # response = openai.Completion.create(
249
- # model=model_name,
250
- # prompt=prompt,
251
- # temperature=0.1,
252
- # max_tokens=2000,
253
- # top_p=1.0,
254
- # frequency_penalty=0.0,
255
- # presence_penalty=0.0
256
- # )
257
- # return response['choices'][0]['text'].strip()
258
- pass
259
 
260
 
261
  # Translate and save
@@ -283,7 +261,7 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
283
 
284
  srt.check_len_and_split()
285
  srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
286
- # srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
287
 
288
  if not args.only_srt:
289
  assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")
 
133
 
134
  # use stable-whisper
135
  model = stable_whisper.load_model('base')
136
+ transcript = model.transcribe(audio_path)
137
+ # (
138
+ # transcript
139
+ # .split_by_punctuation(['.', '。', '?'])
140
+ # .merge_by_gap(.15, max_words=3)
141
+ # .merge_by_punctuation([' '])
142
+ # .split_by_punctuation(['.', '。', '?'])
143
+ # )
 
144
  transcript = transcript.to_dict()
145
  srt = SRT_script(transcript['segments']) # read segments to SRT class
146
 
 
 
 
 
 
 
147
  else:
148
  srt = SRT_script.parse_from_srt_file(srt_file_en)
149
 
 
234
  )
235
 
236
  return response['choices'][0]['message']['content'].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
 
239
  # Translate and save
 
261
 
262
  srt.check_len_and_split()
263
  srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
264
+ srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
265
 
266
  if not args.only_srt:
267
  assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")