JiaenLiu committed on
Commit
e3825f8
1 Parent(s): 1e2d254

add batch output and continue translate function

Browse files

Former-commit-id: 9163523f8449d647b6b1fa0da45ef72afa4ff0c3

Files changed (2) hide show
  1. SRT.py +38 -3
  2. pipeline.py +60 -23
SRT.py CHANGED
@@ -3,6 +3,7 @@ from csv import reader
3
  from datetime import datetime
4
  import re
5
  import openai
 
6
 
7
  class SRT_segment(object):
8
  def __init__(self, *args) -> None:
@@ -180,7 +181,7 @@ class SRT_script():
180
  #print(lines[i])
181
  pass
182
 
183
- def split_seg(self, seg, threshold):
184
  # TODO: evenly split seg to 2 parts and add new seg into self.segments
185
  source_text = seg.source_text
186
  translation = seg.translation
@@ -193,7 +194,7 @@ class SRT_script():
193
  src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
194
 
195
  if len(trans_commas) != 0:
196
- trans_split_idx = trans_commas[len(src_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
197
  else:
198
  trans_split_idx = len(translation)//2
199
 
@@ -246,6 +247,22 @@ class SRT_script():
246
 
247
  pass
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  def get_source_only(self):
250
  # return a string with pure source text
251
  result = ""
@@ -383,4 +400,22 @@ class SRT_script():
383
  else:
384
  real_word = word.lower()
385
  n = 0
386
- return real_word, len(word)+n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from datetime import datetime
4
  import re
5
  import openai
6
+ from collections import deque
7
 
8
  class SRT_segment(object):
9
  def __init__(self, *args) -> None:
 
181
  #print(lines[i])
182
  pass
183
 
184
+ def split_seg(self, seg, threshold=500):
185
  # TODO: evenly split seg to 2 parts and add new seg into self.segments
186
  source_text = seg.source_text
187
  translation = seg.translation
 
194
  src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
195
 
196
  if len(trans_commas) != 0:
197
+ trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
198
  else:
199
  trans_split_idx = len(translation)//2
200
 
 
247
 
248
  pass
249
 
250
def check_len_and_split_range(self, range, threshold=30):
    """Split over-long segments inside a range of ``self.segments``.

    Every segment in the range whose translation is strictly longer than
    ``threshold`` characters is replaced, in place, by the list that
    ``self.split_seg(seg, threshold)`` returns.

    Args:
        range: ``(start_seg_id, end_seg_id)``, 1-based and inclusive.
            (The name shadows the builtin; it is kept so existing callers
            are unaffected.)
        threshold: maximum translation length that is left unsplit.

    Returns:
        The number of segments *added* to ``self.segments`` by splitting
        (0 when nothing was split). ``realtime_write_srt`` and
        ``realtime_bilingual_write_srt`` add this value to ``end_seg_id``
        to find the end of the range, so it has to be the growth of the
        range, not its total size.
    """
    start_seg_id = range[0]
    end_seg_id = range[1]
    batch = self.segments[start_seg_id - 1:end_seg_id]

    segments = []
    for seg in batch:
        if len(seg.translation) > threshold:
            segments += self.split_seg(seg, threshold)
        else:
            segments.append(seg)

    # Slice assignment swaps the old range for the (possibly longer) new one.
    self.segments[start_seg_id - 1:end_seg_id] = segments

    return len(segments) - len(batch)
265
+
266
  def get_source_only(self):
267
  # return a string with pure source text
268
  result = ""
 
400
  else:
401
  real_word = word.lower()
402
  n = 0
403
+ return real_word, len(word)+n
404
+
405
def realtime_write_srt(self, path, range, length, idx):
    """Append one batch of translated segments to an SRT file.

    Args:
        path: output file; opened in append mode, UTF-8 encoded.
        range: ``(start_seg_id, end_seg_id)``, 1-based and inclusive, as
            it was before any segment in the batch was split.
        length: number of extra segments that splitting added to the
            range; the written slice ends at ``end_seg_id + length``.
        idx: subtitle number given to ``self.segments[0]``; each written
            segment is numbered by its position in ``self.segments`` plus
            ``idx``.

    Numbering uses the absolute position rather than the position inside
    the batch, so the numbers keep increasing across successive appends
    instead of restarting at ``idx`` for every batch.
    """
    first = range[0] - 1
    last = range[1] + length
    with open(path, "a", encoding='utf-8') as f:
        for i, seg in enumerate(self.segments[first:last], start=first):
            f.write(f'{i + idx}\n')
            f.write(seg.get_trans_str())
413
+
414
def realtime_bilingual_write_srt(self, path, range, length, idx):
    """Append one batch of bilingual segments to an SRT file.

    Args:
        path: output file; opened in append mode, UTF-8 encoded.
        range: ``(start_seg_id, end_seg_id)``, 1-based and inclusive, as
            it was before any segment in the batch was split.
        length: number of extra segments that splitting added to the
            range; the written slice ends at ``end_seg_id + length``.
        idx: subtitle number given to ``self.segments[0]``; each written
            segment is numbered by its position in ``self.segments`` plus
            ``idx``.

    Numbering uses the absolute position rather than the position inside
    the batch, so the numbers keep increasing across successive appends
    instead of restarting at ``idx`` for every batch.
    """
    first = range[0] - 1
    last = range[1] + length
    with open(path, "a", encoding='utf-8') as f:
        for i, seg in enumerate(self.segments[first:last], start=first):
            f.write(f'{i + idx}\n')
            f.write(seg.get_bilingual_str())
pipeline.py CHANGED
@@ -52,6 +52,8 @@ else:
52
 
53
  model_name = args.model_name
54
 
 
 
55
  # get source audio
56
  if args.link is not None and args.video_file is None:
57
  # Download audio from YouTube
@@ -140,7 +142,7 @@ else:
140
  else:
141
  srt = SRT_script.parse_from_srt_file(srt_file_en)
142
 
143
- # srt preprocess
144
  srt.form_whole_sentence()
145
  srt.spell_check_term()
146
  srt.correct_with_force_term()
@@ -179,8 +181,38 @@ def script_split(script_in, chunk_size = 1000):
179
  return script_arr, range_arr
180
 
181
  script_arr, range_arr = script_split(script_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- def get_response(model_name):
184
  if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
185
  # print(s + "\n")
186
  response = openai.ChatCompletion.create(
@@ -189,48 +221,53 @@ def get_response(model_name):
189
  {"role": "system", "content": "You are a helpful assistant that translates English to Chinese and have decent background in starcraft2."},
190
  {"role": "system", "content": "Your translation has to keep the orginal format and be as accurate as possible."},
191
  {"role": "system", "content": "There is no need for you to add any comments or notes."},
192
- {"role": "user", "content": 'Translate the following English text to Chinese: "{}"'.format(s)}
193
  ],
194
  temperature=0.15
195
  )
196
 
197
  return response['choices'][0]['message']['content'].strip()
198
-
199
- if model_name == "text-davinci-003":
200
- prompt = f"Please help me translate this into Chinese:\n\n{s}\n\n"
201
- # print(prompt)
202
- response = openai.Completion.create(
203
- model=model_name,
204
- prompt=prompt,
205
- temperature=0.1,
206
- max_tokens=2000,
207
- top_p=1.0,
208
- frequency_penalty=0.0,
209
- presence_penalty=0.0
210
- )
211
- return response['choices'][0]['text'].strip()
212
  pass
213
 
214
 
215
  # Translate and save
216
- for s, range in tqdm(zip(script_arr, range_arr)):
217
  # using chatgpt model
218
  print(f"now translating sentences {range}")
219
  flag = True
220
  while flag:
221
  flag = False
222
  try:
223
- translate = get_response(model_name)
224
  except Exception as e:
225
  print("An error has occurred during translation:",e)
226
- print("Retrying...")
227
  time.sleep(30)
228
  flag = True
 
 
229
  srt.set_translation(translate, range, model_name)
 
 
 
230
 
231
- srt.check_len_and_split()
232
- srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
233
- srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
234
 
235
  if not args.only_srt:
236
  assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")
 
52
 
53
  model_name = args.model_name
54
 
55
+ threshold = 30
56
+
57
  # get source audio
58
  if args.link is not None and args.video_file is None:
59
  # Download audio from YouTube
 
142
  else:
143
  srt = SRT_script.parse_from_srt_file(srt_file_en)
144
 
145
+ # srt class preprocess
146
  srt.form_whole_sentence()
147
  srt.spell_check_term()
148
  srt.correct_with_force_term()
 
181
  return script_arr, range_arr
182
 
183
  script_arr, range_arr = script_split(script_input)
184
+ # print(script_arr, range_arr)
185
+
186
# Check whether a previous run already produced (part of) the translation.
# If so, resume: copy the not-yet-translated tail of the English SRT into a
# temp file and re-parse `srt` from it.
zh_file = "{}/{}/{}_zh.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
segidx = 1  # subtitle number at which newly written output starts
if os.path.exists(zh_file):
    temp_file = "{}/{}/temp.srt".format(RESULT_PATH, VIDEO_NAME)
    if os.path.exists(temp_file):
        os.remove(temp_file)
    # The translated file is written as UTF-8, so read it back the same way.
    # Counting with sum() also copes with an empty file, where the previous
    # enumerate() loop never bound `count`.
    with open(zh_file, "r", encoding="utf-8") as f0:
        count = sum(1 for _ in f0)
    # assumes every written segment occupies exactly 4 lines -- TODO confirm
    segidx = count // 4 + 1

    with open("{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME), "r") as f1, open(temp_file, "a") as f2:
        x = f1.readlines()
        if count >= len(x):
            print('Work already done! Please delete {}_zh.srt files in result directory first in order to rework'.format(VIDEO_NAME))
            exit()
        # Keep only the lines that come after the already-translated part.
        f2.writelines(x[count:])

    srt = SRT_script.parse_from_srt_file(temp_file)
    print('temp_contents')
    print(srt.get_source_only())
+
214
 
215
+ def get_response(model_name, sentence):
216
  if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
217
  # print(s + "\n")
218
  response = openai.ChatCompletion.create(
 
221
  {"role": "system", "content": "You are a helpful assistant that translates English to Chinese and have decent background in starcraft2."},
222
  {"role": "system", "content": "Your translation has to keep the orginal format and be as accurate as possible."},
223
  {"role": "system", "content": "There is no need for you to add any comments or notes."},
224
+ {"role": "user", "content": 'Translate the following English text to Chinese: "{}"'.format(sentence)}
225
  ],
226
  temperature=0.15
227
  )
228
 
229
  return response['choices'][0]['message']['content'].strip()
230
+
231
+ # if model_name == "text-davinci-003":
232
+ # prompt = f"Please help me translate this into Chinese:\n\n{s}\n\n"
233
+ # # print(prompt)
234
+ # response = openai.Completion.create(
235
+ # model=model_name,
236
+ # prompt=prompt,
237
+ # temperature=0.1,
238
+ # max_tokens=2000,
239
+ # top_p=1.0,
240
+ # frequency_penalty=0.0,
241
+ # presence_penalty=0.0
242
+ # )
243
+ # return response['choices'][0]['text'].strip()
244
  pass
245
 
246
 
247
  # Translate and save
248
# Number of segments inserted by splitting in earlier batches. range_arr was
# computed before any split, so later batches must be shifted by this amount.
split_offset = 0
for sentence, batch_range in tqdm(zip(script_arr, range_arr)):
    # 1-based inclusive (start, end) segment ids of this batch, moved past
    # the segments that earlier batches inserted.
    # NOTE(review): passes a tuple; confirm set_translation only indexes it.
    batch_range = (batch_range[0] + split_offset, batch_range[1] + split_offset)

    # using chatgpt model
    print(f"now translating sentences {batch_range}")
    # Retry until the API call succeeds, pausing between attempts.
    while True:
        try:
            translate = get_response(model_name, sentence)
            break
        except Exception as e:
            print("An error has occurred during translation:",e)
            print("Retrying... the script will continue after 30 seconds.")
            time.sleep(30)

    # Real-time output: post-process and append each batch as soon as it is
    # translated, using one batch as the unit.
    print(translate)
    srt.set_translation(translate, batch_range, model_name)

    # Measure how many segments the split added from the list length itself,
    # so the writers receive the growth of the batch, not its total size.
    segments_before = len(srt.segments)
    srt.check_len_and_split_range(batch_range)
    add_length = len(srt.segments) - segments_before

    srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",batch_range, add_length,segidx)
    srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",batch_range, add_length,segidx)
    split_offset += add_length
 
268
+ # srt.check_len_and_split()
269
+ # srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
270
+ # srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
271
 
272
  if not args.only_srt:
273
  assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")