worldqwq committed on
Commit
6273515
2 Parent(s): 6954766 b2ca465

Merge branch 'eason/refactor' into MergeFix

Browse files

Former-commit-id: 5cf3410f6825636ac35226749d95ec63b4af6072

SRT.py CHANGED
@@ -1,7 +1,6 @@
1
  from datetime import timedelta
2
- import os
3
- import whisper
4
  from csv import reader
 
5
  import re
6
  import openai
7
 
@@ -9,27 +8,41 @@ class SRT_segment(object):
9
  def __init__(self, *args) -> None:
10
  if isinstance(args[0], dict):
11
  segment = args[0]
12
- start_ms = int((segment['start']*100)%100*10)
13
- end_ms = int((segment['end']*100)%100*10)
14
- start_time = str(timedelta(seconds=int(segment['start']), milliseconds=start_ms))
15
- end_time = str(timedelta(seconds=int(segment['end']), milliseconds=end_ms))
16
- if start_ms == 0:
17
- self.start_time_str = str(0)+start_time.split('.')[0]+',000'
 
 
 
 
 
 
18
  else:
19
- self.start_time_str = str(0)+start_time.split('.')[0]+','+start_time.split('.')[1][:3]
20
- if end_ms == 0:
21
- self.end_time_str = str(0)+end_time.split('.')[0]+',000'
22
  else:
23
- self.end_time_str = str(0)+end_time.split('.')[0]+','+end_time.split('.')[1][:3]
24
- self.source_text = segment['text'][1:]
25
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
26
  self.translation = ""
27
 
28
  elif isinstance(args[0], list):
29
- self.source_text = args[0][2][:-1]
30
  self.duration = args[0][1]
31
  self.start_time_str = self.duration.split(" --> ")[0]
32
  self.end_time_str = self.duration.split(" --> ")[1]
 
 
 
 
 
 
 
 
33
  self.translation = ""
34
 
35
  def merge_seg(self, seg):
@@ -64,7 +77,7 @@ class SRT_script():
64
  for i in range(len(script_lines)):
65
  if i % 4 == 0:
66
  segments.append(list(script_lines[i:i+4]))
67
-
68
  return cls(segments)
69
 
70
  def merge_segs(self, idx_list) -> SRT_segment:
@@ -152,12 +165,70 @@ class SRT_script():
152
  #print(lines[i])
153
  pass
154
 
155
- def split_seg(self, seg_id):
156
  # TODO: evenly split seg to 2 parts and add new seg into self.segments
157
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- def check_len_and_split(self, threshold):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  # TODO: if sentence length >= threshold, split this segments to two
 
 
 
 
 
 
 
 
 
 
161
  pass
162
 
163
  def get_source_only(self):
@@ -211,29 +282,90 @@ class SRT_script():
211
  # TODO: variety of translation
212
 
213
  # load term dictionary
214
- with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
215
- csv_reader = reader(f)
216
- term_dict = {rows[0]:rows[1] for rows in csv_reader}
217
 
218
  # change term
219
  for seg in self.segments:
220
- ready_words = re.sub('\n', '\n ', seg.source_text).split(" ")
221
  for i in range(len(ready_words)):
222
  word = ready_words[i]
223
- if word[-2:] == ".\n" :
224
- if word[:-2].lower() in term_dict :
225
- new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
226
- ready_words[i] = new_word
227
- else:
228
- ready_words[i] = word + ' '
229
- elif word.lower() in term_dict :
230
- new_word = word.replace(word,term_dict.get(word.lower())) + ' '
231
- ready_words[i] = new_word
232
- else :
233
- ready_words[i]= word + ' '
234
- seg.source_text = re.sub('\n ', '\n', "".join(ready_words))
235
  pass
236
 
237
-
 
 
 
 
 
 
238
 
239
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from datetime import timedelta
 
 
2
  from csv import reader
3
+ from datetime import datetime
4
  import re
5
  import openai
6
 
 
8
  def __init__(self, *args) -> None:
9
  if isinstance(args[0], dict):
10
  segment = args[0]
11
+ self.start = segment['start']
12
+ self.end = segment['end']
13
+ self.start_ms = int((segment['start']*100)%100*10)
14
+ self.end_ms = int((segment['end']*100)%100*10)
15
+
16
+ if self.start_ms == self.end_ms and int(segment['start']) == int(segment['end']): # avoid empty time stamp
17
+ self.end_ms+=500
18
+
19
+ self.start_time = timedelta(seconds=int(segment['start']), milliseconds=self.start_ms)
20
+ self.end_time = timedelta(seconds=int(segment['end']), milliseconds=self.end_ms)
21
+ if self.start_ms == 0:
22
+ self.start_time_str = str(0)+str(self.start_time).split('.')[0]+',000'
23
  else:
24
+ self.start_time_str = str(0)+str(self.start_time).split('.')[0]+','+str(self.start_time).split('.')[1][:3]
25
+ if self.end_ms == 0:
26
+ self.end_time_str = str(0)+str(self.end_time).split('.')[0]+',000'
27
  else:
28
+ self.end_time_str = str(0)+str(self.end_time).split('.')[0]+','+str(self.end_time).split('.')[1][:3]
29
+ self.source_text = segment['text']
30
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
31
  self.translation = ""
32
 
33
  elif isinstance(args[0], list):
34
+ self.source_text = args[0][2]
35
  self.duration = args[0][1]
36
  self.start_time_str = self.duration.split(" --> ")[0]
37
  self.end_time_str = self.duration.split(" --> ")[1]
38
+
39
+ # parse the time to float
40
+ self.start_ms = int(self.start_time_str.split(',')[1])/10
41
+ self.end_ms = int(self.end_time_str.split(',')[1])/10
42
+ start_list = self.start_time_str.split(',')[0].split(':')
43
+ self.start = int(start_list[0])*3600 + int(start_list[1])*60 + int(start_list[2]) + self.start_ms/100
44
+ end_list = self.end_time_str.split(',')[0].split(':')
45
+ self.end = int(end_list[0])*3600 + int(end_list[1])*60 + int(end_list[2]) + self.end_ms/100
46
  self.translation = ""
47
 
48
  def merge_seg(self, seg):
 
77
  for i in range(len(script_lines)):
78
  if i % 4 == 0:
79
  segments.append(list(script_lines[i:i+4]))
80
+
81
  return cls(segments)
82
 
83
  def merge_segs(self, idx_list) -> SRT_segment:
 
165
  #print(lines[i])
166
  pass
167
 
168
def split_seg(self, seg, threshold):
    """Recursively split *seg* into pieces whose translation is at most
    *threshold* characters long.

    The source text is cut at its middle comma (falling back to the
    middle space, then to the literal midpoint); the translation is cut
    at its middle comma (falling back to its midpoint).  The time span
    is divided evenly between the two halves.

    Returns a list of SRT_segment objects replacing *seg*.
    """
    source_text = seg.source_text
    translation = seg.translation

    src_commas = [m.start() for m in re.finditer(',', source_text)]
    trans_commas = [m.start() for m in re.finditer(',', translation)]

    if src_commas:
        src_split_idx = (src_commas[len(src_commas) // 2]
                         if len(src_commas) % 2 == 1
                         else src_commas[len(src_commas) // 2 - 1])
    else:
        src_spaces = [m.start() for m in re.finditer(' ', source_text)]
        if src_spaces:
            src_split_idx = (src_spaces[len(src_spaces) // 2]
                             if len(src_spaces) % 2 == 1
                             else src_spaces[len(src_spaces) // 2 - 1])
        else:
            # no comma and no space at all — fall back to the midpoint
            # instead of raising IndexError on an empty list
            src_split_idx = len(source_text) // 2

    if trans_commas:
        # BUG FIX: the original indexed trans_commas with
        # len(src_commas)//2, which can IndexError when the source has
        # more commas than the translation
        trans_split_idx = (trans_commas[len(trans_commas) // 2]
                           if len(trans_commas) % 2 == 1
                           else trans_commas[len(trans_commas) // 2 - 1])
    else:
        trans_split_idx = len(translation) // 2

    src_seg1 = source_text[:src_split_idx]
    src_seg2 = source_text[src_split_idx:]
    trans_seg1 = translation[:trans_split_idx]
    trans_seg2 = translation[trans_split_idx:]

    # split the time span evenly between the two halves
    midpoint = seg.start + (seg.end - seg.start) / 2

    seg1_dict = {'text': src_seg1, 'start': seg.start, 'end': midpoint}
    seg1 = SRT_segment(seg1_dict)
    seg1.translation = trans_seg1

    seg2_dict = {'text': src_seg2, 'start': midpoint, 'end': seg.end}
    seg2 = SRT_segment(seg2_dict)
    seg2.translation = trans_seg2

    result_list = []
    if len(seg1.translation) > threshold:
        result_list += self.split_seg(seg1, threshold)
    else:
        result_list.append(seg1)

    if len(seg2.translation) > threshold:
        result_list += self.split_seg(seg2, threshold)
    else:
        result_list.append(seg2)

    return result_list
218
+
219
+
220
def check_len_and_split(self, threshold=30):
    """Replace every segment whose translation is longer than
    *threshold* characters with the pieces produced by split_seg();
    segments at or under the limit are kept as-is.
    """
    rebuilt = []
    for seg in self.segments:
        if len(seg.translation) > threshold:
            rebuilt.extend(self.split_seg(seg, threshold))
        else:
            rebuilt.append(seg)

    self.segments = rebuilt
233
 
234
  def get_source_only(self):
 
282
  # TODO: variety of translation
283
 
284
  # load term dictionary
285
+ with open("./finetune_data/dict_enzh.csv",'r', encoding='utf-8') as f:
286
+ term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
 
287
 
288
  # change term
289
  for seg in self.segments:
290
+ ready_words = seg.source_text.split(" ")
291
  for i in range(len(ready_words)):
292
  word = ready_words[i]
293
+ [real_word, pos] = self.get_real_word(word)
294
+ if real_word in term_enzh_dict:
295
+ new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
296
+ else:
297
+ new_word = word
298
+ ready_words[i] = new_word
299
+ seg.source_text = " ".join(ready_words)
 
 
 
 
 
300
  pass
301
 
302
def spell_check_term(self):
    """Spell-check every word of every segment's source text against
    en_US plus the domain word list, replacing unrecognized words with
    the first suggestion while keeping trailing punctuation.

    Known bug (inherited): "I've" gets replaced because "i've" is not
    in the dictionary.
    """
    import enchant  # third-party; imported lazily on purpose
    en_checker = enchant.Dict('en_US')
    term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')

    for seg in self.segments:
        words = seg.source_text.split(" ")
        for idx, token in enumerate(words):
            real_word, pos = self.get_real_word(token)
            if en_checker.check(real_word):
                continue
            suggest = term_spellDict.suggest(real_word)
            if suggest:  # relax spell check
                words[idx] = token.replace(token[:pos], suggest[0])
        seg.source_text = " ".join(words)
324
+
325
def spell_correction(self, word: str, arg: int):
    """Correct a single word, either by term translation (arg == 0) or
    by dictionary-backed spell checking (arg == 1).

    Returns the (possibly) corrected word; trailing punctuation is kept.

    Raises:
        ValueError: if *arg* is neither 0 nor 1.
    """
    # BUG FIX: the original `try: arg in [0,1] except ValueError` was a
    # no-op — a bare membership expression never raises, so bad args
    # silently fell through; validate explicitly instead
    if arg not in (0, 1):
        raise ValueError('only 0 or 1 for argument')

    # reuse the shared helper instead of a duplicated local uncover()
    real_word, pos = self.get_real_word(word)

    new_word = word
    if arg == 0:  # term translate mode
        with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
            term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
        if real_word in term_enzh_dict:
            new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
    else:  # arg == 1: term spell check mode
        import enchant  # third-party; imported lazily on purpose
        en_checker = enchant.Dict('en_US')
        term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
        if not en_checker.check(real_word):
            suggestions = term_spellDict.suggest(real_word)
            if suggestions:  # relax spell check
                new_word = word.replace(word[:pos], suggestions[0])
    return new_word
360
+
361
def get_real_word(self, word: str):
    """Strip one trailing punctuation mark (or a trailing ".\n") from
    *word* and lowercase the rest.

    Returns a tuple (stripped lowercase word, index just past the
    stripped part in the original word).
    """
    if word.endswith(".\n"):
        trimmed = -2
    elif word[-1:] in {".", "\n", ",", "!", "?"}:
        trimmed = -1
    else:
        trimmed = 0
    real_word = (word[:trimmed] if trimmed else word).lower()
    return real_word, len(word) + trimmed
finetune_data/{dict.csv → dict_enzh.csv} RENAMED
@@ -1,4 +1,4 @@
1
- barracks,兵营
2
  engineering bay,工程站
3
  forge,锻炉
4
  blink,闪现
 
1
+ barracks,兵营
2
  engineering bay,工程站
3
  forge,锻炉
4
  blink,闪现
finetune_data/dict_freq.csv ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks,1
2
+ engineering bay,1
3
+ forge,1
4
+ blink,1
5
+ evolution chamber,1
6
+ cybernetics core,1
7
+ enhanced shockwaves,1
8
+ gravitic boosters,1
9
+ armory,1
10
+ robotics bay,1
11
+ twilight council,1
12
+ fusion core,1
13
+ fleet beacon,1
14
+ factory,1
15
+ ghost academy,1
16
+ infestation pit,1
17
+ robotics facility,1
18
+ stargate,1
19
+ starport,1
20
+ archon,1
21
+ smart servos,1
22
+ gateway,1
23
+ warpgate,1
24
+ immortal,1
25
+ zealot,1
26
+ nydus network,1
27
+ nydus worm,1
28
+ hydralisk,1
29
+ grooved spines,1
30
+ muscular augments,1
31
+ hydralisk den,1
32
+ planetary fortress,1
33
+ battle cruiser,1
34
+ weapon refit,1
35
+ brood lord,1
36
+ greater spire,1
37
+ anabolic synthesis,1
38
+ cyclone,1
39
+ bunker,1
40
+ lurker,1
41
+ seismic spines,1
42
+ adaptive talons,1
43
+ lurker den,1
44
+ widow mine,1
45
+ ground carapace,1
46
+ high templar,1
47
+ shield battery,1
48
+ observer,1
49
+ baneling,1
50
+ centrifugal hooks,1
51
+ baneling nest,1
52
+ raven,1
53
+ combat shield,1
54
+ shield,1
55
+ lair,1
56
+ missile turret,1
57
+ spore crawler,1
58
+ supply depot,1
59
+ overlord,1
60
+ pneumatized carapace,1
61
+ mutalisk,1
62
+ spire,1
63
+ viper,1
64
+ flyer attacks,1
65
+ flyer carapace,1
66
+ tempest,1
67
+ tectonic destabilizers,1
68
+ phoenix,1
69
+ anion pulse-crystals,1
70
+ corruptor,1
71
+ infestor,1
72
+ pathogen glands,1
73
+ zergling,1
74
+ spawning pool,1
75
+ metabolic boost,1
76
+ spine crawler,1
77
+ marauder,1
78
+ ghost,1
79
+ arm silo with nuke,1
80
+ carrier,1
81
+ hellion,1
82
+ hellbat,1
83
+ ravager,1
84
+ nexus,1
85
+ hatchery,1
86
+ command center,1
87
+ neosteel armor,1
88
+ hi-sec auto tracking,1
89
+ ship weapons,1
90
+ charge,1
91
+ liberator,1
92
+ advanced ballistics,1
93
+ melee attacks,1
94
+ colossus,1
95
+ extended thermal lance,1
96
+ creep tumor,1
97
+ tech lab,1
98
+ air armor,1
99
+ air weapons,1
100
+ adrenal glands,1
101
+ mule,1
102
+ infernal pre-igniter,1
103
+ thor,1
104
+ warp prism,1
105
+ gravitic drive,1
106
+ dragoon,1
107
+ cocoon,1
108
+ larva,1
109
+ mothership,1
110
+ burrow,1
111
+ changeling,1
112
+ ultralisk,1
113
+ chitinous plating,1
114
+ ultralisk cavern,1
115
+ drone,1
116
+ scv,1
117
+ queen,1
118
+ banshee,1
119
+ hyperflight rotors,1
120
+ photon cannon,1
121
+ missile attacks,1
122
+ assimilator,1
123
+ extractor,1
124
+ refinery,1
125
+ roach,1
126
+ marine,1
127
+ sensor tower,1
128
+ infantry armor,1
129
+ infantry weapons,1
130
+ hive,1
131
+ psionic storm,1
132
+ templar archives,1
133
+ sentry,1
134
+ ground armor,1
135
+ ground weapons,1
136
+ adept,1
137
+ resonating glaives,1
138
+ reactor,1
139
+ pylon,1
140
+ reaper,1
141
+ drilling claws,1
142
+ swarm host,1
143
+ mag-field accelerator,1
144
+ siege tank,1
145
+ probe,1
146
+ corvid reactor,1
147
+ neural parasite,1
148
+ viking,1
149
+ oracle,1
150
+ broodling,1
151
+ locust,1
152
+ mothership core,1
153
+ orbital command,1
154
+ stimpack,1
155
+ void ray,1
156
+ flux vanes,1
157
+ overseer,1
158
+ ignite afterburners,1
159
+ dark templar,1
160
+ shadow stride,1
161
+ dark shrine,1
162
+ cloaking field,1
163
+ personal cloaking,1
164
+ medivac dropship,1
165
+ vehicle and ship plating,1
166
+ vehicle weapons,1
167
+ war hound,1
168
+ roach warren,1
169
+ tunneling claws,1
170
+ glial reconstitution,1
171
+ concussive shells,1
172
+ stalker,1
173
+ disruptor,1
174
+ zerg,1
175
+ protoss,1
176
+ terran,1
finetune_data/dict_freq.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks
2
+ engineering bay
3
+ forge
4
+ blink
5
+ evolution chamber
6
+ cybernetics core
7
+ enhanced shockwaves
8
+ gravitic boosters
9
+ armory
10
+ robotics bay
11
+ twilight council
12
+ fusion core
13
+ fleet beacon
14
+ factory
15
+ ghost academy
16
+ infestation pit
17
+ robotics facility
18
+ stargate
19
+ starport
20
+ archon
21
+ smart servos
22
+ gateway
23
+ warpgate
24
+ immortal
25
+ zealot
26
+ nydus network
27
+ nydus worm
28
+ hydralisk
29
+ grooved spines
30
+ muscular augments
31
+ hydralisk den
32
+ planetary fortress
33
+ battle cruiser
34
+ weapon refit
35
+ brood lord
36
+ greater spire
37
+ anabolic synthesis
38
+ cyclone
39
+ bunker
40
+ lurker
41
+ seismic spines
42
+ adaptive talons
43
+ lurker den
44
+ widow mine
45
+ ground carapace
46
+ high templar
47
+ shield battery
48
+ observer
49
+ baneling
50
+ centrifugal hooks
51
+ baneling nest
52
+ raven
53
+ combat shield
54
+ shield
55
+ lair
56
+ missile turret
57
+ spore crawler
58
+ supply depot
59
+ overlord
60
+ pneumatized carapace
61
+ mutalisk
62
+ spire
63
+ viper
64
+ flyer attacks
65
+ flyer carapace
66
+ tempest
67
+ tectonic destabilizers
68
+ phoenix
69
+ anion pulse-crystals
70
+ corruptor
71
+ infestor
72
+ pathogen glands
73
+ zergling
74
+ spawning pool
75
+ metabolic boost
76
+ spine crawler
77
+ marauder
78
+ ghost
79
+ arm silo with nuke
80
+ carrier
81
+ hellion
82
+ hellbat
83
+ ravager
84
+ nexus
85
+ hatchery
86
+ command center
87
+ neosteel armor
88
+ hi-sec auto tracking
89
+ ship weapons
90
+ charge
91
+ liberator
92
+ advanced ballistics
93
+ melee attacks
94
+ colossus
95
+ extended thermal lance
96
+ creep tumor
97
+ tech lab
98
+ air armor
99
+ air weapons
100
+ adrenal glands
101
+ mule
102
+ infernal pre-igniter
103
+ thor
104
+ warp prism
105
+ gravitic drive
106
+ dragoon
107
+ cocoon
108
+ larva
109
+ mothership
110
+ burrow
111
+ changeling
112
+ ultralisk
113
+ chitinous plating
114
+ ultralisk cavern
115
+ drone
116
+ scv
117
+ queen
118
+ banshee
119
+ hyperflight rotors
120
+ photon cannon
121
+ missile attacks
122
+ assimilator
123
+ extractor
124
+ refinery
125
+ roach
126
+ marine
127
+ sensor tower
128
+ infantry armor
129
+ infantry weapons
130
+ hive
131
+ psionic storm
132
+ templar archives
133
+ sentry
134
+ ground armor
135
+ ground weapons
136
+ adept
137
+ resonating glaives
138
+ reactor
139
+ pylon
140
+ reaper
141
+ drilling claws
142
+ swarm host
143
+ mag-field accelerator
144
+ siege tank
145
+ probe
146
+ corvid reactor
147
+ neural parasite
148
+ viking
149
+ oracle
150
+ broodling
151
+ locust
152
+ mothership core
153
+ orbital command
154
+ stimpack
155
+ void ray
156
+ flux vanes
157
+ overseer
158
+ ignite afterburners
159
+ dark templar
160
+ shadow stride
161
+ dark shrine
162
+ cloaking field
163
+ personal cloaking
164
+ medivac dropship
165
+ vehicle and ship plating
166
+ vehicle weapons
167
+ war hound
168
+ roach warren
169
+ tunneling claws
170
+ glial reconstitution
171
+ concussive shells
172
+ stalker
173
+ disruptor
174
+ zerg
175
+ protoss
176
+ terran
177
+ starcraft
pipeline.py CHANGED
@@ -2,7 +2,6 @@ import openai
2
  from pytube import YouTube
3
  import argparse
4
  import os
5
- import whisper
6
  from tqdm import tqdm
7
  from SRT import SRT_script
8
  import stable_whisper
@@ -126,6 +125,7 @@ else:
126
 
127
  # srt preprocess
128
  srt.form_whole_sentence()
 
129
  srt.correct_with_force_term()
130
  srt.write_srt_file_src(srt_file_en)
131
  script_input = srt.get_source_only()
@@ -198,7 +198,9 @@ for s, range in tqdm(zip(script_arr, range_arr)):
198
 
199
  srt.set_translation(translate, range)
200
 
 
201
  srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
 
202
 
203
  if not args.only_srt:
204
  assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")
 
2
  from pytube import YouTube
3
  import argparse
4
  import os
 
5
  from tqdm import tqdm
6
  from SRT import SRT_script
7
  import stable_whisper
 
125
 
126
  # srt preprocess
127
  srt.form_whole_sentence()
128
+ srt.spell_check_term()
129
  srt.correct_with_force_term()
130
  srt.write_srt_file_src(srt_file_en)
131
  script_input = srt.get_source_only()
 
198
 
199
  srt.set_translation(translate, range)
200
 
201
+ srt.check_len_and_split()
202
  srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
203
+ srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
204
 
205
  if not args.only_srt:
206
  assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")