Spaces:
Sleeping
Sleeping
Merge branch 'eason/refactor' into MergeFix
Browse filesFormer-commit-id: 5cf3410f6825636ac35226749d95ec63b4af6072
- SRT.py +168 -36
- finetune_data/{dict.csv → dict_enzh.csv} +1 -1
- finetune_data/dict_freq.csv +176 -0
- finetune_data/dict_freq.txt +177 -0
- pipeline.py +3 -1
SRT.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
from datetime import timedelta
|
2 |
-
import os
|
3 |
-
import whisper
|
4 |
from csv import reader
|
|
|
5 |
import re
|
6 |
import openai
|
7 |
|
@@ -9,27 +8,41 @@ class SRT_segment(object):
|
|
9 |
def __init__(self, *args) -> None:
|
10 |
if isinstance(args[0], dict):
|
11 |
segment = args[0]
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
else:
|
19 |
-
self.start_time_str = str(0)+start_time.split('.')[0]+','+start_time.split('.')[1][:3]
|
20 |
-
if end_ms == 0:
|
21 |
-
self.end_time_str = str(0)+end_time.split('.')[0]+',000'
|
22 |
else:
|
23 |
-
self.end_time_str = str(0)+end_time.split('.')[0]+','+end_time.split('.')[1][:3]
|
24 |
-
self.source_text = segment['text']
|
25 |
self.duration = f"{self.start_time_str} --> {self.end_time_str}"
|
26 |
self.translation = ""
|
27 |
|
28 |
elif isinstance(args[0], list):
|
29 |
-
self.source_text = args[0][2]
|
30 |
self.duration = args[0][1]
|
31 |
self.start_time_str = self.duration.split(" --> ")[0]
|
32 |
self.end_time_str = self.duration.split(" --> ")[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
self.translation = ""
|
34 |
|
35 |
def merge_seg(self, seg):
|
@@ -64,7 +77,7 @@ class SRT_script():
|
|
64 |
for i in range(len(script_lines)):
|
65 |
if i % 4 == 0:
|
66 |
segments.append(list(script_lines[i:i+4]))
|
67 |
-
|
68 |
return cls(segments)
|
69 |
|
70 |
def merge_segs(self, idx_list) -> SRT_segment:
|
@@ -152,12 +165,70 @@ class SRT_script():
|
|
152 |
#print(lines[i])
|
153 |
pass
|
154 |
|
155 |
-
def split_seg(self,
|
156 |
# TODO: evenly split seg to 2 parts and add new seg into self.segments
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
# TODO: if sentence length >= threshold, split this segments to two
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
pass
|
162 |
|
163 |
def get_source_only(self):
|
@@ -211,29 +282,90 @@ class SRT_script():
|
|
211 |
# TODO: variety of translation
|
212 |
|
213 |
# load term dictionary
|
214 |
-
with open("finetune_data/
|
215 |
-
|
216 |
-
term_dict = {rows[0]:rows[1] for rows in csv_reader}
|
217 |
|
218 |
# change term
|
219 |
for seg in self.segments:
|
220 |
-
ready_words =
|
221 |
for i in range(len(ready_words)):
|
222 |
word = ready_words[i]
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
new_word = word.replace(word,term_dict.get(word.lower())) + ' '
|
231 |
-
ready_words[i] = new_word
|
232 |
-
else :
|
233 |
-
ready_words[i]= word + ' '
|
234 |
-
seg.source_text = re.sub('\n ', '\n', "".join(ready_words))
|
235 |
pass
|
236 |
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from datetime import timedelta
|
|
|
|
|
2 |
from csv import reader
|
3 |
+
from datetime import datetime
|
4 |
import re
|
5 |
import openai
|
6 |
|
|
|
8 |
def __init__(self, *args) -> None:
    """Build a segment from a transcription dict or from parsed SRT lines.

    ``args[0]`` is one of:

    * a dict with ``'start'`` and ``'end'`` (seconds, numeric) and ``'text'``;
    * a list whose item 1 is the ``"<start> --> <end>"`` line and whose
      item 2 is the segment text.

    Any other type leaves the instance without attributes (unchanged
    behaviour from the previous implementation).
    """
    def to_srt_timestamp(delta):
        # Format from whole milliseconds instead of slicing str(timedelta):
        # str() drops the fraction when it is zero (which used to raise
        # IndexError after the +500 ms bump produced a whole second) and
        # yields "010:00:00" style output once hours reach two digits.
        total_ms = delta // timedelta(milliseconds=1)
        hours, rest = divmod(total_ms, 3600000)
        minutes, rest = divmod(rest, 60000)
        seconds, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

    if isinstance(args[0], dict):
        segment = args[0]
        self.start = segment['start']
        self.end = segment['end']
        # Fractional part of the second, expressed in milliseconds.
        self.start_ms = int((segment['start']*100)%100*10)
        self.end_ms = int((segment['end']*100)%100*10)

        if self.start_ms == self.end_ms and int(segment['start']) == int(segment['end']): # avoid empty time stamp
            self.end_ms+=500

        self.start_time = timedelta(seconds=int(segment['start']), milliseconds=self.start_ms)
        self.end_time = timedelta(seconds=int(segment['end']), milliseconds=self.end_ms)
        self.start_time_str = to_srt_timestamp(self.start_time)
        self.end_time_str = to_srt_timestamp(self.end_time)
        self.source_text = segment['text']
        self.duration = f"{self.start_time_str} --> {self.end_time_str}"
        self.translation = ""

    elif isinstance(args[0], list):
        self.source_text = args[0][2]
        self.duration = args[0][1]
        self.start_time_str = self.duration.split(" --> ")[0]
        self.end_time_str = self.duration.split(" --> ")[1]

        # parse the time to float
        # NOTE(review): here start_ms/end_ms hold the millisecond field
        # divided by 10, whereas the dict branch stores plain milliseconds.
        # Kept as-is because other code may rely on it -- confirm.
        self.start_ms = int(self.start_time_str.split(',')[1])/10
        self.end_ms = int(self.end_time_str.split(',')[1])/10
        start_list = self.start_time_str.split(',')[0].split(':')
        self.start = int(start_list[0])*3600 + int(start_list[1])*60 + int(start_list[2]) + self.start_ms/100
        end_list = self.end_time_str.split(',')[0].split(':')
        self.end = int(end_list[0])*3600 + int(end_list[1])*60 + int(end_list[2]) + self.end_ms/100
        self.translation = ""
|
47 |
|
48 |
def merge_seg(self, seg):
|
|
|
77 |
for i in range(len(script_lines)):
|
78 |
if i % 4 == 0:
|
79 |
segments.append(list(script_lines[i:i+4]))
|
80 |
+
|
81 |
return cls(segments)
|
82 |
|
83 |
def merge_segs(self, idx_list) -> SRT_segment:
|
|
|
165 |
#print(lines[i])
|
166 |
pass
|
167 |
|
168 |
+
def split_seg(self, seg, threshold):
    """Recursively halve ``seg`` until every translation fits ``threshold``.

    The source text is cut at its middle comma, else its middle space, else
    its midpoint. The translation is cut at its middle comma, else its
    midpoint. The separator character stays at the start of the second
    half. The time span is divided evenly between the two halves.

    Args:
        seg: segment to split; must expose ``source_text``, ``translation``,
            ``start`` and ``end``.
        threshold: maximum allowed ``len(translation)`` of a returned
            segment.

    Returns:
        A list of ``SRT_segment`` objects in chronological order. ``[seg]``
        itself is returned when the translation is too short to divide.
    """
    source_text = seg.source_text
    translation = seg.translation

    # A translation of fewer than two characters cannot be divided; stopping
    # here also guarantees the recursion below terminates for any threshold.
    if len(translation) < 2:
        return [seg]

    def middle_position(text, char):
        # Index of the middle occurrence of ``char`` in ``text`` (the lower
        # middle when the count is even), or None when it does not occur.
        positions = [i for i, ch in enumerate(text) if ch == char]
        if not positions:
            return None
        return positions[(len(positions) - 1) // 2]

    src_split_idx = middle_position(source_text, ',')
    if src_split_idx is None:
        src_split_idx = middle_position(source_text, ' ')
    if src_split_idx is None:
        # No comma and no space (e.g. a single token): cut in the middle
        # instead of indexing into an empty list.
        src_split_idx = len(source_text) // 2

    # The index must come from the translation's own comma list; using the
    # source's comma count here raised IndexError or picked the wrong comma.
    trans_split_idx = middle_position(translation, ',')
    if trans_split_idx is None or trans_split_idx == 0:
        # A cut at index 0 would leave the second half identical to the
        # input and recurse forever, so fall back to the midpoint.
        trans_split_idx = len(translation) // 2

    mid_time = seg.start + (seg.end - seg.start)/2

    seg1 = SRT_segment({
        'text': source_text[:src_split_idx],
        'start': seg.start,
        'end': mid_time,
    })
    seg1.translation = translation[:trans_split_idx]

    seg2 = SRT_segment({
        'text': source_text[src_split_idx:],
        'start': mid_time,
        'end': seg.end,
    })
    seg2.translation = translation[trans_split_idx:]

    result_list = []
    for half in (seg1, seg2):
        if len(half.translation) > threshold:
            result_list += self.split_seg(half, threshold)
        else:
            result_list.append(half)

    return result_list
|
218 |
+
|
219 |
+
|
220 |
+
def check_len_and_split(self, threshold=30):
    """Replace every over-long segment with the pieces from ``split_seg``.

    A segment is over-long when ``len(seg.translation) > threshold``.
    ``self.segments`` is rebuilt in the original order; nothing is returned.
    """
    rebuilt = []
    for current in self.segments:
        if len(current.translation) <= threshold:
            rebuilt.append(current)
            continue
        rebuilt += self.split_seg(current, threshold)

    self.segments = rebuilt
|
233 |
|
234 |
def get_source_only(self):
|
|
|
282 |
# TODO: variety of translation
|
283 |
|
284 |
# load term dictionary
|
285 |
+
with open("./finetune_data/dict_enzh.csv",'r', encoding='utf-8') as f:
|
286 |
+
term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
|
|
|
287 |
|
288 |
# change term
|
289 |
for seg in self.segments:
|
290 |
+
ready_words = seg.source_text.split(" ")
|
291 |
for i in range(len(ready_words)):
|
292 |
word = ready_words[i]
|
293 |
+
[real_word, pos] = self.get_real_word(word)
|
294 |
+
if real_word in term_enzh_dict:
|
295 |
+
new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
|
296 |
+
else:
|
297 |
+
new_word = word
|
298 |
+
ready_words[i] = new_word
|
299 |
+
seg.source_text = " ".join(ready_words)
|
|
|
|
|
|
|
|
|
|
|
300 |
pass
|
301 |
|
302 |
+
def spell_check_term(self):
    """Replace misspelled words in every segment with the closest term.

    Each space-separated token of ``seg.source_text`` is stripped of its
    trailing punctuation via ``get_real_word`` and checked against the
    en_US dictionary. A token that fails the check and has a suggestion in
    the term list ``./finetune_data/dict_freq.txt`` has its word part
    replaced by the first suggestion; trailing punctuation is kept.
    """
    if not self.segments:
        # Nothing to check; skip loading the dictionaries.
        return

    import enchant
    # Named ``checker`` rather than ``dict`` so the builtin is not shadowed.
    checker = enchant.Dict('en_US')
    term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')

    for seg in self.segments:
        ready_words = seg.source_text.split(" ")
        for i, word in enumerate(ready_words):
            real_word, pos = self.get_real_word(word)
            if not real_word:
                # Empty token (double space, lone punctuation): enchant
                # raises ValueError when asked to check an empty string.
                continue
            # Accept the original-case spelling as well: contractions and
            # proper nouns such as "I've" fail the check once lower-cased.
            if checker.check(real_word) or checker.check(word[:pos]):
                continue
            suggest = term_spellDict.suggest(real_word)
            if suggest: # relax spell check
                # Slice instead of str.replace so only the leading word
                # part is substituted and the punctuation is untouched.
                ready_words[i] = suggest[0] + word[pos:]
        seg.source_text = " ".join(ready_words)
|
324 |
+
|
325 |
+
def spell_correction(self, word:str, arg:int):
    """Translate or spell-correct a single token.

    Args:
        word: one token, possibly ending in ``.``, ``,``, ``!``, ``?``,
            a newline, or ``".\\n"``.
        arg: ``0`` for term-translation mode (lookup in
            ``finetune_data/dict_enzh.csv``), ``1`` for term spell-check
            mode (en_US dictionary plus ``./finetune_data/dict_freq.txt``).

    Returns:
        The token with its word part replaced when a replacement applies,
        otherwise ``word`` unchanged. Trailing punctuation is preserved.
        For any other ``arg`` a message is printed and ``word`` is returned.
    """
    # The previous ``try: arg in [0,1]`` could never raise, so invalid
    # modes were never reported; check explicitly instead.
    if arg not in (0, 1):
        print('only 0 or 1 for argument')
        return word

    def uncover(word:str):
        # Split off trailing punctuation: returns the lower-cased word part
        # and the index in ``word`` where the punctuation starts.
        if word[-2:] == ".\n":
            real_word = word[:-2].lower()
            n = -2
        elif word[-1:] in [".", "\n", ",", "!", "?"]:
            real_word = word[:-1].lower()
            n = -1
        else:
            real_word = word.lower()
            n = 0
        return real_word, len(word)+n

    real_word, pos = uncover(word)
    new_word = word
    if arg == 0: # term translate mode
        with open("finetune_data/dict_enzh.csv",'r', encoding='utf-8') as f:
            # Rows with fewer than two columns (e.g. blank lines) are skipped.
            term_enzh_dict = {rows[0]:rows[1] for rows in reader(f) if len(rows) >= 2}
        if real_word in term_enzh_dict:
            # Slice rather than str.replace so only the leading word part
            # is substituted.
            new_word = term_enzh_dict[real_word] + word[pos:]
    else: # term spell check mode
        import enchant
        # Named ``checker`` rather than ``dict`` so the builtin is not shadowed.
        checker = enchant.Dict('en_US')
        term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
        # enchant raises ValueError on an empty string, so skip empty words.
        # The original-case spelling is accepted too (e.g. "I've").
        if real_word and not (checker.check(real_word) or checker.check(word[:pos])):
            suggestions = term_spellDict.suggest(real_word)
            if suggestions: # relax spell check
                new_word = suggestions[0] + word[pos:]
    return new_word
|
360 |
+
|
361 |
+
def get_real_word(self, word:str):
    """Separate a token from its trailing punctuation.

    A trailing ``".\\n"`` counts as two punctuation characters; otherwise a
    single trailing ``.``, newline, ``,``, ``!`` or ``?`` counts as one.

    Returns:
        ``(real_word, pos)``: the lower-cased word part and the index in
        ``word`` at which the stripped punctuation begins.
    """
    if word.endswith(".\n"):
        trailing = 2
    elif word[-1:] in (".", "\n", ",", "!", "?"):
        trailing = 1
    else:
        trailing = 0
    cut = len(word) - trailing
    return word[:cut].lower(), cut
|
finetune_data/{dict.csv → dict_enzh.csv}
RENAMED
@@ -1,4 +1,4 @@
|
|
1 |
-
barracks,兵营
|
2 |
engineering bay,工程站
|
3 |
forge,锻炉
|
4 |
blink,闪现
|
|
|
1 |
+
barracks,兵营
|
2 |
engineering bay,工程站
|
3 |
forge,锻炉
|
4 |
blink,闪现
|
finetune_data/dict_freq.csv
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
barracks,1
|
2 |
+
engineering bay,1
|
3 |
+
forge,1
|
4 |
+
blink,1
|
5 |
+
evolution chamber,1
|
6 |
+
cybernetics core,1
|
7 |
+
enhanced shockwaves,1
|
8 |
+
gravitic boosters,1
|
9 |
+
armory,1
|
10 |
+
robotics bay,1
|
11 |
+
twilight council,1
|
12 |
+
fusion core,1
|
13 |
+
fleet beacon,1
|
14 |
+
factory,1
|
15 |
+
ghost academy,1
|
16 |
+
infestation pit,1
|
17 |
+
robotics facility,1
|
18 |
+
stargate,1
|
19 |
+
starport,1
|
20 |
+
archon,1
|
21 |
+
smart servos,1
|
22 |
+
gateway,1
|
23 |
+
warpgate,1
|
24 |
+
immortal,1
|
25 |
+
zealot,1
|
26 |
+
nydus network,1
|
27 |
+
nydus worm,1
|
28 |
+
hydralisk,1
|
29 |
+
grooved spines,1
|
30 |
+
muscular augments,1
|
31 |
+
hydralisk den,1
|
32 |
+
planetary fortress,1
|
33 |
+
battle cruiser,1
|
34 |
+
weapon refit,1
|
35 |
+
brood lord,1
|
36 |
+
greater spire,1
|
37 |
+
anabolic synthesis,1
|
38 |
+
cyclone,1
|
39 |
+
bunker,1
|
40 |
+
lurker,1
|
41 |
+
seismic spines,1
|
42 |
+
adaptive talons,1
|
43 |
+
lurker den,1
|
44 |
+
widow mine,1
|
45 |
+
ground carapace,1
|
46 |
+
high templar,1
|
47 |
+
shield battery,1
|
48 |
+
observer,1
|
49 |
+
baneling,1
|
50 |
+
centrifugal hooks,1
|
51 |
+
baneling nest,1
|
52 |
+
raven,1
|
53 |
+
combat shield,1
|
54 |
+
shield,1
|
55 |
+
lair,1
|
56 |
+
missile turret,1
|
57 |
+
spore crawler,1
|
58 |
+
supply depot,1
|
59 |
+
overlord,1
|
60 |
+
pneumatized carapace,1
|
61 |
+
mutalisk,1
|
62 |
+
spire,1
|
63 |
+
viper,1
|
64 |
+
flyer attacks,1
|
65 |
+
flyer carapace,1
|
66 |
+
tempest,1
|
67 |
+
tectonic destabilizers,1
|
68 |
+
phoenix,1
|
69 |
+
anion pulse-crystals,1
|
70 |
+
corruptor,1
|
71 |
+
infestor,1
|
72 |
+
pathogen glands,1
|
73 |
+
zergling,1
|
74 |
+
spawning pool,1
|
75 |
+
metabolic boost,1
|
76 |
+
spine crawler,1
|
77 |
+
marauder,1
|
78 |
+
ghost,1
|
79 |
+
arm silo with nuke,1
|
80 |
+
carrier,1
|
81 |
+
hellion,1
|
82 |
+
hellbat,1
|
83 |
+
ravager,1
|
84 |
+
nexus,1
|
85 |
+
hatchery,1
|
86 |
+
command center,1
|
87 |
+
neosteel armor,1
|
88 |
+
hi-sec auto tracking,1
|
89 |
+
ship weapons,1
|
90 |
+
charge,1
|
91 |
+
liberator,1
|
92 |
+
advanced ballistics,1
|
93 |
+
melee attacks,1
|
94 |
+
colossus,1
|
95 |
+
extended thermal lance,1
|
96 |
+
creep tumor,1
|
97 |
+
tech lab,1
|
98 |
+
air armor,1
|
99 |
+
air weapons,1
|
100 |
+
adrenal glands,1
|
101 |
+
mule,1
|
102 |
+
infernal pre-igniter,1
|
103 |
+
thor,1
|
104 |
+
warp prism,1
|
105 |
+
gravitic drive,1
|
106 |
+
dragoon,1
|
107 |
+
cocoon,1
|
108 |
+
larva,1
|
109 |
+
mothership,1
|
110 |
+
burrow,1
|
111 |
+
changeling,1
|
112 |
+
ultralisk,1
|
113 |
+
chitinous plating,1
|
114 |
+
ultralisk cavern,1
|
115 |
+
drone,1
|
116 |
+
scv,1
|
117 |
+
queen,1
|
118 |
+
banshee,1
|
119 |
+
hyperflight rotors,1
|
120 |
+
photon cannon,1
|
121 |
+
missile attacks,1
|
122 |
+
assimilator,1
|
123 |
+
extractor,1
|
124 |
+
refinery,1
|
125 |
+
roach,1
|
126 |
+
marine,1
|
127 |
+
sensor tower,1
|
128 |
+
infantry armor,1
|
129 |
+
infantry weapons,1
|
130 |
+
hive,1
|
131 |
+
psionic storm,1
|
132 |
+
templar archives,1
|
133 |
+
sentry,1
|
134 |
+
ground armor,1
|
135 |
+
ground weapons,1
|
136 |
+
adept,1
|
137 |
+
resonating glaives,1
|
138 |
+
reactor,1
|
139 |
+
pylon,1
|
140 |
+
reaper,1
|
141 |
+
drilling claws,1
|
142 |
+
swarm host,1
|
143 |
+
mag-field accelerator,1
|
144 |
+
siege tank,1
|
145 |
+
probe,1
|
146 |
+
corvid reactor,1
|
147 |
+
neural parasite,1
|
148 |
+
viking,1
|
149 |
+
oracle,1
|
150 |
+
broodling,1
|
151 |
+
locust,1
|
152 |
+
mothership core,1
|
153 |
+
orbital command,1
|
154 |
+
stimpack,1
|
155 |
+
void ray,1
|
156 |
+
flux vanes,1
|
157 |
+
overseer,1
|
158 |
+
ignite afterburners,1
|
159 |
+
dark templar,1
|
160 |
+
shadow stride,1
|
161 |
+
dark shrine,1
|
162 |
+
cloaking field,1
|
163 |
+
personal cloaking,1
|
164 |
+
medivac dropship,1
|
165 |
+
vehicle and ship plating,1
|
166 |
+
vehicle weapons,1
|
167 |
+
war hound,1
|
168 |
+
roach warren,1
|
169 |
+
tunneling claws,1
|
170 |
+
glial reconstitution,1
|
171 |
+
concussive shells,1
|
172 |
+
stalker,1
|
173 |
+
disruptor,1
|
174 |
+
zerg,1
|
175 |
+
protoss,1
|
176 |
+
terran,1
|
finetune_data/dict_freq.txt
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
barracks
|
2 |
+
engineering bay
|
3 |
+
forge
|
4 |
+
blink
|
5 |
+
evolution chamber
|
6 |
+
cybernetics core
|
7 |
+
enhanced shockwaves
|
8 |
+
gravitic boosters
|
9 |
+
armory
|
10 |
+
robotics bay
|
11 |
+
twilight council
|
12 |
+
fusion core
|
13 |
+
fleet beacon
|
14 |
+
factory
|
15 |
+
ghost academy
|
16 |
+
infestation pit
|
17 |
+
robotics facility
|
18 |
+
stargate
|
19 |
+
starport
|
20 |
+
archon
|
21 |
+
smart servos
|
22 |
+
gateway
|
23 |
+
warpgate
|
24 |
+
immortal
|
25 |
+
zealot
|
26 |
+
nydus network
|
27 |
+
nydus worm
|
28 |
+
hydralisk
|
29 |
+
grooved spines
|
30 |
+
muscular augments
|
31 |
+
hydralisk den
|
32 |
+
planetary fortress
|
33 |
+
battle cruiser
|
34 |
+
weapon refit
|
35 |
+
brood lord
|
36 |
+
greater spire
|
37 |
+
anabolic synthesis
|
38 |
+
cyclone
|
39 |
+
bunker
|
40 |
+
lurker
|
41 |
+
seismic spines
|
42 |
+
adaptive talons
|
43 |
+
lurker den
|
44 |
+
widow mine
|
45 |
+
ground carapace
|
46 |
+
high templar
|
47 |
+
shield battery
|
48 |
+
observer
|
49 |
+
baneling
|
50 |
+
centrifugal hooks
|
51 |
+
baneling nest
|
52 |
+
raven
|
53 |
+
combat shield
|
54 |
+
shield
|
55 |
+
lair
|
56 |
+
missile turret
|
57 |
+
spore crawler
|
58 |
+
supply depot
|
59 |
+
overlord
|
60 |
+
pneumatized carapace
|
61 |
+
mutalisk
|
62 |
+
spire
|
63 |
+
viper
|
64 |
+
flyer attacks
|
65 |
+
flyer carapace
|
66 |
+
tempest
|
67 |
+
tectonic destabilizers
|
68 |
+
phoenix
|
69 |
+
anion pulse-crystals
|
70 |
+
corruptor
|
71 |
+
infestor
|
72 |
+
pathogen glands
|
73 |
+
zergling
|
74 |
+
spawning pool
|
75 |
+
metabolic boost
|
76 |
+
spine crawler
|
77 |
+
marauder
|
78 |
+
ghost
|
79 |
+
arm silo with nuke
|
80 |
+
carrier
|
81 |
+
hellion
|
82 |
+
hellbat
|
83 |
+
ravager
|
84 |
+
nexus
|
85 |
+
hatchery
|
86 |
+
command center
|
87 |
+
neosteel armor
|
88 |
+
hi-sec auto tracking
|
89 |
+
ship weapons
|
90 |
+
charge
|
91 |
+
liberator
|
92 |
+
advanced ballistics
|
93 |
+
melee attacks
|
94 |
+
colossus
|
95 |
+
extended thermal lance
|
96 |
+
creep tumor
|
97 |
+
tech lab
|
98 |
+
air armor
|
99 |
+
air weapons
|
100 |
+
adrenal glands
|
101 |
+
mule
|
102 |
+
infernal pre-igniter
|
103 |
+
thor
|
104 |
+
warp prism
|
105 |
+
gravitic drive
|
106 |
+
dragoon
|
107 |
+
cocoon
|
108 |
+
larva
|
109 |
+
mothership
|
110 |
+
burrow
|
111 |
+
changeling
|
112 |
+
ultralisk
|
113 |
+
chitinous plating
|
114 |
+
ultralisk cavern
|
115 |
+
drone
|
116 |
+
scv
|
117 |
+
queen
|
118 |
+
banshee
|
119 |
+
hyperflight rotors
|
120 |
+
photon cannon
|
121 |
+
missile attacks
|
122 |
+
assimilator
|
123 |
+
extractor
|
124 |
+
refinery
|
125 |
+
roach
|
126 |
+
marine
|
127 |
+
sensor tower
|
128 |
+
infantry armor
|
129 |
+
infantry weapons
|
130 |
+
hive
|
131 |
+
psionic storm
|
132 |
+
templar archives
|
133 |
+
sentry
|
134 |
+
ground armor
|
135 |
+
ground weapons
|
136 |
+
adept
|
137 |
+
resonating glaives
|
138 |
+
reactor
|
139 |
+
pylon
|
140 |
+
reaper
|
141 |
+
drilling claws
|
142 |
+
swarm host
|
143 |
+
mag-field accelerator
|
144 |
+
siege tank
|
145 |
+
probe
|
146 |
+
corvid reactor
|
147 |
+
neural parasite
|
148 |
+
viking
|
149 |
+
oracle
|
150 |
+
broodling
|
151 |
+
locust
|
152 |
+
mothership core
|
153 |
+
orbital command
|
154 |
+
stimpack
|
155 |
+
void ray
|
156 |
+
flux vanes
|
157 |
+
overseer
|
158 |
+
ignite afterburners
|
159 |
+
dark templar
|
160 |
+
shadow stride
|
161 |
+
dark shrine
|
162 |
+
cloaking field
|
163 |
+
personal cloaking
|
164 |
+
medivac dropship
|
165 |
+
vehicle and ship plating
|
166 |
+
vehicle weapons
|
167 |
+
war hound
|
168 |
+
roach warren
|
169 |
+
tunneling claws
|
170 |
+
glial reconstitution
|
171 |
+
concussive shells
|
172 |
+
stalker
|
173 |
+
disruptor
|
174 |
+
zerg
|
175 |
+
protoss
|
176 |
+
terran
|
177 |
+
starcraft
|
pipeline.py
CHANGED
@@ -2,7 +2,6 @@ import openai
|
|
2 |
from pytube import YouTube
|
3 |
import argparse
|
4 |
import os
|
5 |
-
import whisper
|
6 |
from tqdm import tqdm
|
7 |
from SRT import SRT_script
|
8 |
import stable_whisper
|
@@ -126,6 +125,7 @@ else:
|
|
126 |
|
127 |
# srt preprocess
|
128 |
srt.form_whole_sentence()
|
|
|
129 |
srt.correct_with_force_term()
|
130 |
srt.write_srt_file_src(srt_file_en)
|
131 |
script_input = srt.get_source_only()
|
@@ -198,7 +198,9 @@ for s, range in tqdm(zip(script_arr, range_arr)):
|
|
198 |
|
199 |
srt.set_translation(translate, range)
|
200 |
|
|
|
201 |
srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
|
|
|
202 |
|
203 |
if not args.only_srt:
|
204 |
assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")
|
|
|
2 |
from pytube import YouTube
|
3 |
import argparse
|
4 |
import os
|
|
|
5 |
from tqdm import tqdm
|
6 |
from SRT import SRT_script
|
7 |
import stable_whisper
|
|
|
125 |
|
126 |
# srt preprocess
|
127 |
srt.form_whole_sentence()
|
128 |
+
srt.spell_check_term()
|
129 |
srt.correct_with_force_term()
|
130 |
srt.write_srt_file_src(srt_file_en)
|
131 |
script_input = srt.get_source_only()
|
|
|
198 |
|
199 |
srt.set_translation(translate, range)
|
200 |
|
201 |
+
srt.check_len_and_split()
|
202 |
srt.write_srt_file_translate(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt")
|
203 |
+
srt.write_srt_file_bilingual(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt")
|
204 |
|
205 |
if not args.only_srt:
|
206 |
assSub_zh = srt2ass(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt", "default", "No", "Modest")
|