Spaces:
Sleeping
Sleeping
Merge branch 'eason/main' into Macrodove/suggestionFunc
Browse files- SRT.py +4 -4
- dict_util.py +8 -10
- finetune_data/dict_enzh.csv +15 -1
- finetune_data/dict_freq.txt +6 -6
- finetune_data/test.csv +193 -0
- pipeline.py +9 -3
SRT.py
CHANGED
@@ -402,7 +402,7 @@ class SRT_script():
|
|
402 |
for word in keywords:
|
403 |
for i, seg in enumerate(self.segments):
|
404 |
if word in seg.source_text.lower():
|
405 |
-
seg.source_text =
|
406 |
logging.info("replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(i+1))
|
407 |
logging.info("source text becomes: " + seg.source_text)
|
408 |
|
@@ -436,13 +436,13 @@ class SRT_script():
|
|
436 |
for i in range(len(ready_words)):
|
437 |
word = ready_words[i]
|
438 |
[real_word, pos] = self.get_real_word(word)
|
439 |
-
if not dict.check(word[:pos]):
|
440 |
new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
|
441 |
|
442 |
-
logging.info(
|
443 |
|
444 |
#suggest = term_spellDict.suggest(real_word)
|
445 |
-
#if suggest and enchant.utils.levenshtein(
|
446 |
|
447 |
# with open("dislog.log","a") as log:
|
448 |
# if not os.path.exists("dislog.log"):
|
|
|
402 |
for word in keywords:
|
403 |
for i, seg in enumerate(self.segments):
|
404 |
if word in seg.source_text.lower():
|
405 |
+
seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(term_enzh_dict.get(word)), seg.source_text, flags=re.IGNORECASE)
|
406 |
logging.info("replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(i+1))
|
407 |
logging.info("source text becomes: " + seg.source_text)
|
408 |
|
|
|
436 |
for i in range(len(ready_words)):
|
437 |
word = ready_words[i]
|
438 |
[real_word, pos] = self.get_real_word(word)
|
439 |
+
if not dict.check(word[:pos]) and not term_spellDict.check(real_word):
|
440 |
new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
|
441 |
|
442 |
+
logging.info(real_word + "\t" + self.fetchfunc(word[:pos],0.5) + "\t" + str(enchant.utils.levenshtein(real_word, self.fetchfunc(word[:pos],0.5)))+'\n')
|
443 |
|
444 |
#suggest = term_spellDict.suggest(real_word)
|
445 |
+
#if suggest and enchant.utils.levenshtein(real_word, suggest[0]) < (len(real_word)+len(suggest[0]))/4: # relax spell check
|
446 |
|
447 |
# with open("dislog.log","a") as log:
|
448 |
# if not os.path.exists("dislog.log"):
|
dict_util.py
CHANGED
@@ -7,11 +7,12 @@ import pickle
|
|
7 |
# 1_2_3, 1 is action, 2 is supply object, 3 is accept object
|
8 |
def update_dict_csv(term_dict, f):
|
9 |
for rows in csv.reader(f):
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
13 |
else:
|
14 |
-
term_dict[
|
15 |
pass
|
16 |
|
17 |
def export_dict_csv(term_dict, f):
|
@@ -21,17 +22,14 @@ def export_dict_csv(term_dict, f):
|
|
21 |
|
22 |
def save_dict_pickle(term_dict, f):
|
23 |
pickle.dump(term_dict, f, pickle.HIGHEST_PROTOCOL)
|
|
|
24 |
|
25 |
def update_csv_pickle(pickle_f, csv_f):
|
26 |
term_dict = pickle.load(pickle_f)
|
27 |
-
|
28 |
-
if rows[0] in term_dict:
|
29 |
-
if rows[1] not in term_dict[rows[0]]:
|
30 |
-
term_dict[rows[0]] = term_dict[rows[0]]+[rows[1]]
|
31 |
-
else:
|
32 |
-
term_dict[rows[0]]=[rows[1]]
|
33 |
#save to pickle file, highest protocal to get better performance
|
34 |
pickle.dump(term_dict, pickle_f, pickle.HIGHEST_PROTOCOL)
|
|
|
35 |
|
36 |
|
37 |
#demo
|
|
|
7 |
# 1_2_3, 1 is action, 2 is supply object, 3 is accept object
|
8 |
def update_dict_csv(term_dict, f):
|
9 |
for rows in csv.reader(f):
|
10 |
+
word = rows[0].lower()
|
11 |
+
if word in term_dict:
|
12 |
+
if rows[1] not in term_dict[word]:
|
13 |
+
term_dict[word] = term_dict[word]+[rows[1]]
|
14 |
else:
|
15 |
+
term_dict[word]=[rows[1]]
|
16 |
pass
|
17 |
|
18 |
def export_dict_csv(term_dict, f):
|
|
|
22 |
|
23 |
def save_dict_pickle(term_dict, f):
|
24 |
pickle.dump(term_dict, f, pickle.HIGHEST_PROTOCOL)
|
25 |
+
pass
|
26 |
|
27 |
def update_csv_pickle(pickle_f, csv_f):
|
28 |
term_dict = pickle.load(pickle_f)
|
29 |
+
update_dict_csv(term_dict, csv_f)
|
|
|
|
|
|
|
|
|
|
|
30 |
#save to pickle file, highest protocal to get better performance
|
31 |
pickle.dump(term_dict, pickle_f, pickle.HIGHEST_PROTOCOL)
|
32 |
+
pass
|
33 |
|
34 |
|
35 |
#demo
|
finetune_data/dict_enzh.csv
CHANGED
@@ -179,4 +179,18 @@ stalker,追猎
|
|
179 |
disruptor,自爆球
|
180 |
zerg,虫族
|
181 |
protross,神族
|
182 |
-
terran,人族
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
disruptor,自爆球
|
180 |
zerg,虫族
|
181 |
protross,神族
|
182 |
+
terran,人族
|
183 |
+
gas,气矿
|
184 |
+
cannon rush,野炮台
|
185 |
+
supply block,卡人口
|
186 |
+
macro,运营
|
187 |
+
natural expansion,开矿
|
188 |
+
roach warren,蟑螂虫巢
|
189 |
+
proxy,野
|
190 |
+
cyber core,控制芯核
|
191 |
+
prismatic alignment,充能射线
|
192 |
+
flooding,余钱
|
193 |
+
chrono boost,星空加速
|
194 |
+
cheese,狗
|
195 |
+
supply,人口
|
196 |
+
war prism,棱镜
|
finetune_data/dict_freq.txt
CHANGED
@@ -23,8 +23,8 @@ gateway
|
|
23 |
warpgate
|
24 |
immortal
|
25 |
zealot
|
26 |
-
nydus
|
27 |
-
|
28 |
hydralisk
|
29 |
grooved spines
|
30 |
muscular augments
|
@@ -173,12 +173,12 @@ concussive shells
|
|
173 |
stalker
|
174 |
disruptor
|
175 |
zerg
|
176 |
-
|
177 |
terran
|
178 |
starcraft
|
179 |
TvT
|
180 |
Maxpax
|
181 |
-
|
182 |
PvP
|
183 |
ZvZ
|
184 |
TvZ
|
@@ -187,7 +187,7 @@ ZvP
|
|
187 |
PvZ
|
188 |
PvT
|
189 |
ZvT
|
190 |
-
Florencio
|
191 |
cybercore
|
192 |
nest
|
193 |
-
follow-up
|
|
|
|
23 |
warpgate
|
24 |
immortal
|
25 |
zealot
|
26 |
+
nydus
|
27 |
+
worm
|
28 |
hydralisk
|
29 |
grooved spines
|
30 |
muscular augments
|
|
|
173 |
stalker
|
174 |
disruptor
|
175 |
zerg
|
176 |
+
protoss
|
177 |
terran
|
178 |
starcraft
|
179 |
TvT
|
180 |
Maxpax
|
181 |
+
ShowTime
|
182 |
PvP
|
183 |
ZvZ
|
184 |
TvZ
|
|
|
187 |
PvZ
|
188 |
PvT
|
189 |
ZvT
|
|
|
190 |
cybercore
|
191 |
nest
|
192 |
+
follow-up
|
193 |
+
robo
|
finetune_data/test.csv
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
barracks,['兵营']
|
2 |
+
zerg,['虫族']
|
3 |
+
protoss,['神族']
|
4 |
+
terran,['人族']
|
5 |
+
engineering bay,['工程站']
|
6 |
+
forge,['锻炉']
|
7 |
+
blink,['闪现']
|
8 |
+
evolution chamber,['进化腔']
|
9 |
+
cybernetics core,['控制芯核']
|
10 |
+
cybercore,['控制芯核']
|
11 |
+
enhanced shockwaves,['EMP范围']
|
12 |
+
gravitic boosters,['ob速度']
|
13 |
+
armory,['军械库']
|
14 |
+
robotics bay,['机械研究所']
|
15 |
+
twilight council,['光影议会']
|
16 |
+
fusion core,['聚变芯体']
|
17 |
+
fleet beacon,['舰队航标']
|
18 |
+
factory,['重工厂']
|
19 |
+
ghost academy,['幽灵军校']
|
20 |
+
infestation pit,['感染深渊']
|
21 |
+
robotics facility,['机械台']
|
22 |
+
stargate,['星门']
|
23 |
+
starport,['星港']
|
24 |
+
archon,['白球']
|
25 |
+
smart servos,['变形加速']
|
26 |
+
gateway,['兵营']
|
27 |
+
warpgate,['兵营']
|
28 |
+
immortal,['不朽']
|
29 |
+
zealot,['叉叉']
|
30 |
+
nydus network,['虫洞']
|
31 |
+
nydus worm,['虫洞']
|
32 |
+
hydralisk,['刺蛇']
|
33 |
+
grooved spines,['刺蛇射程']
|
34 |
+
muscular augments,['刺蛇速度']
|
35 |
+
hydralisk den,['刺蛇塔']
|
36 |
+
planetary fortress,['大地堡']
|
37 |
+
battle cruiser,['大和']
|
38 |
+
weapon refit,['大和炮']
|
39 |
+
brood lord,['大龙']
|
40 |
+
broodling,"['巢虫', '小虫子']"
|
41 |
+
greater spire,['大龙塔']
|
42 |
+
anabolic synthesis,['大牛速度']
|
43 |
+
cyclone,['导弹车']
|
44 |
+
bunker,['地堡']
|
45 |
+
lurker,['地刺']
|
46 |
+
seismic spines,['地刺射程']
|
47 |
+
adaptive talons,['地刺速埋']
|
48 |
+
lurker den,['地刺塔']
|
49 |
+
widow mine,['地雷']
|
50 |
+
ground carapace,['地面单位甲壳等级']
|
51 |
+
high templar,['电兵']
|
52 |
+
shield battery,['电池']
|
53 |
+
observer,"['叮当', 'OB']"
|
54 |
+
baneling,['毒爆']
|
55 |
+
centrifugal hooks,['毒爆速度']
|
56 |
+
baneling nest,['毒爆塔']
|
57 |
+
raven,['渡鸦']
|
58 |
+
combat shield,['盾']
|
59 |
+
shield,['盾']
|
60 |
+
lair,['二本']
|
61 |
+
missile turret,['防空']
|
62 |
+
spore crawler,['防空']
|
63 |
+
supply depot,['房子']
|
64 |
+
overlord,['房子']
|
65 |
+
pneumatized carapace,['房子速度']
|
66 |
+
mutalisk,['飞龙']
|
67 |
+
spire,['飞龙塔']
|
68 |
+
viper,['飞蛇']
|
69 |
+
flyer attacks,['飞行生物攻击等级']
|
70 |
+
flyer carapace,['飞行生物甲壳等级']
|
71 |
+
tempest,['风暴']
|
72 |
+
tectonic destabilizers,['风暴伤害']
|
73 |
+
phoenix,['凤凰']
|
74 |
+
anion pulse-crystals,['凤凰射程']
|
75 |
+
corruptor,['腐化']
|
76 |
+
infestor,['感染虫']
|
77 |
+
pathogen glands,['感染能量']
|
78 |
+
zergling,['狗']
|
79 |
+
spawning pool,['狗池']
|
80 |
+
metabolic boost,['狗速']
|
81 |
+
spine crawler,['管子']
|
82 |
+
marauder,['光头']
|
83 |
+
ghost,['鬼兵']
|
84 |
+
arm silo with nuke,['核弹']
|
85 |
+
carrier,['航母']
|
86 |
+
hellion,['火车']
|
87 |
+
hellbat,['火车侠']
|
88 |
+
ravager,['火蟑螂']
|
89 |
+
nexus,['基地']
|
90 |
+
hatchery,['基地']
|
91 |
+
command center,['基地']
|
92 |
+
neosteel armor,['建筑护甲']
|
93 |
+
hi-sec auto tracking,['建筑射程']
|
94 |
+
ship weapons,['舰船武器等级']
|
95 |
+
charge,['脚速']
|
96 |
+
liberator,['解放']
|
97 |
+
advanced ballistics,['解放射程']
|
98 |
+
melee attacks,['近战攻击等级']
|
99 |
+
colossus,['巨像']
|
100 |
+
extended thermal lance,['巨像射程']
|
101 |
+
creep tumor,['菌毯']
|
102 |
+
tech lab,['科技挂件']
|
103 |
+
air armor,['空中单位护甲等级']
|
104 |
+
air weapons,['空中单位武器等级']
|
105 |
+
adrenal glands,['狂狗']
|
106 |
+
mule,['矿螺']
|
107 |
+
infernal pre-igniter,['蓝火']
|
108 |
+
thor,['雷神']
|
109 |
+
warp prism,['棱镜']
|
110 |
+
gravitic drive,['棱镜速度']
|
111 |
+
dragoon,['龙骑士']
|
112 |
+
cocoon,['卵']
|
113 |
+
larva,['卵']
|
114 |
+
mothership,['妈妈船']
|
115 |
+
burrow,['埋地']
|
116 |
+
changeling,"['拟态虫', '小虫子']"
|
117 |
+
ultralisk,['牛']
|
118 |
+
chitinous plating,['牛甲']
|
119 |
+
ultralisk cavern,['牛塔']
|
120 |
+
drone,['农民']
|
121 |
+
scv,['农民']
|
122 |
+
queen,['女王']
|
123 |
+
banshee,['女妖']
|
124 |
+
hyperflight rotors,['女妖提速']
|
125 |
+
photon cannon,['炮台']
|
126 |
+
cannon,[' 炮台']
|
127 |
+
missile attacks,['喷射攻击等级']
|
128 |
+
assimilator,['气矿']
|
129 |
+
extractor,['气矿']
|
130 |
+
refinery,['气矿']
|
131 |
+
roach,['钱赞企']
|
132 |
+
marine,['枪兵']
|
133 |
+
sensor tower,['圈']
|
134 |
+
infantry armor,['人族防']
|
135 |
+
infantry weapons,['人族攻']
|
136 |
+
hive,['三本']
|
137 |
+
psionic storm,['闪电']
|
138 |
+
templar archives,['闪电塔']
|
139 |
+
sentry,['哨兵']
|
140 |
+
ground armor,['神族防']
|
141 |
+
ground weapons,['神族攻']
|
142 |
+
adept,['使徒']
|
143 |
+
resonating glaives,['使徒攻速']
|
144 |
+
reactor,['双倍挂件']
|
145 |
+
pylon,['水晶']
|
146 |
+
reaper,['死神']
|
147 |
+
drilling claws,['速埋']
|
148 |
+
swarm host,['宿主']
|
149 |
+
mag-field accelerator,['锁定增伤']
|
150 |
+
siege tank,['坦克']
|
151 |
+
probe,['探机']
|
152 |
+
corvid reactor,['铁鸦能量']
|
153 |
+
neural parasite,['同化完成']
|
154 |
+
viking,['维京']
|
155 |
+
oracle,['先知']
|
156 |
+
locust,['小虫子']
|
157 |
+
mothership core,['小妈妈船']
|
158 |
+
orbital command,['星轨']
|
159 |
+
stimpack,['兴奋剂']
|
160 |
+
void ray,['虚空']
|
161 |
+
flux vanes,['虚空速度']
|
162 |
+
overseer,['眼虫']
|
163 |
+
ignite afterburners,['医疗机速度']
|
164 |
+
dark templar,['隐刀']
|
165 |
+
shadow stride,['隐刀闪现']
|
166 |
+
dark shrine,['隐刀塔']
|
167 |
+
cloaking field,['隐形']
|
168 |
+
personal cloaking,['隐形']
|
169 |
+
medivac dropship,['运输机']
|
170 |
+
vehicle and ship plating,['战车及舰船钢板等级']
|
171 |
+
vehicle weapons,['战车武器等级']
|
172 |
+
war hound,['战狼']
|
173 |
+
roach warren,['蟑螂巢']
|
174 |
+
tunneling claws,['蟑螂埋地']
|
175 |
+
glial reconstitution,['蟑螂速度']
|
176 |
+
concussive shells,['震撼弹']
|
177 |
+
stalker,['追猎']
|
178 |
+
disruptor,['自爆球']
|
179 |
+
protross,['神族']
|
180 |
+
gas,['气矿']
|
181 |
+
hatch,['基地']
|
182 |
+
supply,['人口']
|
183 |
+
macro,['运营']
|
184 |
+
natural,['二矿']
|
185 |
+
proxy,['前置']
|
186 |
+
kite,['甩']
|
187 |
+
cyber core,['控制芯核']
|
188 |
+
saturate,['满采']
|
189 |
+
nydusworm,['虫洞']
|
190 |
+
float,['余钱']
|
191 |
+
transfuse,['加血']
|
192 |
+
chrono boost,['加速']
|
193 |
+
war prism,['棱镜']
|
pipeline.py
CHANGED
@@ -9,6 +9,7 @@ import whisper
|
|
9 |
from srt2ass import srt2ass
|
10 |
import logging
|
11 |
from datetime import datetime
|
|
|
12 |
|
13 |
import subprocess
|
14 |
|
@@ -109,7 +110,10 @@ def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file =
|
|
109 |
|
110 |
# use stable-whisper
|
111 |
elif method == "stable":
|
112 |
-
|
|
|
|
|
|
|
113 |
transcript = model.transcribe(audio_path, regroup = False, initial_prompt="Hello, welcome to my lecture. Are you good my friend?")
|
114 |
(
|
115 |
transcript
|
@@ -265,7 +269,9 @@ def main():
|
|
265 |
|
266 |
audio_path, audio_file, video_path, VIDEO_NAME = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
|
267 |
|
268 |
-
|
|
|
|
|
269 |
logging.info("---------------------Video Info---------------------")
|
270 |
logging.info("Video name: {}, translation model: {}, video link: {}".format(VIDEO_NAME, args.model_name, args.link))
|
271 |
|
@@ -313,7 +319,7 @@ def main():
|
|
313 |
os.system(f'ffmpeg -i {video_path} -vf "subtitles={RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.ass" {RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}.mp4')
|
314 |
|
315 |
end_time = time.time()
|
316 |
-
logging.info("Pipeline finished, time duration:{}".format(
|
317 |
|
318 |
if __name__ == "__main__":
|
319 |
main()
|
|
|
9 |
from srt2ass import srt2ass
|
10 |
import logging
|
11 |
from datetime import datetime
|
12 |
+
import torch
|
13 |
|
14 |
import subprocess
|
15 |
|
|
|
110 |
|
111 |
# use stable-whisper
|
112 |
elif method == "stable":
|
113 |
+
|
114 |
+
# use cuda if available
|
115 |
+
devices = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
116 |
+
model = stable_whisper.load_model(whisper_model, device = devices)
|
117 |
transcript = model.transcribe(audio_path, regroup = False, initial_prompt="Hello, welcome to my lecture. Are you good my friend?")
|
118 |
(
|
119 |
transcript
|
|
|
269 |
|
270 |
audio_path, audio_file, video_path, VIDEO_NAME = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
|
271 |
|
272 |
+
if not os.path.exists(args.log_dir):
|
273 |
+
os.makedirs(args.log_dir)
|
274 |
+
logging.basicConfig(level=logging.INFO, handlers=[logging.FileHandler("{}/{}_{}.log".format(args.log_dir, VIDEO_NAME, datetime.now().strftime("%m%d%Y_%H%M%S")), 'w', encoding='utf-8')])
|
275 |
logging.info("---------------------Video Info---------------------")
|
276 |
logging.info("Video name: {}, translation model: {}, video link: {}".format(VIDEO_NAME, args.model_name, args.link))
|
277 |
|
|
|
319 |
os.system(f'ffmpeg -i {video_path} -vf "subtitles={RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.ass" {RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}.mp4')
|
320 |
|
321 |
end_time = time.time()
|
322 |
+
logging.info("Pipeline finished, time duration:{}".format(time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))))
|
323 |
|
324 |
if __name__ == "__main__":
|
325 |
main()
|