yichenl5 committed on
Commit
906c785
2 Parent(s): 31aaedc 0a976db

Merge branch 'eason/main' into Macrodove/suggestionFunc

Browse files
SRT.py CHANGED
@@ -402,7 +402,7 @@ class SRT_script():
402
  for word in keywords:
403
  for i, seg in enumerate(self.segments):
404
  if word in seg.source_text.lower():
405
- seg.source_text = seg.source_text.lower().replace(word, term_enzh_dict.get(word))
406
  logging.info("replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(i+1))
407
  logging.info("source text becomes: " + seg.source_text)
408
 
@@ -436,13 +436,13 @@ class SRT_script():
436
  for i in range(len(ready_words)):
437
  word = ready_words[i]
438
  [real_word, pos] = self.get_real_word(word)
439
- if not dict.check(word[:pos]):
440
  new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
441
 
442
- logging.info(word + "\t" + self.fetchfunc(word[:pos],0.5) + "\t" + str(enchant.utils.levenshtein(word, self.fetchfunc(word[:pos],0.5)))+'\n')
443
 
444
  #suggest = term_spellDict.suggest(real_word)
445
- #if suggest and enchant.utils.levenshtein(word, suggest[0]) < (len(word)+len(suggest[0]))/4: # relax spell check
446
 
447
  # with open("dislog.log","a") as log:
448
  # if not os.path.exists("dislog.log"):
 
402
  for word in keywords:
403
  for i, seg in enumerate(self.segments):
404
  if word in seg.source_text.lower():
405
+ seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(term_enzh_dict.get(word)), seg.source_text, flags=re.IGNORECASE)
406
  logging.info("replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(i+1))
407
  logging.info("source text becomes: " + seg.source_text)
408
 
 
436
  for i in range(len(ready_words)):
437
  word = ready_words[i]
438
  [real_word, pos] = self.get_real_word(word)
439
+ if not dict.check(word[:pos]) and not term_spellDict.check(real_word):
440
  new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
441
 
442
+ logging.info(real_word + "\t" + self.fetchfunc(word[:pos],0.5) + "\t" + str(enchant.utils.levenshtein(real_word, self.fetchfunc(word[:pos],0.5)))+'\n')
443
 
444
  #suggest = term_spellDict.suggest(real_word)
445
+ #if suggest and enchant.utils.levenshtein(real_word, suggest[0]) < (len(real_word)+len(suggest[0]))/4: # relax spell check
446
 
447
  # with open("dislog.log","a") as log:
448
  # if not os.path.exists("dislog.log"):
dict_util.py CHANGED
@@ -7,11 +7,12 @@ import pickle
7
  # 1_2_3, 1 is action, 2 is supply object, 3 is accept object
8
  def update_dict_csv(term_dict, f):
9
  for rows in csv.reader(f):
10
- if rows[0] in term_dict:
11
- if rows[1] not in term_dict[rows[0]]:
12
- term_dict[rows[0]] = term_dict[rows[0]]+[rows[1]]
 
13
  else:
14
- term_dict[rows[0]]=[rows[1]]
15
  pass
16
 
17
  def export_dict_csv(term_dict, f):
@@ -21,17 +22,14 @@ def export_dict_csv(term_dict, f):
21
 
22
  def save_dict_pickle(term_dict, f):
23
  pickle.dump(term_dict, f, pickle.HIGHEST_PROTOCOL)
 
24
 
25
  def update_csv_pickle(pickle_f, csv_f):
26
  term_dict = pickle.load(pickle_f)
27
- for rows in csv.reader(csv_f):
28
- if rows[0] in term_dict:
29
- if rows[1] not in term_dict[rows[0]]:
30
- term_dict[rows[0]] = term_dict[rows[0]]+[rows[1]]
31
- else:
32
- term_dict[rows[0]]=[rows[1]]
33
  #save to pickle file, highest protocol to get better performance
34
  pickle.dump(term_dict, pickle_f, pickle.HIGHEST_PROTOCOL)
 
35
 
36
 
37
  #demo
 
7
  # 1_2_3, 1 is action, 2 is supply object, 3 is accept object
8
  def update_dict_csv(term_dict, f):
9
  for rows in csv.reader(f):
10
+ word = rows[0].lower()
11
+ if word in term_dict:
12
+ if rows[1] not in term_dict[word]:
13
+ term_dict[word] = term_dict[word]+[rows[1]]
14
  else:
15
+ term_dict[word]=[rows[1]]
16
  pass
17
 
18
  def export_dict_csv(term_dict, f):
 
22
 
23
  def save_dict_pickle(term_dict, f):
24
  pickle.dump(term_dict, f, pickle.HIGHEST_PROTOCOL)
25
+ pass
26
 
27
  def update_csv_pickle(pickle_f, csv_f):
28
  term_dict = pickle.load(pickle_f)
29
+ update_dict_csv(term_dict, csv_f)
 
 
 
 
 
30
  #save to pickle file, highest protocol to get better performance
31
  pickle.dump(term_dict, pickle_f, pickle.HIGHEST_PROTOCOL)
32
+ pass
33
 
34
 
35
  #demo
finetune_data/dict_enzh.csv CHANGED
@@ -179,4 +179,18 @@ stalker,追猎
179
  disruptor,自爆球
180
  zerg,虫族
181
  protross,神族
182
- terran,人族
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  disruptor,自爆球
180
  zerg,虫族
181
  protross,神族
182
+ terran,人族
183
+ gas,气矿
184
+ cannon rush,野炮台
185
+ supply block,卡人口
186
+ macro,运营
187
+ natural expansion,开矿
188
+ roach warren,蟑螂虫巢
189
+ proxy,野
190
+ cyber core,控制芯核
191
+ prismatic alignment,充能射线
192
+ flooding,余钱
193
+ chrono boost,星空加速
194
+ cheese,狗
195
+ supply,人口
196
+ war prism,棱镜
finetune_data/dict_freq.txt CHANGED
@@ -23,8 +23,8 @@ gateway
23
  warpgate
24
  immortal
25
  zealot
26
- nydus network
27
- nydus worm
28
  hydralisk
29
  grooved spines
30
  muscular augments
@@ -173,12 +173,12 @@ concussive shells
173
  stalker
174
  disruptor
175
  zerg
176
- protross
177
  terran
178
  starcraft
179
  TvT
180
  Maxpax
181
- showtime
182
  PvP
183
  ZvZ
184
  TvZ
@@ -187,7 +187,7 @@ ZvP
187
  PvZ
188
  PvT
189
  ZvT
190
- Florencio
191
  cybercore
192
  nest
193
- follow-up
 
 
23
  warpgate
24
  immortal
25
  zealot
26
+ nydus
27
+ worm
28
  hydralisk
29
  grooved spines
30
  muscular augments
 
173
  stalker
174
  disruptor
175
  zerg
176
+ protoss
177
  terran
178
  starcraft
179
  TvT
180
  Maxpax
181
+ ShowTime
182
  PvP
183
  ZvZ
184
  TvZ
 
187
  PvZ
188
  PvT
189
  ZvT
 
190
  cybercore
191
  nest
192
+ follow-up
193
+ robo
finetune_data/test.csv ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks,['兵营']
2
+ zerg,['虫族']
3
+ protoss,['神族']
4
+ terran,['人族']
5
+ engineering bay,['工程站']
6
+ forge,['锻炉']
7
+ blink,['闪现']
8
+ evolution chamber,['进化腔']
9
+ cybernetics core,['控制芯核']
10
+ cybercore,['控制芯核']
11
+ enhanced shockwaves,['EMP范围']
12
+ gravitic boosters,['ob速度']
13
+ armory,['军械库']
14
+ robotics bay,['机械研究所']
15
+ twilight council,['光影议会']
16
+ fusion core,['聚变芯体']
17
+ fleet beacon,['舰队航标']
18
+ factory,['重工厂']
19
+ ghost academy,['幽灵军校']
20
+ infestation pit,['感染深渊']
21
+ robotics facility,['机械台']
22
+ stargate,['星门']
23
+ starport,['星港']
24
+ archon,['白球']
25
+ smart servos,['变形加速']
26
+ gateway,['兵营']
27
+ warpgate,['兵营']
28
+ immortal,['不朽']
29
+ zealot,['叉叉']
30
+ nydus network,['虫洞']
31
+ nydus worm,['虫洞']
32
+ hydralisk,['刺蛇']
33
+ grooved spines,['刺蛇射程']
34
+ muscular augments,['刺蛇速度']
35
+ hydralisk den,['刺蛇塔']
36
+ planetary fortress,['大地堡']
37
+ battle cruiser,['大和']
38
+ weapon refit,['大和炮']
39
+ brood lord,['大龙']
40
+ broodling,"['巢虫', '小虫子']"
41
+ greater spire,['大龙塔']
42
+ anabolic synthesis,['大牛速度']
43
+ cyclone,['导弹车']
44
+ bunker,['地堡']
45
+ lurker,['地刺']
46
+ seismic spines,['地刺射程']
47
+ adaptive talons,['地刺速埋']
48
+ lurker den,['地刺塔']
49
+ widow mine,['地雷']
50
+ ground carapace,['地面单位甲壳等级']
51
+ high templar,['电兵']
52
+ shield battery,['电池']
53
+ observer,"['叮当', 'OB']"
54
+ baneling,['毒爆']
55
+ centrifugal hooks,['毒爆速度']
56
+ baneling nest,['毒爆塔']
57
+ raven,['渡鸦']
58
+ combat shield,['盾']
59
+ shield,['盾']
60
+ lair,['二本']
61
+ missile turret,['防空']
62
+ spore crawler,['防空']
63
+ supply depot,['房子']
64
+ overlord,['房子']
65
+ pneumatized carapace,['房子速度']
66
+ mutalisk,['飞龙']
67
+ spire,['飞龙塔']
68
+ viper,['飞蛇']
69
+ flyer attacks,['飞行生物攻击等级']
70
+ flyer carapace,['飞行生物甲壳等级']
71
+ tempest,['风暴']
72
+ tectonic destabilizers,['风暴伤害']
73
+ phoenix,['凤凰']
74
+ anion pulse-crystals,['凤凰射程']
75
+ corruptor,['腐化']
76
+ infestor,['感染虫']
77
+ pathogen glands,['感染能量']
78
+ zergling,['狗']
79
+ spawning pool,['狗池']
80
+ metabolic boost,['狗速']
81
+ spine crawler,['管子']
82
+ marauder,['光头']
83
+ ghost,['鬼兵']
84
+ arm silo with nuke,['核弹']
85
+ carrier,['航母']
86
+ hellion,['火车']
87
+ hellbat,['火车侠']
88
+ ravager,['火蟑螂']
89
+ nexus,['基地']
90
+ hatchery,['基地']
91
+ command center,['基地']
92
+ neosteel armor,['建筑护甲']
93
+ hi-sec auto tracking,['建筑射程']
94
+ ship weapons,['舰船武器等级']
95
+ charge,['脚速']
96
+ liberator,['解放']
97
+ advanced ballistics,['解放射程']
98
+ melee attacks,['近战攻击等级']
99
+ colossus,['巨像']
100
+ extended thermal lance,['巨像射程']
101
+ creep tumor,['菌毯']
102
+ tech lab,['科技挂件']
103
+ air armor,['空中单位护甲等级']
104
+ air weapons,['空中单位武器等级']
105
+ adrenal glands,['狂狗']
106
+ mule,['矿螺']
107
+ infernal pre-igniter,['蓝火']
108
+ thor,['雷神']
109
+ warp prism,['棱镜']
110
+ gravitic drive,['棱镜速度']
111
+ dragoon,['龙骑士']
112
+ cocoon,['卵']
113
+ larva,['卵']
114
+ mothership,['妈妈船']
115
+ burrow,['埋地']
116
+ changeling,"['拟态虫', '小虫子']"
117
+ ultralisk,['牛']
118
+ chitinous plating,['牛甲']
119
+ ultralisk cavern,['牛塔']
120
+ drone,['农民']
121
+ scv,['农民']
122
+ queen,['女王']
123
+ banshee,['女妖']
124
+ hyperflight rotors,['女妖提速']
125
+ photon cannon,['炮台']
126
+ cannon,[' 炮台']
127
+ missile attacks,['喷射攻击等级']
128
+ assimilator,['气矿']
129
+ extractor,['气矿']
130
+ refinery,['气矿']
131
+ roach,['钱赞企']
132
+ marine,['枪兵']
133
+ sensor tower,['圈']
134
+ infantry armor,['人族防']
135
+ infantry weapons,['人族攻']
136
+ hive,['三本']
137
+ psionic storm,['闪电']
138
+ templar archives,['闪电塔']
139
+ sentry,['哨兵']
140
+ ground armor,['神族防']
141
+ ground weapons,['神族攻']
142
+ adept,['使徒']
143
+ resonating glaives,['使徒攻速']
144
+ reactor,['双倍挂件']
145
+ pylon,['水晶']
146
+ reaper,['死神']
147
+ drilling claws,['速埋']
148
+ swarm host,['宿主']
149
+ mag-field accelerator,['锁定增伤']
150
+ siege tank,['坦克']
151
+ probe,['探机']
152
+ corvid reactor,['铁鸦能量']
153
+ neural parasite,['同化完成']
154
+ viking,['维京']
155
+ oracle,['先知']
156
+ locust,['小虫子']
157
+ mothership core,['小妈妈船']
158
+ orbital command,['星轨']
159
+ stimpack,['兴奋剂']
160
+ void ray,['虚空']
161
+ flux vanes,['虚空速度']
162
+ overseer,['眼虫']
163
+ ignite afterburners,['医疗机速度']
164
+ dark templar,['隐刀']
165
+ shadow stride,['隐刀闪现']
166
+ dark shrine,['隐刀塔']
167
+ cloaking field,['隐形']
168
+ personal cloaking,['隐形']
169
+ medivac dropship,['运输机']
170
+ vehicle and ship plating,['战车及舰船钢板等级']
171
+ vehicle weapons,['战车武器等级']
172
+ war hound,['战狼']
173
+ roach warren,['蟑螂巢']
174
+ tunneling claws,['蟑螂埋地']
175
+ glial reconstitution,['蟑螂速度']
176
+ concussive shells,['震撼弹']
177
+ stalker,['追猎']
178
+ disruptor,['自爆球']
179
+ protross,['神族']
180
+ gas,['气矿']
181
+ hatch,['基地']
182
+ supply,['人口']
183
+ macro,['运营']
184
+ natural,['二矿']
185
+ proxy,['前置']
186
+ kite,['甩']
187
+ cyber core,['控制芯核']
188
+ saturate,['满采']
189
+ nydusworm,['虫洞']
190
+ float,['余钱']
191
+ transfuse,['加血']
192
+ chrono boost,['加速']
193
+ war prism,['棱镜']
pipeline.py CHANGED
@@ -9,6 +9,7 @@ import whisper
9
  from srt2ass import srt2ass
10
  import logging
11
  from datetime import datetime
 
12
 
13
  import subprocess
14
 
@@ -109,7 +110,10 @@ def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file =
109
 
110
  # use stable-whisper
111
  elif method == "stable":
112
- model = stable_whisper.load_model(whisper_model)
 
 
 
113
  transcript = model.transcribe(audio_path, regroup = False, initial_prompt="Hello, welcome to my lecture. Are you good my friend?")
114
  (
115
  transcript
@@ -265,7 +269,9 @@ def main():
265
 
266
  audio_path, audio_file, video_path, VIDEO_NAME = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
267
 
268
- logging.basicConfig(level=logging.INFO, handlers=[logging.FileHandler("{}/{}_{}.log".format(args.log_dir, VIDEO_NAME, datetime.now().strftime("%m%d%Y_%H%M%S")))], encoding='utf-8')
 
 
269
  logging.info("---------------------Video Info---------------------")
270
  logging.info("Video name: {}, translation model: {}, video link: {}".format(VIDEO_NAME, args.model_name, args.link))
271
 
@@ -313,7 +319,7 @@ def main():
313
  os.system(f'ffmpeg -i {video_path} -vf "subtitles={RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.ass" {RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}.mp4')
314
 
315
  end_time = time.time()
316
- logging.info("Pipeline finished, time duration:{}".format(start_time - end_time))
317
 
318
  if __name__ == "__main__":
319
  main()
 
9
  from srt2ass import srt2ass
10
  import logging
11
  from datetime import datetime
12
+ import torch
13
 
14
  import subprocess
15
 
 
110
 
111
  # use stable-whisper
112
  elif method == "stable":
113
+
114
+ # use cuda if available
115
+ devices = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
116
+ model = stable_whisper.load_model(whisper_model, device = devices)
117
  transcript = model.transcribe(audio_path, regroup = False, initial_prompt="Hello, welcome to my lecture. Are you good my friend?")
118
  (
119
  transcript
 
269
 
270
  audio_path, audio_file, video_path, VIDEO_NAME = get_sources(args, DOWNLOAD_PATH, RESULT_PATH, VIDEO_NAME)
271
 
272
+ if not os.path.exists(args.log_dir):
273
+ os.makedirs(args.log_dir)
274
+ logging.basicConfig(level=logging.INFO, handlers=[logging.FileHandler("{}/{}_{}.log".format(args.log_dir, VIDEO_NAME, datetime.now().strftime("%m%d%Y_%H%M%S")), 'w', encoding='utf-8')])
275
  logging.info("---------------------Video Info---------------------")
276
  logging.info("Video name: {}, translation model: {}, video link: {}".format(VIDEO_NAME, args.model_name, args.link))
277
 
 
319
  os.system(f'ffmpeg -i {video_path} -vf "subtitles={RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.ass" {RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}.mp4')
320
 
321
  end_time = time.time()
322
+ logging.info("Pipeline finished, time duration:{}".format(time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))))
323
 
324
  if __name__ == "__main__":
325
  main()