Spaces:
Sleeping
Sleeping
DWizard
committed on
Commit
•
5a7c441
1
Parent(s):
6d9ba90
add forced term conversion into Chinese before translation
Browse files
Former-commit-id: 2b6ec94ed31ea352361c1591a6db9a4b3e775fb3
- finetune_data/dict.csv +173 -0
- pipeline.py +45 -3
finetune_data/dict.csv
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
barracks,兵营
|
2 |
+
engineering bay,工程站
|
3 |
+
forge,锻炉
|
4 |
+
blink,闪现
|
5 |
+
evolution chamber,进化腔
|
6 |
+
cybernetics core,控制芯核
|
7 |
+
enhanced shockwaves,EMP范围
|
8 |
+
gravitic boosters,ob速度
|
9 |
+
armory,军械库
|
10 |
+
robotics bay,机械研究所
|
11 |
+
twilight council,光影议会
|
12 |
+
fusion core,聚变芯体
|
13 |
+
fleet beacon,舰队航标
|
14 |
+
factory,重工厂
|
15 |
+
ghost academy,幽灵军校
|
16 |
+
infestation pit,感染深渊
|
17 |
+
robotics facility,机械台
|
18 |
+
stargate,星门
|
19 |
+
starport,星港
|
20 |
+
archon,白球
|
21 |
+
smart servos,变形加速
|
22 |
+
gateway,兵营
|
23 |
+
warpgate,兵营
|
24 |
+
immortal,不朽
|
25 |
+
zealot,叉叉
|
26 |
+
nydus network,虫洞
|
27 |
+
nydus worm,虫洞
|
28 |
+
hydralisk,刺蛇
|
29 |
+
grooved spines,刺蛇射程
|
30 |
+
muscular augments,刺蛇速度
|
31 |
+
hydralisk den,刺蛇塔
|
32 |
+
planetary fortress,大地堡
|
33 |
+
battle cruiser,大和
|
34 |
+
weapon refit,大和炮
|
35 |
+
brood lord,大龙
|
36 |
+
greater spire,大龙塔
|
37 |
+
anabolic synthesis,大牛速度
|
38 |
+
cyclone,导弹车
|
39 |
+
bunker,地堡
|
40 |
+
lurker,地刺
|
41 |
+
seismic spines,地刺射程
|
42 |
+
adaptive talons,地刺速埋
|
43 |
+
lurker den,地刺塔
|
44 |
+
widow mine,地雷
|
45 |
+
ground carapace,地面单位甲壳等级
|
46 |
+
high templar,电兵
|
47 |
+
shield battery,电池
|
48 |
+
observer,叮当
|
49 |
+
baneling,毒爆
|
50 |
+
centrifugal hooks,毒爆速度
|
51 |
+
baneling nest,毒爆塔
|
52 |
+
raven,渡鸦
|
53 |
+
combat shield,盾
|
54 |
+
shield,盾
|
55 |
+
lair,二本
|
56 |
+
missile turret,防空
|
57 |
+
spore crawler,防空
|
58 |
+
supply depot,房子
|
59 |
+
overlord,房子
|
60 |
+
pneumatized carapace,房子速度
|
61 |
+
mutalisk,飞龙
|
62 |
+
spire,飞龙塔
|
63 |
+
viper,飞蛇
|
64 |
+
flyer attacks,飞行生物攻击等级
|
65 |
+
flyer carapace,飞行生物甲壳等级
|
66 |
+
tempest,风暴
|
67 |
+
tectonic destabilizers,风暴伤害
|
68 |
+
phoenix,凤凰
|
69 |
+
anion pulse-crystals,凤凰射程
|
70 |
+
corruptor,腐化
|
71 |
+
infestor,感染虫
|
72 |
+
pathogen glands,感染能量
|
73 |
+
zergling,狗
|
74 |
+
spawning pool,狗池
|
75 |
+
metabolic boost,狗速
|
76 |
+
spine crawler,管子
|
77 |
+
marauder,光头
|
78 |
+
ghost,鬼兵
|
79 |
+
arm silo with nuke,核弹
|
80 |
+
carrier,黄金舰队
|
81 |
+
hellion,火车
|
82 |
+
hellbat,火车侠
|
83 |
+
ravager,火蟑螂
|
84 |
+
nexus,基地
|
85 |
+
hatchery,基地
|
86 |
+
command center,基地
|
87 |
+
neosteel armor,建筑护甲
|
88 |
+
hi-sec auto tracking,建筑射程
|
89 |
+
ship weapons,舰船武器等级
|
90 |
+
charge,脚速
|
91 |
+
liberator,解放
|
92 |
+
advanced ballistics,解放射程
|
93 |
+
melee attacks,近战攻击等级
|
94 |
+
colossus,巨像
|
95 |
+
extended thermal lance,巨像射程
|
96 |
+
creep tumor,菌毯
|
97 |
+
tech lab,科技挂件
|
98 |
+
air armor,空中单位护甲等级
|
99 |
+
air weapons,空中单位武器等级
|
100 |
+
adrenal glands,狂狗
|
101 |
+
mule,矿螺
|
102 |
+
infernal pre-igniter,蓝火
|
103 |
+
thor,雷神
|
104 |
+
warp prism,棱镜
|
105 |
+
gravitic drive,棱镜速度
|
106 |
+
dragoon,龙骑士
|
107 |
+
cocoon,卵
|
108 |
+
larva,卵
|
109 |
+
mothership,妈妈船
|
110 |
+
burrow,埋地
|
111 |
+
changeling,拟态虫
|
112 |
+
ultralisk,牛
|
113 |
+
chitinous plating,牛甲
|
114 |
+
ultralisk cavern,牛塔
|
115 |
+
drone,农民
|
116 |
+
scv,农民
|
117 |
+
queen,女王
|
118 |
+
banshee,女妖
|
119 |
+
hyperflight rotors,女妖提速
|
120 |
+
photon cannon,炮台
|
121 |
+
missile attacks,喷射攻击等级
|
122 |
+
assimilator,气矿
|
123 |
+
extractor,气矿
|
124 |
+
refinery,气矿
|
125 |
+
roach,钱赞企
|
126 |
+
marine,枪兵
|
127 |
+
sensor tower,圈
|
128 |
+
infantry armor,人族防
|
129 |
+
infantry weapons,人族攻
|
130 |
+
hive,三本
|
131 |
+
psionic storm,闪电
|
132 |
+
templar archives,闪电塔
|
133 |
+
sentry,哨兵
|
134 |
+
ground armor,神族防
|
135 |
+
ground weapons,神族攻
|
136 |
+
adept,使徒
|
137 |
+
resonating glaives,使徒攻速
|
138 |
+
reactor,双倍挂件
|
139 |
+
pylon,水晶
|
140 |
+
reaper,死神
|
141 |
+
drilling claws,速埋
|
142 |
+
swarm host,宿主
|
143 |
+
mag-field accelerator,锁定增伤
|
144 |
+
siege tank,坦克
|
145 |
+
probe,探机
|
146 |
+
corvid reactor,铁鸦能量
|
147 |
+
neural parasite,同化完成
|
148 |
+
viking,维京
|
149 |
+
oracle,先知
|
150 |
+
broodling,小虫子
|
151 |
+
locust,小虫子
|
152 |
+
mothership core,小妈妈船
|
153 |
+
orbital command,星轨
|
154 |
+
stimpack,兴奋剂
|
155 |
+
void ray,虚空
|
156 |
+
flux vanes,虚空速度
|
157 |
+
overseer,眼虫
|
158 |
+
ignite afterburners,医疗机速度
|
159 |
+
dark templar,隐刀
|
160 |
+
shadow stride,隐刀闪现
|
161 |
+
dark shrine,隐刀塔
|
162 |
+
cloaking field,隐形
|
163 |
+
personal cloaking,隐形
|
164 |
+
medivac dropship,运输机
|
165 |
+
vehicle and ship plating,战车及舰船钢板等级
|
166 |
+
vehicle weapons,战车武器等级
|
167 |
+
war hound,战狼
|
168 |
+
roach warren,蟑螂巢
|
169 |
+
tunneling claws,蟑螂埋地
|
170 |
+
glial reconstitution,蟑螂速度
|
171 |
+
concussive shells,震撼弹
|
172 |
+
stalker,追猎
|
173 |
+
disruptor,自爆球
|
pipeline.py
CHANGED
@@ -89,7 +89,7 @@ if not os.path.exists(f'{RESULT_PATH}/{VIDEO_NAME}'):
|
|
89 |
# Instead of using the script_en variable directly, we'll use script_input
|
90 |
srt_file_en = args.srt_file
|
91 |
if srt_file_en is not None:
|
92 |
-
with open(srt_file_en, 'r') as f:
|
93 |
script_input = f.read()
|
94 |
else:
|
95 |
# using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
|
@@ -110,7 +110,7 @@ else:
|
|
110 |
writer.write_result(transcript, srt)
|
111 |
|
112 |
# split the video script(open ai prompt limit: about 5000)
|
113 |
-
with open(srt_file_en, 'r') as f:
|
114 |
script_en = f.read()
|
115 |
script_input = script_en
|
116 |
|
@@ -119,9 +119,51 @@ if not args.only_srt:
|
|
119 |
assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
|
120 |
print('ASS subtitle saved as: ' + assSub_en)
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
# Split the video script by sentences and create chunks within the token limit
|
123 |
n_threshold = 1500 # Token limit for the GPT-3 model
|
124 |
-
script_split =
|
125 |
|
126 |
script_arr = []
|
127 |
script = ""
|
|
|
89 |
# Instead of using the script_en variable directly, we'll use script_input
|
90 |
srt_file_en = args.srt_file
|
91 |
if srt_file_en is not None:
|
92 |
+
with open(srt_file_en, 'r', encoding='utf-8') as f:
|
93 |
script_input = f.read()
|
94 |
else:
|
95 |
# using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
|
|
|
110 |
writer.write_result(transcript, srt)
|
111 |
|
112 |
# split the video script(open ai prompt limit: about 5000)
|
113 |
+
with open(srt_file_en, 'r', encoding='utf-8') as f:
|
114 |
script_en = f.read()
|
115 |
script_input = script_en
|
116 |
|
|
|
119 |
assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
|
120 |
print('ASS subtitle saved as: ' + assSub_en)
|
121 |
|
122 |
+
# Force-translate StarCraft II terms into Chinese according to the term dictionary
|
123 |
+
# TODO: handle abbreviated terms, e.g. VA, ob
|
124 |
+
# TODO: variety of translation
|
125 |
+
from csv import reader
|
126 |
+
import re
|
127 |
+
|
128 |
+
# read dict
|
129 |
+
with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
|
130 |
+
csv_reader = reader(f)
|
131 |
+
term_dict = {rows[0]:rows[1] for rows in csv_reader}
|
132 |
+
|
133 |
+
def clean_timestamp(lines):
|
134 |
+
new_lines = []
|
135 |
+
strinfo = re.compile('[0-9]+\n.{25},[0-9]{3}') # Note: use four backslashes (\\\\) to replace a single \
|
136 |
+
new_lines = strinfo.sub('_-_', lines)
|
137 |
+
print(new_lines)
|
138 |
+
return new_lines
|
139 |
+
|
140 |
+
|
141 |
+
ready_lines = re.sub('\n', '\n ', script_input)
|
142 |
+
ready_words = ready_lines.split(" ")
|
143 |
+
i = 0
|
144 |
+
while i < len(ready_words):
|
145 |
+
word = ready_words[i]
|
146 |
+
if word[-2:] == ".\n" :
|
147 |
+
if word[:-2].lower() in term_dict :
|
148 |
+
new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
|
149 |
+
ready_words[i] = new_word
|
150 |
+
else :
|
151 |
+
word += ' '
|
152 |
+
ready_words[i] = word
|
153 |
+
elif word.lower() in term_dict :
|
154 |
+
new_word = word.replace(word,term_dict.get(word.lower())) + ' '
|
155 |
+
ready_words[i] = new_word
|
156 |
+
else :
|
157 |
+
word += " "
|
158 |
+
ready_words[i]= word
|
159 |
+
i += 1
|
160 |
+
|
161 |
+
script_input_withForceTerm = re.sub('\n ', '\n', "".join(ready_words))
|
162 |
+
|
163 |
+
|
164 |
# Split the video script by sentences and create chunks within the token limit
|
165 |
n_threshold = 1500 # Token limit for the GPT-3 model
|
166 |
+
script_split = script_input_withForceTerm.split('.')
|
167 |
|
168 |
script_arr = []
|
169 |
script = ""
|