Spaces:
Sleeping
Sleeping
DWizard
committed on
Commit
•
be41edb
1
Parent(s):
5e8cc27
new dict utils need prune
Browse files
Former-commit-id: c3d204c86e52f7ef0958fd59487a12505addea3c
- dict_util.py +26 -1
- domain_dict/SC2/EN.csv +36 -36
- domain_dict/SC2/ZH.csv +34 -34
- src/srt_util/srt.py +19 -18
dict_util.py
CHANGED
@@ -52,4 +52,29 @@ with open("../test.csv", "w", encoding='utf-8') as w:
|
|
52 |
export_csv_dict(term_dict_sc2,w)
|
53 |
|
54 |
## for load pickle, just:
|
55 |
-
# pickle.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
export_csv_dict(term_dict_sc2,w)
|
53 |
|
54 |
## for load pickle, just:
|
55 |
+
# pickle.load(f)
|
56 |
+
|
57 |
+
|
58 |
+
def form_dict(src_path, tgt_path) -> dict:
    """Build a term lookup table from two row-aligned CSV files.

    Row ``i`` of the source file lists the spellings of one term (one per
    comma-separated cell); row ``i`` of the target file lists the
    translations of that same term.  Every non-blank source spelling becomes
    a key whose value is the list of non-blank translations found in the
    matching target row.  All spellings of one term share the same list.

    Args:
        src_path: Path to the UTF-8 encoded source-language CSV file.
        tgt_path: Path to the UTF-8 encoded target-language CSV file.

    Returns:
        A dict mapping each source spelling to a non-empty list of
        translations.  Rows that have no counterpart in the other file, or
        whose target row holds no translation, are left out.
    """
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(src_path, "r", encoding="utf-8", newline="") as file:
        src_rows = list(csv.reader(file, delimiter=","))
    with open(tgt_path, "r", encoding="utf-8", newline="") as file:
        tgt_rows = list(csv.reader(file, delimiter=","))

    final_dict = {}
    # zip() stops at the shorter file, so a length mismatch between the two
    # CSVs can no longer raise IndexError part-way through the build.
    for spellings, target_row in zip(src_rows, tgt_rows):
        translations = [cell for cell in target_row if cell.strip()]
        if not translations:
            # A term without any translation cannot be substituted.
            continue
        for spelling in spellings:
            # A blank key would be a substring of every text it is tested
            # against, so blank cells (e.g. from a trailing comma) are skipped.
            if spelling.strip():
                final_dict[spelling] = translations
    return final_dict
|
68 |
+
|
69 |
+
|
70 |
+
class term_dict(dict):
    """Domain term dictionary loaded from a pair of per-language CSV files.

    Keys are source-language spellings; each value is the list of candidate
    translations produced by ``form_dict`` for that spelling.
    """

    def __init__(self, path, src_lang, tgt_lang) -> None:
        """Load ``{path}/{src_lang}.csv`` and ``{path}/{tgt_lang}.csv``.

        Args:
            path: Directory that holds the per-language CSV files.
            src_lang: Base name (without ``.csv``) of the source-language file.
            tgt_lang: Base name (without ``.csv``) of the target-language file.
        """
        src_dict = f"{path}/{src_lang}.csv"
        tgt_dict = f"{path}/{tgt_lang}.csv"
        super().__init__(form_dict(src_dict, tgt_dict))

    def get(self, key: str, default=None):
        """Return one randomly chosen translation for ``key``.

        Unlike the plain ``dict.get``, the stored value is a list of
        candidates and a single element of it is returned.  The optional
        ``default`` keeps the method call-compatible with ``dict.get``.

        Args:
            key: Source-language spelling to look up.
            default: Value returned when ``key`` is absent or has no
                candidate translations.

        Returns:
            A randomly selected translation, or ``default`` when none exists.
        """
        # One lookup through the base class; avoids KeyError on unknown keys.
        candidates = super().get(key)
        if not candidates:
            # Covers both a missing key and an empty candidate list, for
            # which randint(0, -1) would raise ValueError.
            return default
        return candidates[randint(0, len(candidates) - 1)]
|
80 |
+
|
domain_dict/SC2/EN.csv
CHANGED
@@ -1,43 +1,43 @@
|
|
1 |
-
|
2 |
-
zerg
|
3 |
-
protoss
|
4 |
-
terran
|
5 |
engineering bay,engin bay
|
6 |
-
forge
|
7 |
-
blink
|
8 |
-
evolution chamber
|
9 |
cybernetics core,cybercore
|
10 |
-
enhanced shockwaves
|
11 |
-
gravitic boosters
|
12 |
-
armory
|
13 |
robotics bay,robo bay
|
14 |
twilight council,twilight
|
15 |
-
fusion core
|
16 |
-
fleet beacon
|
17 |
-
factory
|
18 |
-
ghost academy
|
19 |
-
infestation pit
|
20 |
robotics facility,robo
|
21 |
-
stargate
|
22 |
-
starport
|
23 |
-
archon
|
24 |
-
smart servos
|
25 |
-
gateway
|
26 |
-
warpgate
|
27 |
-
immortal
|
28 |
-
zealot
|
29 |
-
nydus network
|
30 |
-
nydus worm
|
31 |
hydralisk,hydra
|
32 |
-
grooved spines
|
33 |
-
muscular augments
|
34 |
hydralisk den,hydra den
|
35 |
-
planetary fortress
|
36 |
-
battle cruiser
|
37 |
-
weapon refit
|
38 |
-
brood lord
|
39 |
-
broodling
|
40 |
-
greater spire
|
41 |
-
anabolic synthesis
|
42 |
-
cyclone
|
43 |
-
bunker
|
|
|
1 |
+
barracks
|
2 |
+
zerg
|
3 |
+
protoss
|
4 |
+
terran
|
5 |
engineering bay,engin bay
|
6 |
+
forge
|
7 |
+
blink
|
8 |
+
evolution chamber
|
9 |
cybernetics core,cybercore
|
10 |
+
enhanced shockwaves
|
11 |
+
gravitic boosters
|
12 |
+
armory
|
13 |
robotics bay,robo bay
|
14 |
twilight council,twilight
|
15 |
+
fusion core
|
16 |
+
fleet beacon
|
17 |
+
factory
|
18 |
+
ghost academy
|
19 |
+
infestation pit
|
20 |
robotics facility,robo
|
21 |
+
stargate
|
22 |
+
starport
|
23 |
+
archon
|
24 |
+
smart servos
|
25 |
+
gateway
|
26 |
+
warpgate
|
27 |
+
immortal
|
28 |
+
zealot
|
29 |
+
nydus network
|
30 |
+
nydus worm
|
31 |
hydralisk,hydra
|
32 |
+
grooved spines
|
33 |
+
muscular augments
|
34 |
hydralisk den,hydra den
|
35 |
+
planetary fortress
|
36 |
+
battle cruiser
|
37 |
+
weapon refit
|
38 |
+
brood lord
|
39 |
+
broodling
|
40 |
+
greater spire
|
41 |
+
anabolic synthesis
|
42 |
+
cyclone
|
43 |
+
bunker
|
domain_dict/SC2/ZH.csv
CHANGED
@@ -1,43 +1,43 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
工程站,BE
|
6 |
BF,锻炉
|
7 |
-
|
8 |
-
|
9 |
BY,赛博核心
|
10 |
-
EMP
|
11 |
-
ob
|
12 |
-
|
13 |
机械研究所,VB
|
14 |
光影议会,VC
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
VR,机械台
|
21 |
神族VS,星门
|
22 |
星港,人族VS
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
大地堡,行星要塞
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
1 |
+
兵营
|
2 |
+
虫族
|
3 |
+
神族
|
4 |
+
人族
|
5 |
工程站,BE
|
6 |
BF,锻炉
|
7 |
+
闪现
|
8 |
+
进化腔
|
9 |
BY,赛博核心
|
10 |
+
EMP范围
|
11 |
+
ob速度
|
12 |
+
军械库
|
13 |
机械研究所,VB
|
14 |
光影议会,VC
|
15 |
+
聚变芯体
|
16 |
+
舰队航标
|
17 |
+
重工厂
|
18 |
+
幽灵军校
|
19 |
+
感染深渊
|
20 |
VR,机械台
|
21 |
神族VS,星门
|
22 |
星港,人族VS
|
23 |
+
白球
|
24 |
+
变形加速
|
25 |
+
传送门
|
26 |
+
折跃门
|
27 |
+
不朽
|
28 |
+
叉叉
|
29 |
+
虫道网络
|
30 |
+
坑道虫
|
31 |
+
刺蛇
|
32 |
+
刺蛇射程
|
33 |
+
刺蛇速度
|
34 |
+
刺蛇塔
|
35 |
大地堡,行星要塞
|
36 |
+
大和
|
37 |
+
大和炮
|
38 |
+
大龙
|
39 |
+
巢虫
|
40 |
+
大龙塔
|
41 |
+
大牛速度
|
42 |
+
导弹车
|
43 |
+
地堡
|
src/srt_util/srt.py
CHANGED
@@ -7,6 +7,7 @@ from datetime import timedelta
|
|
7 |
import logging
|
8 |
import openai
|
9 |
from tqdm import tqdm
|
|
|
10 |
|
11 |
# punctuation dictionary for supported languages
|
12 |
punctuation_dict = {
|
@@ -161,9 +162,13 @@ class SrtScript(object):
|
|
161 |
if self.domain != "General":
|
162 |
if os.path.exists(f"{dict_path}/{self.domain}"):
|
163 |
# TODO: load dictionary
|
|
|
|
|
164 |
...
|
165 |
else:
|
166 |
-
logging.error(f"domain {self.domain} doesn't exist")
|
|
|
|
|
167 |
|
168 |
@classmethod
|
169 |
def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
|
@@ -444,23 +449,19 @@ class SrtScript(object):
|
|
444 |
if self.domain == "General":
|
445 |
logging.info("General domain could not perform correct_with_force_term. skip this step.")
|
446 |
pass
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
logging.info(
|
461 |
-
"replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(
|
462 |
-
i + 1))
|
463 |
-
logging.info("source text becomes: " + seg.source_text)
|
464 |
|
465 |
comp_dict = []
|
466 |
|
|
|
7 |
import logging
|
8 |
import openai
|
9 |
from tqdm import tqdm
|
10 |
+
import dict_util
|
11 |
|
12 |
# punctuation dictionary for supported languages
|
13 |
punctuation_dict = {
|
|
|
162 |
if self.domain != "General":
|
163 |
if os.path.exists(f"{dict_path}/{self.domain}"):
|
164 |
# TODO: load dictionary
|
165 |
+
self.dict = dict_util.term_dict(f"{dict_path}/{self.domain}", src_lang, tgt_lang)
|
166 |
+
print(self.dict["robo"])
|
167 |
...
|
168 |
else:
|
169 |
+
logging.error(f"domain {self.domain} doesn't exist, fallback to general domain, this will disable correct_with_force_term and spell_check_term")
|
170 |
+
self.domain = "General"
|
171 |
+
|
172 |
|
173 |
@classmethod
|
174 |
def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
|
|
|
449 |
if self.domain == "General":
|
450 |
logging.info("General domain could not perform correct_with_force_term. skip this step.")
|
451 |
pass
|
452 |
+
else:
|
453 |
+
keywords = list(self.dict.keys())
|
454 |
+
keywords.sort(key=lambda x: len(x), reverse=True)
|
455 |
+
|
456 |
+
for word in keywords:
|
457 |
+
for i, seg in enumerate(self.segments):
|
458 |
+
if word in seg.source_text.lower():
|
459 |
+
seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(self.dict.get(word)),
|
460 |
+
seg.source_text, flags=re.IGNORECASE)
|
461 |
+
logging.info(
|
462 |
+
"replace term: " + word + " --> " + self.dict.get(word) + " in time stamp {}".format(
|
463 |
+
i + 1))
|
464 |
+
logging.info("source text becomes: " + seg.source_text)
|
|
|
|
|
|
|
|
|
465 |
|
466 |
comp_dict = []
|
467 |
|