DWizard committed on
Commit
be41edb
1 Parent(s): 5e8cc27

new dict utils need prune

Browse files

Former-commit-id: c3d204c86e52f7ef0958fd59487a12505addea3c

Files changed (4) hide show
  1. dict_util.py +26 -1
  2. domain_dict/SC2/EN.csv +36 -36
  3. domain_dict/SC2/ZH.csv +34 -34
  4. src/srt_util/srt.py +19 -18
dict_util.py CHANGED
@@ -52,4 +52,29 @@ with open("../test.csv", "w", encoding='utf-8') as w:
52
  export_csv_dict(term_dict_sc2,w)
53
 
54
  ## for load pickle, just:
55
- # pickle.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  export_csv_dict(term_dict_sc2,w)
53
 
54
  ## for load pickle, just:
55
+ # pickle.load(f)
56
+
57
+
58
+ def form_dict(src_path,tgt_path) -> dict:
59
+ final_dict = {}
60
+ with open(src_path, 'r', encoding="utf-8") as file:
61
+ src_dict = list(csv.reader(file, delimiter=","))
62
+ with open(tgt_path, 'r', encoding="utf-8") as file:
63
+ tgt_dict = list(csv.reader(file, delimiter="," ))
64
+ for idx, value in enumerate(src_dict):
65
+ for item in value:
66
+ final_dict.update({item:tgt_dict[idx]})
67
+ return final_dict
68
+
69
+
70
+ class term_dict(dict):
71
+ def __init__(self, path, src_lang, tgt_lang) -> None:
72
+ src_dict = f"{path}/{src_lang}.csv"
73
+ tgt_dict = f"{path}/{tgt_lang}.csv"
74
+ super().__init__(form_dict(src_dict, tgt_dict))
75
+
76
+
77
+ def get(self, key:str) -> str:
78
+ word = self[key][randint(0,len(self[key])-1)]
79
+ return word
80
+
domain_dict/SC2/EN.csv CHANGED
@@ -1,43 +1,43 @@
1
- barracks,
2
- zerg,
3
- protoss,
4
- terran,
5
  engineering bay,engin bay
6
- forge,
7
- blink,
8
- evolution chamber,
9
  cybernetics core,cybercore
10
- enhanced shockwaves,
11
- gravitic boosters,
12
- armory,
13
  robotics bay,robo bay
14
  twilight council,twilight
15
- fusion core,
16
- fleet beacon,
17
- factory,
18
- ghost academy,
19
- infestation pit,
20
  robotics facility,robo
21
- stargate,
22
- starport,
23
- archon,
24
- smart servos,
25
- gateway,
26
- warpgate,
27
- immortal,
28
- zealot,
29
- nydus network,
30
- nydus worm,
31
  hydralisk,hydra
32
- grooved spines,
33
- muscular augments,
34
  hydralisk den,hydra den
35
- planetary fortress,
36
- battle cruiser,
37
- weapon refit,
38
- brood lord,
39
- broodling,
40
- greater spire,
41
- anabolic synthesis,
42
- cyclone,
43
- bunker,
 
1
+ barracks
2
+ zerg
3
+ protoss
4
+ terran
5
  engineering bay,engin bay
6
+ forge
7
+ blink
8
+ evolution chamber
9
  cybernetics core,cybercore
10
+ enhanced shockwaves
11
+ gravitic boosters
12
+ armory
13
  robotics bay,robo bay
14
  twilight council,twilight
15
+ fusion core
16
+ fleet beacon
17
+ factory
18
+ ghost academy
19
+ infestation pit
20
  robotics facility,robo
21
+ stargate
22
+ starport
23
+ archon
24
+ smart servos
25
+ gateway
26
+ warpgate
27
+ immortal
28
+ zealot
29
+ nydus network
30
+ nydus worm
31
  hydralisk,hydra
32
+ grooved spines
33
+ muscular augments
34
  hydralisk den,hydra den
35
+ planetary fortress
36
+ battle cruiser
37
+ weapon refit
38
+ brood lord
39
+ broodling
40
+ greater spire
41
+ anabolic synthesis
42
+ cyclone
43
+ bunker
domain_dict/SC2/ZH.csv CHANGED
@@ -1,43 +1,43 @@
1
- 兵营,
2
- 虫族,
3
- 神族,
4
- 人族,
5
  工程站,BE
6
  BF,锻炉
7
- 闪现,
8
- 进化腔,
9
  BY,赛博核心
10
- EMP范围,
11
- ob速度,
12
- 军械库,
13
  机械研究所,VB
14
  光影议会,VC
15
- 聚变芯体,
16
- 舰队航标,
17
- 重工厂,
18
- 幽灵军校,
19
- 感染深渊,
20
  VR,机械台
21
  神族VS,星门
22
  星港,人族VS
23
- 白球,
24
- 变形加速,
25
- 传送门,
26
- 折跃门,
27
- 不朽,
28
- 叉叉,
29
- 虫道网络,
30
- 坑道虫,
31
- 刺蛇,
32
- 刺蛇射程,
33
- 刺蛇速度,
34
- 刺蛇塔,
35
  大地堡,行星要塞
36
- 大和,
37
- 大和炮,
38
- 大龙,
39
- 巢虫,
40
- 大龙塔,
41
- 大牛速度,
42
- 导弹车,
43
- 地堡,
 
1
+ 兵营
2
+ 虫族
3
+ 神族
4
+ 人族
5
  工程站,BE
6
  BF,锻炉
7
+ 闪现
8
+ 进化腔
9
  BY,赛博核心
10
+ EMP范围
11
+ ob速度
12
+ 军械库
13
  机械研究所,VB
14
  光影议会,VC
15
+ 聚变芯体
16
+ 舰队航标
17
+ 重工厂
18
+ 幽灵军校
19
+ 感染深渊
20
  VR,机械台
21
  神族VS,星门
22
  星港,人族VS
23
+ 白球
24
+ 变形加速
25
+ 传送门
26
+ 折跃门
27
+ 不朽
28
+ 叉叉
29
+ 虫道网络
30
+ 坑道虫
31
+ 刺蛇
32
+ 刺蛇射程
33
+ 刺蛇速度
34
+ 刺蛇塔
35
  大地堡,行星要塞
36
+ 大和
37
+ 大和炮
38
+ 大龙
39
+ 巢虫
40
+ 大龙塔
41
+ 大牛速度
42
+ 导弹车
43
+ 地堡
src/srt_util/srt.py CHANGED
@@ -7,6 +7,7 @@ from datetime import timedelta
7
  import logging
8
  import openai
9
  from tqdm import tqdm
 
10
 
11
  # punctuation dictionary for supported languages
12
  punctuation_dict = {
@@ -161,9 +162,13 @@ class SrtScript(object):
161
  if self.domain != "General":
162
  if os.path.exists(f"{dict_path}/{self.domain}"):
163
  # TODO: load dictionary
 
 
164
  ...
165
  else:
166
- logging.error(f"domain {self.domain} doesn't exist")
 
 
167
 
168
  @classmethod
169
  def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
@@ -444,23 +449,19 @@ class SrtScript(object):
444
  if self.domain == "General":
445
  logging.info("General domain could not perform correct_with_force_term. skip this step.")
446
  pass
447
-
448
- # load term dictionary
449
- with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
450
- term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
451
-
452
- keywords = list(term_enzh_dict.keys())
453
- keywords.sort(key=lambda x: len(x), reverse=True)
454
-
455
- for word in keywords:
456
- for i, seg in enumerate(self.segments):
457
- if word in seg.source_text.lower():
458
- seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(term_enzh_dict.get(word)),
459
- seg.source_text, flags=re.IGNORECASE)
460
- logging.info(
461
- "replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(
462
- i + 1))
463
- logging.info("source text becomes: " + seg.source_text)
464
 
465
  comp_dict = []
466
 
 
7
  import logging
8
  import openai
9
  from tqdm import tqdm
10
+ import dict_util
11
 
12
  # punctuation dictionary for supported languages
13
  punctuation_dict = {
 
162
  if self.domain != "General":
163
  if os.path.exists(f"{dict_path}/{self.domain}"):
164
  # TODO: load dictionary
165
+ self.dict = dict_util.term_dict(f"{dict_path}/{self.domain}", src_lang, tgt_lang)
166
+ print(self.dict["robo"])
167
  ...
168
  else:
169
+ logging.error(f"domain {self.domain} doesn't exist, fallback to general domain, this will disable correct_with_force_term and spell_check_term")
170
+ self.domain = "General"
171
+
172
 
173
  @classmethod
174
  def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
 
449
  if self.domain == "General":
450
  logging.info("General domain could not perform correct_with_force_term. skip this step.")
451
  pass
452
+ else:
453
+ keywords = list(self.dict.keys())
454
+ keywords.sort(key=lambda x: len(x), reverse=True)
455
+
456
+ for word in keywords:
457
+ for i, seg in enumerate(self.segments):
458
+ if word in seg.source_text.lower():
459
+ seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(self.dict.get(word)),
460
+ seg.source_text, flags=re.IGNORECASE)
461
+ logging.info(
462
+ "replace term: " + word + " --> " + self.dict.get(word) + " in time stamp {}".format(
463
+ i + 1))
464
+ logging.info("source text becomes: " + seg.source_text)
 
 
 
 
465
 
466
  comp_dict = []
467