Eason Lu committed on
Commit
36e5125
·
2 Parent(s): f1a218d 0e88845

solve conflict

Browse files

Former-commit-id: 7186c50ef2011d375f20a4bcde3ce078999d3aa6

SRT.py CHANGED
@@ -1,6 +1,5 @@
1
  from datetime import timedelta
2
  from csv import reader
3
- import re
4
 
5
  class SRT_segment(object):
6
  def __init__(self, *args) -> None:
@@ -24,7 +23,7 @@ class SRT_segment(object):
24
  self.end_time_str = str(0)+str(self.end_time).split('.')[0]+',000'
25
  else:
26
  self.end_time_str = str(0)+str(self.end_time).split('.')[0]+','+self.end_time.split('.')[1][:3]
27
- self.source_text = segment['text'][1:]
28
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
29
  self.translation = ""
30
 
@@ -215,29 +214,114 @@ class SRT_script():
215
  # TODO: variety of translation
216
 
217
  # load term dictionary
218
- with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
219
- csv_reader = reader(f)
220
- term_dict = {rows[0]:rows[1] for rows in csv_reader}
221
 
222
  # change term
223
  for seg in self.segments:
224
- ready_words = re.sub('\n', '\n ', seg.source_text).split(" ")
225
  for i in range(len(ready_words)):
226
  word = ready_words[i]
227
- if word[-2:] == ".\n" :
228
- if word[:-2].lower() in term_dict :
229
- new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
230
- ready_words[i] = new_word
231
- else:
232
- ready_words[i] = word + ' '
233
- elif word.lower() in term_dict :
234
- new_word = word.replace(word,term_dict.get(word.lower())) + ' '
235
- ready_words[i] = new_word
236
- else :
237
- ready_words[i]= word + ' '
238
- seg.source_text = re.sub('\n ', '\n', "".join(ready_words))
 
 
 
 
 
 
239
  pass
240
 
241
-
 
 
 
 
 
 
242
 
243
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from datetime import timedelta
2
  from csv import reader
 
3
 
4
  class SRT_segment(object):
5
  def __init__(self, *args) -> None:
 
23
  self.end_time_str = str(0)+str(self.end_time).split('.')[0]+',000'
24
  else:
25
  self.end_time_str = str(0)+str(self.end_time).split('.')[0]+','+self.end_time.split('.')[1][:3]
26
+ self.source_text = segment['text']
27
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
28
  self.translation = ""
29
 
 
214
  # TODO: variety of translation
215
 
216
  # load term dictionary
217
+ with open("dict_enzh.csv",'r', encoding='utf-8') as f:
218
+ term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
 
219
 
220
  # change term
221
  for seg in self.segments:
222
+ ready_words = seg.source_text.split(" ")
223
  for i in range(len(ready_words)):
224
  word = ready_words[i]
225
+ [real_word, pos] = self.get_real_word(word)
226
+ if real_word in term_enzh_dict:
227
+ new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
228
+ else:
229
+ new_word = word
230
+ ready_words[i] = new_word
231
+ # if word[-2:] == ".\n":
232
+ # if word[:-2].lower() in term_enzh_dict:
233
+ # new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
234
+ # ready_words[i] = new_word
235
+ # else:
236
+ # ready_words[i] = word
237
+ # elif word.lower() in term_enzh_dict:
238
+ # new_word = word.replace(word,term_enzh_dict.get(word.lower()))
239
+ # ready_words[i] = new_word
240
+ # else:
241
+ # ready_words[i]= word
242
+ seg.source_text = " ".join(ready_words)
243
  pass
244
 
245
def spell_check_term(self):
    """Replace misspelled domain terms in every segment's source text.

    Each space-separated token of ``seg.source_text`` is split into a stem
    and its trailing punctuation with ``self.get_real_word``. A token is
    left untouched when the ``en_US`` dictionary accepts either the stem as
    written or its lower-cased form; checking the original casing keeps
    words such as "I've" from being "corrected". Otherwise the stem is
    replaced by the first suggestion from the term list in
    ``finetune_data/dict_freq.txt``, if there is one. Trailing punctuation
    is always preserved. Segments are modified in place; nothing is
    returned.
    """
    # Imported inside the method, so pyenchant is only required when
    # spell checking actually runs.
    import enchant

    english_dict = enchant.Dict('en_US')
    term_spell_dict = enchant.PyPWL('finetune_data/dict_freq.txt')

    for seg in self.segments:
        ready_words = seg.source_text.split(" ")
        for i, word in enumerate(ready_words):
            real_word, pos = self.get_real_word(word)
            if not real_word:
                # Empty tokens (double spaces, bare punctuation) cannot be
                # spell-checked: enchant raises ValueError on "".
                continue
            if english_dict.check(word[:pos]) or english_dict.check(real_word):
                continue
            suggestions = term_spell_dict.suggest(real_word)
            if suggestions:  # relaxed check: keep the word if no term is close
                # Swap only the stem; the punctuation suffix stays as is.
                ready_words[i] = suggestions[0] + word[pos:]
        seg.source_text = " ".join(ready_words)
280
+
281
def spell_correction(self, word: str, arg: int) -> str:
    """Translate or spell-correct a single token, keeping its punctuation.

    Args:
        word: One token, optionally ending in one of ``. , ! ?`` and/or a
            newline.
        arg: ``0`` for term-translation mode (look the lower-cased stem up
            in ``finetune_data/dict_enzh.csv``), ``1`` for spell-check mode
            (replace a stem rejected by the ``en_US`` dictionary with the
            first suggestion from ``finetune_data/dict_freq.txt``).

    Returns:
        The token with its stem replaced when a replacement was found,
        otherwise ``word`` unchanged. Any other ``arg`` prints a message
        and returns ``word`` unchanged.
    """
    if arg not in (0, 1):
        print('only 0 or 1 for argument')
        return word

    # Split the token into stem and trailing punctuation/newline suffix.
    stem_end = len(word)
    if word.endswith("\n"):
        stem_end -= 1
    if word[:stem_end].endswith((".", ",", "!", "?")):
        stem_end -= 1
    stem, suffix = word[:stem_end], word[stem_end:]
    real_word = stem.lower()
    if not real_word:
        # Nothing to look up; enchant also rejects empty strings.
        return word

    if arg == 0:  # term translate mode
        with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
            # Skip blank or malformed rows instead of failing on them.
            term_enzh_dict = {row[0]: row[1] for row in reader(f) if len(row) >= 2}
        if real_word in term_enzh_dict:
            return term_enzh_dict[real_word] + suffix
        return word

    # term spell check mode
    import enchant

    english_dict = enchant.Dict('en_US')
    term_spell_dict = enchant.PyPWL('finetune_data/dict_freq.txt')
    # Accept the word if either its original or lower-cased form is valid.
    if english_dict.check(stem) or english_dict.check(real_word):
        return word
    suggestions = term_spell_dict.suggest(real_word)
    if suggestions:  # relaxed check: keep the word if no term is close
        return suggestions[0] + suffix
    return word
316
+
317
def get_real_word(self, word: str):
    """Split a token into its lower-cased stem and the stem's length.

    A trailing newline and one trailing punctuation mark (``. , ! ?``)
    before it are treated as suffix, so "Forge.\\n", "Forge!\\n", "Forge,"
    and "Forge" all yield the stem "forge".

    Args:
        word: A single token from a subtitle line.

    Returns:
        A ``(real_word, pos)`` tuple: ``real_word`` is the lower-cased
        stem and ``pos`` is the index where the suffix starts, so
        ``word[:pos]`` is the stem as written and ``word[pos:]`` the suffix.
    """
    pos = len(word)
    if word.endswith("\n"):
        pos -= 1
    # At most one punctuation mark is stripped, matching the single-mark
    # handling used for tokens without a newline.
    if word[:pos].endswith((".", ",", "!", "?")):
        pos -= 1
    return word[:pos].lower(), pos
finetune_data/{dict.csv → dict_enzh.csv} RENAMED
@@ -1,4 +1,4 @@
1
- barracks,兵营
2
  engineering bay,工程站
3
  forge,锻炉
4
  blink,闪现
 
1
+ barracks,兵营
2
  engineering bay,工程站
3
  forge,锻炉
4
  blink,闪现
finetune_data/dict_freq.csv ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks,1
2
+ engineering bay,1
3
+ forge,1
4
+ blink,1
5
+ evolution chamber,1
6
+ cybernetics core,1
7
+ enhanced shockwaves,1
8
+ gravitic boosters,1
9
+ armory,1
10
+ robotics bay,1
11
+ twilight council,1
12
+ fusion core,1
13
+ fleet beacon,1
14
+ factory,1
15
+ ghost academy,1
16
+ infestation pit,1
17
+ robotics facility,1
18
+ stargate,1
19
+ starport,1
20
+ archon,1
21
+ smart servos,1
22
+ gateway,1
23
+ warpgate,1
24
+ immortal,1
25
+ zealot,1
26
+ nydus network,1
27
+ nydus worm,1
28
+ hydralisk,1
29
+ grooved spines,1
30
+ muscular augments,1
31
+ hydralisk den,1
32
+ planetary fortress,1
33
+ battle cruiser,1
34
+ weapon refit,1
35
+ brood lord,1
36
+ greater spire,1
37
+ anabolic synthesis,1
38
+ cyclone,1
39
+ bunker,1
40
+ lurker,1
41
+ seismic spines,1
42
+ adaptive talons,1
43
+ lurker den,1
44
+ widow mine,1
45
+ ground carapace,1
46
+ high templar,1
47
+ shield battery,1
48
+ observer,1
49
+ baneling,1
50
+ centrifugal hooks,1
51
+ baneling nest,1
52
+ raven,1
53
+ combat shield,1
54
+ shield,1
55
+ lair,1
56
+ missile turret,1
57
+ spore crawler,1
58
+ supply depot,1
59
+ overlord,1
60
+ pneumatized carapace,1
61
+ mutalisk,1
62
+ spire,1
63
+ viper,1
64
+ flyer attacks,1
65
+ flyer carapace,1
66
+ tempest,1
67
+ tectonic destabilizers,1
68
+ phoenix,1
69
+ anion pulse-crystals,1
70
+ corruptor,1
71
+ infestor,1
72
+ pathogen glands,1
73
+ zergling,1
74
+ spawning pool,1
75
+ metabolic boost,1
76
+ spine crawler,1
77
+ marauder,1
78
+ ghost,1
79
+ arm silo with nuke,1
80
+ carrier,1
81
+ hellion,1
82
+ hellbat,1
83
+ ravager,1
84
+ nexus,1
85
+ hatchery,1
86
+ command center,1
87
+ neosteel armor,1
88
+ hi-sec auto tracking,1
89
+ ship weapons,1
90
+ charge,1
91
+ liberator,1
92
+ advanced ballistics,1
93
+ melee attacks,1
94
+ colossus,1
95
+ extended thermal lance,1
96
+ creep tumor,1
97
+ tech lab,1
98
+ air armor,1
99
+ air weapons,1
100
+ adrenal glands,1
101
+ mule,1
102
+ infernal pre-igniter,1
103
+ thor,1
104
+ warp prism,1
105
+ gravitic drive,1
106
+ dragoon,1
107
+ cocoon,1
108
+ larva,1
109
+ mothership,1
110
+ burrow,1
111
+ changeling,1
112
+ ultralisk,1
113
+ chitinous plating,1
114
+ ultralisk cavern,1
115
+ drone,1
116
+ scv,1
117
+ queen,1
118
+ banshee,1
119
+ hyperflight rotors,1
120
+ photon cannon,1
121
+ missile attacks,1
122
+ assimilator,1
123
+ extractor,1
124
+ refinery,1
125
+ roach,1
126
+ marine,1
127
+ sensor tower,1
128
+ infantry armor,1
129
+ infantry weapons,1
130
+ hive,1
131
+ psionic storm,1
132
+ templar archives,1
133
+ sentry,1
134
+ ground armor,1
135
+ ground weapons,1
136
+ adept,1
137
+ resonating glaives,1
138
+ reactor,1
139
+ pylon,1
140
+ reaper,1
141
+ drilling claws,1
142
+ swarm host,1
143
+ mag-field accelerator,1
144
+ siege tank,1
145
+ probe,1
146
+ corvid reactor,1
147
+ neural parasite,1
148
+ viking,1
149
+ oracle,1
150
+ broodling,1
151
+ locust,1
152
+ mothership core,1
153
+ orbital command,1
154
+ stimpack,1
155
+ void ray,1
156
+ flux vanes,1
157
+ overseer,1
158
+ ignite afterburners,1
159
+ dark templar,1
160
+ shadow stride,1
161
+ dark shrine,1
162
+ cloaking field,1
163
+ personal cloaking,1
164
+ medivac dropship,1
165
+ vehicle and ship plating,1
166
+ vehicle weapons,1
167
+ war hound,1
168
+ roach warren,1
169
+ tunneling claws,1
170
+ glial reconstitution,1
171
+ concussive shells,1
172
+ stalker,1
173
+ disruptor,1
174
+ zerg,1
175
+ protoss,1
176
+ terran,1
finetune_data/dict_freq.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks
2
+ engineering bay
3
+ forge
4
+ blink
5
+ evolution chamber
6
+ cybernetics core
7
+ enhanced shockwaves
8
+ gravitic boosters
9
+ armory
10
+ robotics bay
11
+ twilight council
12
+ fusion core
13
+ fleet beacon
14
+ factory
15
+ ghost academy
16
+ infestation pit
17
+ robotics facility
18
+ stargate
19
+ starport
20
+ archon
21
+ smart servos
22
+ gateway
23
+ warpgate
24
+ immortal
25
+ zealot
26
+ nydus network
27
+ nydus worm
28
+ hydralisk
29
+ grooved spines
30
+ muscular augments
31
+ hydralisk den
32
+ planetary fortress
33
+ battle cruiser
34
+ weapon refit
35
+ brood lord
36
+ greater spire
37
+ anabolic synthesis
38
+ cyclone
39
+ bunker
40
+ lurker
41
+ seismic spines
42
+ adaptive talons
43
+ lurker den
44
+ widow mine
45
+ ground carapace
46
+ high templar
47
+ shield battery
48
+ observer
49
+ baneling
50
+ centrifugal hooks
51
+ baneling nest
52
+ raven
53
+ combat shield
54
+ shield
55
+ lair
56
+ missile turret
57
+ spore crawler
58
+ supply depot
59
+ overlord
60
+ pneumatized carapace
61
+ mutalisk
62
+ spire
63
+ viper
64
+ flyer attacks
65
+ flyer carapace
66
+ tempest
67
+ tectonic destabilizers
68
+ phoenix
69
+ anion pulse-crystals
70
+ corruptor
71
+ infestor
72
+ pathogen glands
73
+ zergling
74
+ spawning pool
75
+ metabolic boost
76
+ spine crawler
77
+ marauder
78
+ ghost
79
+ arm silo with nuke
80
+ carrier
81
+ hellion
82
+ hellbat
83
+ ravager
84
+ nexus
85
+ hatchery
86
+ command center
87
+ neosteel armor
88
+ hi-sec auto tracking
89
+ ship weapons
90
+ charge
91
+ liberator
92
+ advanced ballistics
93
+ melee attacks
94
+ colossus
95
+ extended thermal lance
96
+ creep tumor
97
+ tech lab
98
+ air armor
99
+ air weapons
100
+ adrenal glands
101
+ mule
102
+ infernal pre-igniter
103
+ thor
104
+ warp prism
105
+ gravitic drive
106
+ dragoon
107
+ cocoon
108
+ larva
109
+ mothership
110
+ burrow
111
+ changeling
112
+ ultralisk
113
+ chitinous plating
114
+ ultralisk cavern
115
+ drone
116
+ scv
117
+ queen
118
+ banshee
119
+ hyperflight rotors
120
+ photon cannon
121
+ missile attacks
122
+ assimilator
123
+ extractor
124
+ refinery
125
+ roach
126
+ marine
127
+ sensor tower
128
+ infantry armor
129
+ infantry weapons
130
+ hive
131
+ psionic storm
132
+ templar archives
133
+ sentry
134
+ ground armor
135
+ ground weapons
136
+ adept
137
+ resonating glaives
138
+ reactor
139
+ pylon
140
+ reaper
141
+ drilling claws
142
+ swarm host
143
+ mag-field accelerator
144
+ siege tank
145
+ probe
146
+ corvid reactor
147
+ neural parasite
148
+ viking
149
+ oracle
150
+ broodling
151
+ locust
152
+ mothership core
153
+ orbital command
154
+ stimpack
155
+ void ray
156
+ flux vanes
157
+ overseer
158
+ ignite afterburners
159
+ dark templar
160
+ shadow stride
161
+ dark shrine
162
+ cloaking field
163
+ personal cloaking
164
+ medivac dropship
165
+ vehicle and ship plating
166
+ vehicle weapons
167
+ war hound
168
+ roach warren
169
+ tunneling claws
170
+ glial reconstitution
171
+ concussive shells
172
+ stalker
173
+ disruptor
174
+ zerg
175
+ protoss
176
+ terran
177
+ starcraft