DWizard committed on
Commit
66e606c
1 Parent(s): cf5f1c9

rewrite forceTerm replacement

Browse files

Former-commit-id: e6472c129f985724e239c662cae9064e96883dde

Files changed (1) hide show
  1. SRT.py +30 -2
SRT.py CHANGED
@@ -1,6 +1,8 @@
1
  from datetime import timedelta
2
  import os
3
  import whisper
 
 
4
 
5
  class SRT_segment(object):
6
  def __init__(self, *args) -> None:
@@ -103,9 +105,35 @@ class SRT_script():
103
  f.write(self.form_bilingual_str())
104
  pass
105
 
106
- def correct_with_force_term():
107
- # force term correction
 
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  pass
110
 
111
 
 
1
  from datetime import timedelta
2
  import os
3
  import whisper
4
+ from csv import reader
5
+ import re
6
 
7
  class SRT_segment(object):
8
  def __init__(self, *args) -> None:
 
105
  f.write(self.form_bilingual_str())
106
  pass
107
 
108
def correct_with_force_term(self):
    """Replace dictionary terms in every segment's ``source_text`` in place.

    The term dictionary is read from ``finetune_data/dict.csv`` (UTF-8):
    the first column of each row is the term to look for and the second
    column is its replacement.  Matching is case-insensitive, so keys are
    lower-cased on load.  Rows with fewer than two columns or an empty key
    are skipped.

    Each segment's text is tokenised on single spaces, with a token
    boundary also placed after every newline.  A token is replaced when
    it matches a term exactly, or when it matches after trailing sentence
    punctuation / whitespace (e.g. ``"term."``, ``"term,"``, ``"term.\\n"``)
    is peeled off; the peeled suffix is re-attached unchanged.  All other
    text, including spacing and line breaks, is preserved, so a segment
    without any term is left byte-identical.

    Raises:
        OSError: if ``finetune_data/dict.csv`` cannot be opened
            (``FileNotFoundError`` when it does not exist).
    """
    # TODO: shortcut translation i.e. VA, ob
    # TODO: variety of translation

    # Load the term dictionary.  ``newline=''`` is what the csv module
    # requires so that quoted fields containing line breaks parse correctly.
    with open("finetune_data/dict.csv", 'r', encoding='utf-8', newline='') as f:
        term_dict = {
            row[0].lower(): row[1]
            for row in reader(f)
            if len(row) >= 2 and row[0]
        }

    # Trailing punctuation and/or whitespace that may be glued to a term.
    trailing = re.compile(r"[.,!?;:]*\s*\Z")

    for seg in self.segments:
        # Insert a space after each newline so that the word following a
        # line break becomes its own token; this is undone after joining.
        ready_words = re.sub('\n', '\n ', seg.source_text).split(" ")

        for i, word in enumerate(ready_words):
            lowered = word.lower()
            if lowered in term_dict:
                ready_words[i] = term_dict[lowered]
                continue

            # Fall back to matching the token without its trailing
            # punctuation / newline, keeping that suffix intact.
            cut = trailing.search(word).start()
            core = word[:cut]
            if core and core.lower() in term_dict:
                ready_words[i] = term_dict[core.lower()] + word[cut:]

        # Join with single spaces (no trailing space is added) and remove
        # the helper space that was inserted after each newline.
        seg.source_text = re.sub('\n ', '\n', " ".join(ready_words))

    # Debug output kept from the original implementation.
    print(self)
138
 
139