Eason Lu committed on
Commit
5f10ef2
1 Parent(s): 01a25ac

solve milliseconds error

Browse files

Former-commit-id: 3820bdd8e592b1154620f906a602ea0d7a3a5373

Files changed (2) hide show
  1. SRT.py +15 -3
  2. pipeline.py +8 -48
SRT.py CHANGED
@@ -8,8 +8,18 @@ class SRT_segment(object):
8
  def __init__(self, *args) -> None:
9
  if isinstance(args[0], dict):
10
  segment = args[0]
11
- self.start_time_str = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
12
- self.end_time_str = str(0)+str(timedelta(seconds=int(segment['end'])))+',000'
 
 
 
 
 
 
 
 
 
 
13
  self.source_text = segment['text']
14
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
15
  self.translation = ""
@@ -66,7 +76,7 @@ class SRT_script():
66
  return final_seg
67
 
68
  def form_whole_sentence(self):
69
- merge_list = [] # a list of indices that should be merged e.g. [[0], [2, 3, 4], [5, 6], [7]]
70
  sentence = []
71
  for i, seg in enumerate(self.segments):
72
  if seg.source_text[-1] == '.':
@@ -100,6 +110,8 @@ class SRT_script():
100
  if i < len(lines):
101
  if "(Note:" in lines[i]: # to avoid note
102
  lines.remove(lines[i])
 
 
103
  seg.translation = lines[i].split(":")[1]
104
  pass
105
 
 
8
  def __init__(self, *args) -> None:
9
  if isinstance(args[0], dict):
10
  segment = args[0]
11
+ start_ms = int((segment['start']*100)%100*10)
12
+ end_ms = int((segment['end']*100)%100*10)
13
+ start_time = str(timedelta(seconds=int(segment['start']), milliseconds=start_ms))
14
+ end_time = str(timedelta(seconds=int(segment['end']), milliseconds=end_ms))
15
+ if start_ms == 0:
16
+ self.start_time_str = str(0)+start_time.split('.')[0]+',000'
17
+ else:
18
+ self.start_time_str = str(0)+start_time.split('.')[0]+','+start_time.split('.')[1][:3]
19
+ if end_ms == 0:
20
+ self.end_time_str = str(0)+end_time.split('.')[0]+',000'
21
+ else:
22
+ self.end_time_str = str(0)+end_time.split('.')[0]+','+end_time.split('.')[1][:3]
23
  self.source_text = segment['text']
24
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
25
  self.translation = ""
 
76
  return final_seg
77
 
78
  def form_whole_sentence(self):
79
+ merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
80
  sentence = []
81
  for i, seg in enumerate(self.segments):
82
  if seg.source_text[-1] == '.':
 
110
  if i < len(lines):
111
  if "(Note:" in lines[i]: # to avoid note
112
  lines.remove(lines[i])
113
+ if i == len(lines) - 1:
114
+ break
115
  seg.translation = lines[i].split(":")[1]
116
  pass
117
 
pipeline.py CHANGED
@@ -89,7 +89,6 @@ srt_file_en = args.srt_file
89
 
90
  if srt_file_en is not None:
91
  srt = SRT_script.parse_from_srt_file(srt_file_en)
92
- script_input = srt.get_source_only()
93
  else:
94
  # using whisper to perform speech-to-text and save it in <video name>_en.srt under RESULT PATH.
95
  srt_file_en = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
@@ -115,9 +114,7 @@ else:
115
  # transcript.to_srt_vtt(srt_file_en)
116
  transcript = transcript.to_dict()
117
  srt = SRT_script(transcript['segments']) # read segments to SRT class
118
- srt.form_whole_sentence()
119
- script_input = srt.get_source_only()
120
- srt.write_srt_file_src(srt_file_en)
121
  #Write SRT file
122
 
123
  # from whisper.utils import WriteSRT
@@ -126,55 +123,18 @@ else:
126
  # writer.write_result(transcript, f)
127
  else:
128
  srt = SRT_script.parse_from_srt_file(srt_file_en)
129
- script_input = srt.get_source_only()
 
 
 
 
 
130
 
131
  if not args.only_srt:
132
  from srt2ass import srt2ass
133
  assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
134
  print('ASS subtitle saved as: ' + assSub_en)
135
-
136
- # # force translate the starcraft2 term into chinese according to the dict
137
- # # TODO: shortcut translation i.e. VA, ob
138
- # # TODO: variety of translation
139
- # from csv import reader
140
- # import re
141
-
142
- # # read dict
143
- # with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
144
- # csv_reader = reader(f)
145
- # term_dict = {rows[0]:rows[1] for rows in csv_reader}
146
-
147
- # def clean_timestamp(lines):
148
- # new_lines = []
149
- # strinfo = re.compile('[0-9]+\n.{25},[0-9]{3}') # 注意用4个\\\\来替换\
150
- # new_lines = strinfo.sub('_-_', lines)
151
- # print(new_lines)
152
- # return new_lines
153
-
154
-
155
- # ready_lines = re.sub('\n', '\n ', script_input)
156
- # ready_words = ready_lines.split(" ")
157
- # i = 0
158
- # while i < len(ready_words):
159
- # word = ready_words[i]
160
- # if word[-2:] == ".\n" :
161
- # if word[:-2].lower() in term_dict :
162
- # new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
163
- # ready_words[i] = new_word
164
- # else :
165
- # word += ' '
166
- # ready_words[i] = word
167
- # elif word.lower() in term_dict :
168
- # new_word = word.replace(word,term_dict.get(word.lower())) + ' '
169
- # ready_words[i] = new_word
170
- # else :
171
- # word += " "
172
- # ready_words[i]= word
173
- # i += 1
174
-
175
- # script_input_withForceTerm = re.sub('\n ', '\n', "".join(ready_words))
176
-
177
- # srt.correct_with_force_term()
178
 
179
  # Split the video script by sentences and create chunks within the token limit
180
  def script_split(script_in, chunk_size = 1000):
 
89
 
90
  if srt_file_en is not None:
91
  srt = SRT_script.parse_from_srt_file(srt_file_en)
 
92
  else:
93
  # using whisper to perform speech-to-text and save it in <video name>_en.srt under RESULT PATH.
94
  srt_file_en = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
 
114
  # transcript.to_srt_vtt(srt_file_en)
115
  transcript = transcript.to_dict()
116
  srt = SRT_script(transcript['segments']) # read segments to SRT class
117
+
 
 
118
  #Write SRT file
119
 
120
  # from whisper.utils import WriteSRT
 
123
  # writer.write_result(transcript, f)
124
  else:
125
  srt = SRT_script.parse_from_srt_file(srt_file_en)
126
+
127
+ # srt preprocess
128
+ srt.form_whole_sentence()
129
+ srt.correct_with_force_term()
130
+ srt.write_srt_file_src(srt_file_en)
131
+ script_input = srt.get_source_only()
132
 
133
  if not args.only_srt:
134
  from srt2ass import srt2ass
135
  assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
136
  print('ASS subtitle saved as: ' + assSub_en)
137
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  # Split the video script by sentences and create chunks within the token limit
140
  def script_split(script_in, chunk_size = 1000):