Spaces:
Sleeping
Sleeping
Eason Lu
committed on
Commit
·
f1a218d
1
Parent(s):
007de42
solve empty time stamp;working on split
Browse files
Former-commit-id: 5d21ec7ff41e2e8fb5bfa670c885d5b5e0afb352
SRT.py
CHANGED
@@ -6,28 +6,30 @@ class SRT_segment(object):
|
|
6 |
def __init__(self, *args) -> None:
|
7 |
if isinstance(args[0], dict):
|
8 |
segment = args[0]
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
19 |
else:
|
20 |
-
self.start_time_str = str(0)+start_time.split('.')[0]+','+start_time.split('.')[1][:3]
|
21 |
-
if end_ms == 0:
|
22 |
-
self.end_time_str = str(0)+end_time.split('.')[0]+',000'
|
23 |
else:
|
24 |
-
self.end_time_str = str(0)+end_time.split('.')[0]+','+end_time.split('.')[1][:3]
|
25 |
self.source_text = segment['text'][1:]
|
26 |
self.duration = f"{self.start_time_str} --> {self.end_time_str}"
|
27 |
self.translation = ""
|
28 |
|
29 |
elif isinstance(args[0], list):
|
30 |
-
self.source_text = args[0][2]
|
31 |
self.duration = args[0][1]
|
32 |
self.start_time_str = self.duration.split(" --> ")[0]
|
33 |
self.end_time_str = self.duration.split(" --> ")[1]
|
@@ -122,12 +124,44 @@ class SRT_script():
|
|
122 |
#print(lines[i])
|
123 |
pass
|
124 |
|
125 |
-
def split_seg(self,
|
126 |
# TODO: evenly split seg to 2 parts and add new seg into self.segments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
pass
|
128 |
|
129 |
def check_len_and_split(self, threshold):
    """Placeholder for length-based splitting; currently does nothing.

    ``threshold`` is accepted but not read yet. The method always
    returns ``None`` and leaves the object untouched.
    """
    # TODO: if sentence length >= threshold, split this segment into two
    return None
|
132 |
|
133 |
def get_source_only(self):
|
|
|
6 |
def __init__(self, *args) -> None:
|
7 |
if isinstance(args[0], dict):
|
8 |
segment = args[0]
|
9 |
+
self.start = segment['start']
|
10 |
+
self.end = segment['end']
|
11 |
+
self.start_ms = int((segment['start']*100)%100*10)
|
12 |
+
self.end_ms = int((segment['end']*100)%100*10)
|
13 |
+
|
14 |
+
if self.start_ms == self.end_ms and int(segment['start']) == int(segment['end']): # avoid empty time stamp
|
15 |
+
self.end_ms+=500
|
16 |
+
|
17 |
+
self.start_time = timedelta(seconds=int(segment['start']), milliseconds=self.start_ms)
|
18 |
+
self.end_time = timedelta(seconds=int(segment['end']), milliseconds=self.end_ms)
|
19 |
+
if self.start_ms == 0:
|
20 |
+
self.start_time_str = str(0)+str(self.start_time).split('.')[0]+',000'
|
21 |
else:
|
22 |
+
self.start_time_str = str(0)+str(self.start_time).split('.')[0]+','+self.start_time.split('.')[1][:3]
|
23 |
+
if self.end_ms == 0:
|
24 |
+
self.end_time_str = str(0)+str(self.end_time).split('.')[0]+',000'
|
25 |
else:
|
26 |
+
self.end_time_str = str(0)+str(self.end_time).split('.')[0]+','+self.end_time.split('.')[1][:3]
|
27 |
self.source_text = segment['text'][1:]
|
28 |
self.duration = f"{self.start_time_str} --> {self.end_time_str}"
|
29 |
self.translation = ""
|
30 |
|
31 |
elif isinstance(args[0], list):
|
32 |
+
self.source_text = args[0][2]
|
33 |
self.duration = args[0][1]
|
34 |
self.start_time_str = self.duration.split(" --> ")[0]
|
35 |
self.end_time_str = self.duration.split(" --> ")[1]
|
|
|
124 |
#print(lines[i])
|
125 |
pass
|
126 |
|
127 |
+
def split_seg(self, seg_idx):
    """Split the segment at ``seg_idx`` in two and store both halves.

    The source text and the translation are each cut at their middle ASCII
    comma (the comma itself is dropped) and the time range is cut at its
    midpoint. The original entry in ``self.segments`` is replaced in place
    by the two new ``SRT_segment`` objects, so every later segment moves up
    by one index.

    If the source text contains no comma there is nothing to cut on and
    ``self.segments`` is left untouched. If only the translation lacks a
    comma (for example because it is still empty), the whole translation
    stays with the first half and the second half gets an empty one.

    Args:
        seg_idx: index into ``self.segments`` of the segment to split;
            negative indices count from the end.

    Raises:
        IndexError: if ``seg_idx`` is out of range.
    """
    # NOTE(review): this reads seg.start / seg.end, which the dict-based
    # SRT_segment constructor sets; the list-based branch visible alongside
    # it does not appear to set them -- confirm before calling this on
    # segments loaded that way.
    # NOTE(review): the dict-based constructor calls .split('.') directly on
    # a timedelta when the millisecond part is non-zero; the midpoint
    # produced below will usually take that path, so that call needs to be
    # str(...).split('.') for this method to work end to end.
    seg = self.segments[seg_idx]

    def cut_at_middle_comma(text):
        """Return (head, tail) around the middle comma, or None if none."""
        # Only the ASCII comma is recognised, as before; full-width commas
        # in translated text are not treated as split points.
        commas = [pos for pos, char in enumerate(text) if char == ',']
        if not commas:
            return None
        # len // 2 is always a valid index: the exact middle comma for an
        # odd count, the upper-middle one for an even count.
        cut = commas[len(commas) // 2]
        return text[:cut], text[cut + 1:]

    src_parts = cut_at_middle_comma(seg.source_text)
    if src_parts is None:
        return
    trans_parts = cut_at_middle_comma(seg.translation) or (seg.translation, "")

    midpoint = seg.start + (seg.end - seg.start) / 2
    bounds = ((seg.start, midpoint), (midpoint, seg.end))

    halves = []
    for src, trans, (start, end) in zip(src_parts, trans_parts, bounds):
        half = SRT_segment({'text': src, 'start': start, 'end': end})
        # The dict constructor drops the first character of 'text', which
        # would eat a real letter here, so set the text explicitly.
        half.source_text = src.strip()
        half.translation = trans.strip()
        halves.append(half)

    # Normalise a negative index so the slice below targets exactly the
    # one element that was read above.
    if seg_idx < 0:
        seg_idx += len(self.segments)
    self.segments[seg_idx:seg_idx + 1] = halves
|
161 |
|
162 |
def check_len_and_split(self, threshold):
    """Placeholder for length-based splitting; currently does nothing.

    ``threshold`` is accepted but not read yet. The method always
    returns ``None`` and leaves the object untouched.
    """
    # TODO: if sentence length >= threshold, split this segment into two
    return None
|
166 |
|
167 |
def get_source_only(self):
|