Maximofn committed on
Commit
61afcdf
1 Parent(s): 8eae243
add_subtitles_to_video.py CHANGED
@@ -8,15 +8,25 @@ import os
8
  DEBUG = False
9
 
10
  COLOR_BLUE = (255, 0, 0)
 
11
  COLOR_GREEN = (0, 255, 0)
 
12
  COLOR_RED = (0, 0, 255)
 
13
  COLOR_YELLOW = (0, 255, 255)
 
14
  COLOR_WHITE = (255, 255, 255)
 
15
  COLOR_BLACK = (0, 0, 0)
16
- COLOR_BROWN = (0, 255, 255)
 
 
17
  COLOR_MAGENTA = (255, 0, 255)
 
18
  COLOR_ORANGE = (0, 165, 255)
 
19
  COLOR_PURPLE = (128, 0, 128)
 
20
  COLOR_GRAY = (128, 128, 128)
21
 
22
  def replace_characters_that_opencv_cant_show(text):
@@ -25,13 +35,13 @@ def replace_characters_that_opencv_cant_show(text):
25
  text = text.replace("í", "i")
26
  text = text.replace("ó", "o")
27
  text = text.replace("ú", "u")
28
- text = text.replace("ñ", "n")
29
  text = text.replace("Á", "A")
30
  text = text.replace("É", "E")
31
  text = text.replace("Í", "I")
32
  text = text.replace("Ó", "O")
33
  text = text.replace("Ú", "U")
34
- text = text.replace("Ñ", "N")
35
  text = text.replace("\n", "")
36
  text = text.replace("¿", "?")
37
  text = text.replace("¡", "!")
@@ -47,30 +57,40 @@ def remove_speaker_text(text):
47
  text = text[prefix_len:] # Remove the matched text from the beginning
48
  return text, speaker
49
 
50
- def get_filter_text_and_speaker(text, color):
51
  text, speaker = remove_speaker_text(text)
52
  if speaker is not None:
53
  if speaker == 0:
54
  color = COLOR_GREEN
 
55
  elif speaker == 1:
56
  color = COLOR_BLUE
 
57
  elif speaker == 2:
58
  color = COLOR_RED
 
59
  elif speaker == 3:
60
  color = COLOR_YELLOW
 
61
  elif speaker == 4:
62
  color = COLOR_WHITE
 
63
  elif speaker == 5:
64
  color = COLOR_BLACK
 
65
  elif speaker == 6:
66
  color = COLOR_BROWN
 
67
  elif speaker == 7:
68
  color = COLOR_MAGENTA
 
69
  elif speaker == 8:
70
  color = COLOR_ORANGE
 
71
  elif speaker == 9:
72
  color = COLOR_PURPLE
73
- return text, color
 
74
 
75
  def create_dict_of_transcription(transcription_file):
76
  transcription_dict = {}
@@ -85,7 +105,7 @@ def create_dict_of_transcription(transcription_file):
85
  # Get start time (dd:dd:dd,ddd) and end time (dd:dd:dd,ddd)
86
  start, end = line.split(" --> ")
87
  # Add key to dictionary
88
- transcription_dict[start] = ""
89
 
90
  # if line is a number and carriage continue
91
  elif re.match(r"\d+$", line):
@@ -99,11 +119,13 @@ def create_dict_of_transcription(transcription_file):
99
  else:
100
  # Remove characters that opencv can't show
101
  line = replace_characters_that_opencv_cant_show(line)
102
- transcription_dict[start] += f"{line}\n"
103
 
104
  return transcription_dict
105
 
106
  def hour_minute_seconds_miliseconds_to_seconds(time):
 
 
107
  hours, minutes, seconds_miliseconds = time.split(":")
108
  seconds, miliseconds = seconds_miliseconds.split(",")
109
  seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(miliseconds) / 1000
@@ -131,9 +153,12 @@ def search_transcription_in_dict_of_transcription(transcription_dict, seconds):
131
  key_seconds = hour_minute_seconds_miliseconds_to_seconds(key_hmsms)
132
  next_key_seconds = hour_minute_seconds_miliseconds_to_seconds(next_key_hmsms)
133
  if key_seconds <= seconds and seconds < next_key_seconds:
134
- return transcription_dict[key_hmsms]
 
 
135
  else:
136
  continue
 
137
 
138
  def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
139
  text_size, _ = cv2.getTextSize(text, fontFace, fontScale, thickness)
@@ -167,7 +192,8 @@ def add_subtitles_to_video(transcription_dict, input_video_file):
167
  fontFace = cv2.FONT_HERSHEY_DUPLEX
168
  fontScale = 1
169
  thickness = 2
170
- color = (0, 255, 0)
 
171
  lineType = cv2.LINE_AA
172
  bottomLeftOrigin = False
173
 
@@ -180,53 +206,58 @@ def add_subtitles_to_video(transcription_dict, input_video_file):
180
 
181
  # Add the text to the frame
182
  current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
183
- text = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
184
  if text is not None:
185
- if text[-1] == "\n":
186
- text = text[:-1]
187
- if text[-1] == " ":
188
- text = text[:-1]
 
189
  if old_text != text:
190
  old_text = text
191
  text_length = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
192
- if text_length > captured_video_width:
193
- necesary_rows = int(text_length // (captured_video_width-100)+1)
194
- words = text.split(" ")
195
- number_of_words = len(words)
196
- words_per_row = int(number_of_words // necesary_rows)
197
- text = ""
198
- text_position = (50, int(captured_video_height)-50*(necesary_rows+1))
199
- rectangle_point1 = (40, text_position[1]-30)
200
- for i in range(number_of_words):
201
- if i % words_per_row == 0 and i != 0:
202
- text, color = get_filter_text_and_speaker(text, color)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
204
  if length_of_text > 10:
205
  rectangle_point2 = (length_of_text+50, text_position[1]+10)
206
- cv2.rectangle(frame, rectangle_point1, rectangle_point2, COLOR_GRAY, -1, cv2.LINE_AA, 0)
207
  cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
208
- text = ""
209
- text_position = (50, text_position[1]+50)
210
  rectangle_point1 = (40, text_position[1]-30)
211
- text += words[i] + " "
212
- # Add the last words
213
- text, color = get_filter_text_and_speaker(text, color)
214
- length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
215
- if length_of_text > 10:
216
- rectangle_point2 = (length_of_text+50, text_position[1]+10)
217
- cv2.rectangle(frame, rectangle_point1, rectangle_point2, COLOR_GRAY, -1, cv2.LINE_AA, 0)
218
- cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
219
- else:
220
- text_position = (50, int(captured_video_height)-50)
221
- rectangle_point1 = (40, text_position[1]-30)
222
- rectangle_point2 = (int(captured_video_width)-50, text_position[1]+10)
223
- if text is not None:
224
- text, color = get_filter_text_and_speaker(text, color)
225
- length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
226
- if length_of_text > 10:
227
- rectangle_point2 = (length_of_text+50, text_position[1]+10)
228
- cv2.rectangle(frame, rectangle_point1, rectangle_point2, COLOR_GRAY, -1, cv2.LINE_AA, 0)
229
- cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
230
 
231
  # Update the progress bar
232
  progress_bar.update(1)
 
8
  DEBUG = False
9
 
10
  COLOR_BLUE = (255, 0, 0)
11
+ BACKGROUND_FOR_COLOR_BLUE = (255, 255, 0)
12
  COLOR_GREEN = (0, 255, 0)
13
+ BACKGROUND_FOR_COLOR_GREEN = (255, 0, 255)
14
  COLOR_RED = (0, 0, 255)
15
+ BACKGROUND_FOR_COLOR_RED = (255, 255, 0)
16
  COLOR_YELLOW = (0, 255, 255)
17
+ BACKGROUND_FOR_COLOR_YELLOW = (255, 0, 0)
18
  COLOR_WHITE = (255, 255, 255)
19
+ BACKGROUND_FOR_COLOR_WHITE = (128, 128, 128)
20
  COLOR_BLACK = (0, 0, 0)
21
+ BACKGROUND_FOR_COLOR_BLACK = (128, 128, 128)
22
+ COLOR_BROWN = (202, 221, 234)
23
+ BACKGROUND_FOR_COLOR_BROWN = (234, 215, 202)
24
  COLOR_MAGENTA = (255, 0, 255)
25
+ BACKGROUND_FOR_COLOR_MAGENTA = (0, 255, 0)
26
  COLOR_ORANGE = (0, 165, 255)
27
+ BACKGROUND_FOR_COLOR_ORANGE = (255, 90, 0)
28
  COLOR_PURPLE = (128, 0, 128)
29
+ BACKGROUND_FOR_COLOR_PURPLE = (127, 255, 127)
30
  COLOR_GRAY = (128, 128, 128)
31
 
32
  def replace_characters_that_opencv_cant_show(text):
 
35
  text = text.replace("í", "i")
36
  text = text.replace("ó", "o")
37
  text = text.replace("ú", "u")
38
+ text = text.replace("ñ", "nh")
39
  text = text.replace("Á", "A")
40
  text = text.replace("É", "E")
41
  text = text.replace("Í", "I")
42
  text = text.replace("Ó", "O")
43
  text = text.replace("Ú", "U")
44
+ text = text.replace("Ñ", "NH")
45
  text = text.replace("\n", "")
46
  text = text.replace("¿", "?")
47
  text = text.replace("¡", "!")
 
57
  text = text[prefix_len:] # Remove the matched text from the beginning
58
  return text, speaker
59
 
60
+ def get_filter_text_and_speaker(text, color, background):
61
  text, speaker = remove_speaker_text(text)
62
  if speaker is not None:
63
  if speaker == 0:
64
  color = COLOR_GREEN
65
+ background = BACKGROUND_FOR_COLOR_GREEN
66
  elif speaker == 1:
67
  color = COLOR_BLUE
68
+ background = BACKGROUND_FOR_COLOR_BLUE
69
  elif speaker == 2:
70
  color = COLOR_RED
71
+ background = BACKGROUND_FOR_COLOR_RED
72
  elif speaker == 3:
73
  color = COLOR_YELLOW
74
+ background = BACKGROUND_FOR_COLOR_YELLOW
75
  elif speaker == 4:
76
  color = COLOR_WHITE
77
+ background = BACKGROUND_FOR_COLOR_WHITE
78
  elif speaker == 5:
79
  color = COLOR_BLACK
80
+ background = BACKGROUND_FOR_COLOR_BLACK
81
  elif speaker == 6:
82
  color = COLOR_BROWN
83
+ background = BACKGROUND_FOR_COLOR_BROWN
84
  elif speaker == 7:
85
  color = COLOR_MAGENTA
86
+ background = BACKGROUND_FOR_COLOR_MAGENTA
87
  elif speaker == 8:
88
  color = COLOR_ORANGE
89
+ background = BACKGROUND_FOR_COLOR_ORANGE
90
  elif speaker == 9:
91
  color = COLOR_PURPLE
92
+ background = BACKGROUND_FOR_COLOR_PURPLE
93
+ return text, color, background
94
 
95
  def create_dict_of_transcription(transcription_file):
96
  transcription_dict = {}
 
105
  # Get start time (dd:dd:dd,ddd) and end time (dd:dd:dd,ddd)
106
  start, end = line.split(" --> ")
107
  # Add key to dictionary
108
+ transcription_dict[start] = {"transcription": "", "end": end}
109
 
110
  # if line is a number and carriage continue
111
  elif re.match(r"\d+$", line):
 
119
  else:
120
  # Remove characters that opencv can't show
121
  line = replace_characters_that_opencv_cant_show(line)
122
+ transcription_dict[start]["transcription"] += f"{line}\n"
123
 
124
  return transcription_dict
125
 
126
  def hour_minute_seconds_miliseconds_to_seconds(time):
127
+ if time is None:
128
+ return None
129
  hours, minutes, seconds_miliseconds = time.split(":")
130
  seconds, miliseconds = seconds_miliseconds.split(",")
131
  seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(miliseconds) / 1000
 
153
  key_seconds = hour_minute_seconds_miliseconds_to_seconds(key_hmsms)
154
  next_key_seconds = hour_minute_seconds_miliseconds_to_seconds(next_key_hmsms)
155
  if key_seconds <= seconds and seconds < next_key_seconds:
156
+ transcription = transcription_dict[key_hmsms]["transcription"]
157
+ end_time = transcription_dict[key_hmsms]["end"]
158
+ return transcription, end_time
159
  else:
160
  continue
161
+ return None, None
162
 
163
  def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
164
  text_size, _ = cv2.getTextSize(text, fontFace, fontScale, thickness)
 
192
  fontFace = cv2.FONT_HERSHEY_DUPLEX
193
  fontScale = 1
194
  thickness = 2
195
+ color = COLOR_WHITE
196
+ background = COLOR_GRAY
197
  lineType = cv2.LINE_AA
198
  bottomLeftOrigin = False
199
 
 
206
 
207
  # Add the text to the frame
208
  current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
209
+ text, end_time = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
210
  if text is not None:
211
+ if len(text) > 0:
212
+ if text[-1] == "\n":
213
+ text = text[:-1]
214
+ if text[-1] == " ":
215
+ text = text[:-1]
216
  if old_text != text:
217
  old_text = text
218
  text_length = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
219
+ current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
220
+ end_time_seconds = hour_minute_seconds_miliseconds_to_seconds(end_time)
221
+ if current_time is not None and end_time_seconds is not None:
222
+ if current_time <= end_time_seconds:
223
+ if text_length > captured_video_width:
224
+ necesary_rows = int(text_length // (captured_video_width-300)+1)
225
+ words = text.split(" ")
226
+ number_of_words = len(words)
227
+ words_per_row = int(number_of_words // necesary_rows)
228
+ text = ""
229
+ text_position = (50, int(captured_video_height)-50*(necesary_rows+1))
230
+ rectangle_point1 = (40, text_position[1]-30)
231
+ for i in range(number_of_words):
232
+ if i % words_per_row == 0 and i != 0:
233
+ text, color, background = get_filter_text_and_speaker(text, color, background)
234
+ length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
235
+ if length_of_text > 10:
236
+ rectangle_point2 = (length_of_text+50, text_position[1]+10)
237
+ cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
238
+ cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
239
+ text = ""
240
+ text_position = (50, text_position[1]+50)
241
+ rectangle_point1 = (40, text_position[1]-30)
242
+ text += words[i] + " "
243
+ # Add the last words
244
+ text, color, background = get_filter_text_and_speaker(text, color, background)
245
  length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
246
  if length_of_text > 10:
247
  rectangle_point2 = (length_of_text+50, text_position[1]+10)
248
+ cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
249
  cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
250
+ else:
251
+ text_position = (50, int(captured_video_height)-50)
252
  rectangle_point1 = (40, text_position[1]-30)
253
+ rectangle_point2 = (int(captured_video_width)-50, text_position[1]+10)
254
+ if text is not None:
255
+ text, color, background = get_filter_text_and_speaker(text, color, background)
256
+ length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
257
+ if length_of_text > 10:
258
+ rectangle_point2 = (length_of_text+50, text_position[1]+10)
259
+ cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
260
+ cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  # Update the progress bar
263
  progress_bar.update(1)
translate_transcriptions.py CHANGED
@@ -118,8 +118,9 @@ def main(transcription_file, source_languaje, target_languaje, translate_model,
118
  translated_transcription_time_stamps += f"{line}\n"
119
  else:
120
  if (i < len(translated_transcription_list)):
121
- if translated_transcription_list[i][0] == " ": # Remove space at the beginning
122
- translated_transcription_list[i] = translated_transcription_list[i][1:]
 
123
  speaker = ""
124
  if re.match(r"\[SPEAKER_\d\d\]:", line):
125
  speaker = re.match(r"\[SPEAKER_\d\d\]:", line).group(0)
 
118
  translated_transcription_time_stamps += f"{line}\n"
119
  else:
120
  if (i < len(translated_transcription_list)):
121
+ if len(translated_transcription_list[i]) > 0:
122
+ if translated_transcription_list[i][0] == " ": # Remove space at the beginning
123
+ translated_transcription_list[i] = translated_transcription_list[i][1:]
124
  speaker = ""
125
  if re.match(r"\[SPEAKER_\d\d\]:", line):
126
  speaker = re.match(r"\[SPEAKER_\d\d\]:", line).group(0)