Maximofn committed on
Commit
c017aa5
1 Parent(s): 098e9c6

Translate big blocks of text to get more context

Browse files
Files changed (1) hide show
  1. translate_transcriptions.py +81 -13
translate_transcriptions.py CHANGED
@@ -5,6 +5,10 @@ import argparse
5
  import re
6
  from tqdm import tqdm
7
 
 
 
 
 
8
  language_dict = {}
9
  # Iterate over the LANGUAGE_NAME_TO_CODE dictionary
10
  for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
@@ -19,8 +23,6 @@ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
19
  "translator": language_code
20
  }
21
 
22
-
23
-
24
  def translate(transcribed_text, source_languaje, target_languaje, translate_model, translate_tokenizer, device="cpu"):
25
  # Get source and target language codes
26
  source_languaje_code = language_dict[source_languaje]["translator"]
@@ -44,26 +46,92 @@ def main(transcription_file, source_languaje, target_languaje, translate_model,
44
  with open(transcription_file, "r") as f:
45
  transcription = f.read().splitlines()
46
 
47
- # Translate
48
- translate_transcription = ""
49
- progress_bar = tqdm(total=len(transcription), desc='Translating transcription progress')
50
  for line in transcription:
51
  if re.match(r"\d+$", line):
52
- translate_transcription += f"{line}\n"
53
  elif re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
54
- translate_transcription += f"{line}\n"
55
  elif re.match(r"^$", line):
56
- translate_transcription += f"{line}\n"
57
  else:
58
- translated = translate(line, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
59
- # translated = line
60
- translate_transcription += f"{translated}\n"
61
  progress_bar.update(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- # Save translation
64
  output_file = f"{output_folder}/{transcription_file_name}_{target_languaje}.srt"
65
  with open(output_file, "w") as f:
66
- f.write(translate_transcription)
67
 
68
  if __name__ == "__main__":
69
  parser = argparse.ArgumentParser()
 
5
  import re
6
  from tqdm import tqdm
7
 
8
+ MAX_LENGTH = 500
9
+ MAGIC_STRING = "[$&]"
10
+ DEBUG = False
11
+
12
  language_dict = {}
13
  # Iterate over the LANGUAGE_NAME_TO_CODE dictionary
14
  for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
 
23
  "translator": language_code
24
  }
25
 
 
 
26
  def translate(transcribed_text, source_languaje, target_languaje, translate_model, translate_tokenizer, device="cpu"):
27
  # Get source and target language codes
28
  source_languaje_code = language_dict[source_languaje]["translator"]
 
46
  with open(transcription_file, "r") as f:
47
  transcription = f.read().splitlines()
48
 
49
+ # Concatenate transcriptions
50
+ raw_transcription = ""
51
+ progress_bar = tqdm(total=len(transcription), desc='Concatenate transcriptions progress')
52
  for line in transcription:
53
  if re.match(r"\d+$", line):
54
+ pass
55
  elif re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
56
+ pass
57
  elif re.match(r"^$", line):
58
+ pass
59
  else:
60
+ line = re.sub(r"\[SPEAKER_\d\d\]:", MAGIC_STRING, line)
61
+ raw_transcription += f"{line} "
 
62
  progress_bar.update(1)
63
+ progress_bar.close()
64
+
65
+ # Save raw transcription
66
+ if DEBUG:
67
+ output_file = f"{output_folder}/{transcription_file_name}_raw.srt"
68
+ with open(output_file, "w") as f:
69
+ f.write(raw_transcription)
70
+
71
+ # Split raw transcription
72
+ raw_transcription_list = raw_transcription.split(MAGIC_STRING)
73
+ if raw_transcription_list[0] == "":
74
+ raw_transcription_list = raw_transcription_list[1:]
75
+
76
+ # Concatenate segments while the combined length stays below MAX_LENGTH; translate each accumulated chunk once the next segment would not fit
77
+ translated_transcription = ""
78
+ concatenate_transcription = raw_transcription_list[0] + MAGIC_STRING
79
+ progress_bar = tqdm(total=len(raw_transcription_list), desc='Translate transcriptions progress')
80
+ progress_bar.update(1)
81
+ if len(raw_transcription_list) > 1:
82
+ for transcription in raw_transcription_list[1:]:
83
+ if len(concatenate_transcription) + len(transcription) < MAX_LENGTH:
84
+ concatenate_transcription += transcription + MAGIC_STRING
85
+ else:
86
+ translation = translate(concatenate_transcription, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
87
+ translated_transcription += translation
88
+ concatenate_transcription = transcription + MAGIC_STRING
89
+ progress_bar.update(1)
90
+ # Translate last part
91
+ translation = translate(concatenate_transcription, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
92
+ translated_transcription += translation
93
+ else:
94
+ translated_transcription = translate(concatenate_transcription, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
95
+ progress_bar.close()
96
+
97
+ # Save translated transcription raw
98
+ if DEBUG:
99
+ output_file = f"{output_folder}/{transcription_file_name}_{target_languaje}_raw.srt"
100
+ with open(output_file, "w") as f:
101
+ f.write(translated_transcription)
102
+
103
+ # Read transcription
104
+ with open(transcription_file, "r") as f:
105
+ transcription = f.read().splitlines()
106
+
107
+ # Add time stamps
108
+ translated_transcription_time_stamps = ""
109
+ translated_transcription_list = translated_transcription.split(MAGIC_STRING)
110
+ progress_bar = tqdm(total=len(translated_transcription_list), desc='Add time stamps to translated transcriptions progress')
111
+ i = 0
112
+ for line in transcription:
113
+ if re.match(r"\d+$", line):
114
+ translated_transcription_time_stamps += f"{line}\n"
115
+ elif re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
116
+ translated_transcription_time_stamps += f"{line}\n"
117
+ elif re.match(r"^$", line):
118
+ translated_transcription_time_stamps += f"{line}\n"
119
+ else:
120
+ if (i < len(translated_transcription_list)):
121
+ if translated_transcription_list[i][0] == " ": # Remove space at the beginning
122
+ translated_transcription_list[i] = translated_transcription_list[i][1:]
123
+ speaker = ""
124
+ if re.match(r"\[SPEAKER_\d\d\]:", line):
125
+ speaker = re.match(r"\[SPEAKER_\d\d\]:", line).group(0)
126
+ translated_transcription_time_stamps += f"{speaker} {translated_transcription_list[i]}\n"
127
+ i += 1
128
+ progress_bar.update(1)
129
+ progress_bar.close()
130
 
131
+ # Save translated transcription
132
  output_file = f"{output_folder}/{transcription_file_name}_{target_languaje}.srt"
133
  with open(output_file, "w") as f:
134
+ f.write(translated_transcription_time_stamps)
135
 
136
  if __name__ == "__main__":
137
  parser = argparse.ArgumentParser()