Spaces:

Maximofn
/

subtify

Running

App Files Files Community

Maximofn commited on Nov 1, 2023

Commit

c017aa5

•

1 Parent(s): 098e9c6

Translate big blocks of text to get more context

Browse files

Files changed (1) hide show

translate_transcriptions.py +81 -13

translate_transcriptions.py CHANGED Viewed

@@ -5,6 +5,10 @@ import argparse
 import re
 from tqdm import tqdm
 language_dict = {}
 # Iterate over the LANGUAGE_NAME_TO_CODE dictionary
 for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
@@ -19,8 +23,6 @@ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
             "translator": language_code
         }
 def translate(transcribed_text, source_languaje, target_languaje, translate_model, translate_tokenizer, device="cpu"):
     # Get source and target languaje codes
     source_languaje_code = language_dict[source_languaje]["translator"]
@@ -44,26 +46,92 @@ def main(transcription_file, source_languaje, target_languaje, translate_model,
     with open(transcription_file, "r") as f:
         transcription = f.read().splitlines()
-    # Translate
-    translate_transcription = ""
-    progress_bar = tqdm(total=len(transcription), desc='Translating transcription progress')
     for line in transcription:
         if re.match(r"\d+$", line):
-            translate_transcription += f"{line}\n"
         elif re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
-            translate_transcription += f"{line}\n"
         elif re.match(r"^$", line):
-            translate_transcription += f"{line}\n"
         else:
-            translated = translate(line, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
-            # translated = line
-            translate_transcription += f"{translated}\n"
         progress_bar.update(1)
-    # Save translation
     output_file = f"{output_folder}/{transcription_file_name}_{target_languaje}.srt"
     with open(output_file, "w") as f:
-        f.write(translate_transcription)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

 import re
 from tqdm import tqdm
+MAX_LENGTH = 500
+MAGIC_STRING = "[$&]"
+DEBUG = False
 language_dict = {}
 # Iterate over the LANGUAGE_NAME_TO_CODE dictionary
 for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
             "translator": language_code
         }
 def translate(transcribed_text, source_languaje, target_languaje, translate_model, translate_tokenizer, device="cpu"):
     # Get source and target languaje codes
     source_languaje_code = language_dict[source_languaje]["translator"]
     with open(transcription_file, "r") as f:
         transcription = f.read().splitlines()
+    # Concatenate transcriptions
+    raw_transcription = ""
+    progress_bar = tqdm(total=len(transcription), desc='Concatenate transcriptions progress')
     for line in transcription:
         if re.match(r"\d+$", line):
+            pass
         elif re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
+            pass
         elif re.match(r"^$", line):
+            pass
         else:
+            line = re.sub(r"\[SPEAKER_\d\d\]:", MAGIC_STRING, line)
+            raw_transcription += f"{line} "
         progress_bar.update(1)
+    progress_bar.close()
+    # Save raw transcription
+    if DEBUG:
+        output_file = f"{output_folder}/{transcription_file_name}_raw.srt"
+        with open(output_file, "w") as f:
+            f.write(raw_transcription)
+    # Split raw transcription
+    raw_transcription_list = raw_transcription.split(MAGIC_STRING)
+    if raw_transcription_list[0] == "":
+        raw_transcription_list = raw_transcription_list[1:]
+    # Concatenate transcripts and translate when length is less than MAX_LENGTH
+    translated_transcription = ""
+    concatenate_transcription = raw_transcription_list[0] + MAGIC_STRING
+    progress_bar = tqdm(total=len(raw_transcription_list), desc='Translate transcriptions progress')
+    progress_bar.update(1)
+    if len(raw_transcription_list) > 1:
+        for transcription in raw_transcription_list[1:]:
+            if len(concatenate_transcription) + len(transcription) < MAX_LENGTH:
+                concatenate_transcription += transcription + MAGIC_STRING
+            else:
+                translation = translate(concatenate_transcription, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
+                translated_transcription += translation
+                concatenate_transcription = transcription + MAGIC_STRING
+            progress_bar.update(1)
+        # Translate last part
+        translation = translate(concatenate_transcription, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
+        translated_transcription += translation
+    else:
+        translated_transcription = translate(concatenate_transcription, source_languaje, target_languaje, translate_model, translate_tokenizer, device)
+    progress_bar.close()
+    # Save translated transcription raw
+    if DEBUG:
+        output_file = f"{output_folder}/{transcription_file_name}_{target_languaje}_raw.srt"
+        with open(output_file, "w") as f:
+            f.write(translated_transcription)
+    # Read transcription
+    with open(transcription_file, "r") as f:
+        transcription = f.read().splitlines()
+    # Add time stamps
+    translated_transcription_time_stamps = ""
+    translated_transcription_list = translated_transcription.split(MAGIC_STRING)
+    progress_bar = tqdm(total=len(translated_transcription_list), desc='Add time stamps to translated transcriptions progress')
+    i = 0
+    for line in transcription:
+        if re.match(r"\d+$", line):
+            translated_transcription_time_stamps += f"{line}\n"
+        elif re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
+            translated_transcription_time_stamps += f"{line}\n"
+        elif re.match(r"^$", line):
+            translated_transcription_time_stamps += f"{line}\n"
+        else:
+            if (i < len(translated_transcription_list)):
+                if translated_transcription_list[i][0] == " ": # Remove space at the beginning
+                    translated_transcription_list[i] = translated_transcription_list[i][1:]
+                speaker = ""
+                if re.match(r"\[SPEAKER_\d\d\]:", line):
+                    speaker = re.match(r"\[SPEAKER_\d\d\]:", line).group(0)
+                translated_transcription_time_stamps += f"{speaker} {translated_transcription_list[i]}\n"
+                i += 1
+                progress_bar.update(1)
+    progress_bar.close()
+    # Save translated transcription
     output_file = f"{output_folder}/{transcription_file_name}_{target_languaje}.srt"
     with open(output_file, "w") as f:
+        f.write(translated_transcription_time_stamps)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()