Spaces:

RASMUS
/

Whisper-youtube-crosslingual-subtitles

Running

App Files Files Community

RASMUS committed on Dec 30, 2022

Commit

7e2e27e

•

1 Parent(s): 668f0a9

Update app.py

Browse files

Files changed (1) hide show

app.py +244 -174

app.py CHANGED Viewed

@@ -1,14 +1,10 @@
 import os
-# Download and build ggerganov/whisper.cpp. Kudos to this man for a wonderful whisper implementation!
-# This means speed!
 os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
 os.system('make -C ./whisper.cpp')
-# Download models, add finetuned languages later once whisper finetuning event is ready
-# Models are downloaded on the fly so we can get quite many models :)
 os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
 os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
 os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
@@ -21,144 +17,149 @@ os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en')
 #print("MOI")
 import gradio as gr
 from pathlib import Path
 import pysrt
 import pandas as pd
 import re
 import time
-import os
-import json
-import requests
 from pytube import YouTube
-from transformers import MarianMTModel, MarianTokenizer
 import psutil
 num_cores = psutil.cpu_count()
 os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
 headers = {'Authorization': os.environ['DeepL_API_KEY']}
 whisper_models = ["base", "small", "medium", "large", "base.en"]
 LANGUAGES = {
-    "en": "english",
-    "zh": "chinese",
-    "de": "german",
-    "es": "spanish",
-    "ru": "russian",
-    "ko": "korean",
-    "fr": "french",
-    "ja": "japanese",
-    "pt": "portuguese",
-    "tr": "turkish",
-    "pl": "polish",
-    "ca": "catalan",
-    "nl": "dutch",
-    "ar": "arabic",
-    "sv": "swedish",
-    "it": "italian",
-    "id": "indonesian",
-    "hi": "hindi",
-    "fi": "finnish",
-    "vi": "vietnamese",
-    "he": "hebrew",
-    "uk": "ukrainian",
-    "el": "greek",
-    "ms": "malay",
-    "cs": "czech",
-    "ro": "romanian",
-    "da": "danish",
-    "hu": "hungarian",
-    "ta": "tamil",
-    "no": "norwegian",
-    "th": "thai",
-    "ur": "urdu",
-    "hr": "croatian",
-    "bg": "bulgarian",
-    "lt": "lithuanian",
-    "la": "latin",
-    "mi": "maori",
-    "ml": "malayalam",
-    "cy": "welsh",
-    "sk": "slovak",
-    "te": "telugu",
-    "fa": "persian",
-    "lv": "latvian",
-    "bn": "bengali",
-    "sr": "serbian",
-    "az": "azerbaijani",
-    "sl": "slovenian",
-    "kn": "kannada",
-    "et": "estonian",
-    "mk": "macedonian",
-    "br": "breton",
-    "eu": "basque",
-    "is": "icelandic",
-    "hy": "armenian",
-    "ne": "nepali",
-    "mn": "mongolian",
-    "bs": "bosnian",
-    "kk": "kazakh",
-    "sq": "albanian",
-    "sw": "swahili",
-    "gl": "galician",
-    "mr": "marathi",
-    "pa": "punjabi",
-    "si": "sinhala",
-    "km": "khmer",
-    "sn": "shona",
-    "yo": "yoruba",
-    "so": "somali",
-    "af": "afrikaans",
-    "oc": "occitan",
-    "ka": "georgian",
-    "be": "belarusian",
-    "tg": "tajik",
-    "sd": "sindhi",
-    "gu": "gujarati",
-    "am": "amharic",
-    "yi": "yiddish",
-    "lo": "lao",
-    "uz": "uzbek",
-    "fo": "faroese",
-    "ht": "haitian creole",
-    "ps": "pashto",
-    "tk": "turkmen",
-    "nn": "nynorsk",
-    "mt": "maltese",
-    "sa": "sanskrit",
-    "lb": "luxembourgish",
-    "my": "myanmar",
-    "bo": "tibetan",
-    "tl": "tagalog",
-    "mg": "malagasy",
-    "as": "assamese",
-    "tt": "tatar",
-    "haw": "hawaiian",
-    "ln": "lingala",
-    "ha": "hausa",
-    "ba": "bashkir",
-    "jw": "javanese",
-    "su": "sundanese",
 }
 # language code lookup by name, with a few language aliases
 source_languages = {
     **{language: code for code, language in LANGUAGES.items()},
-    "burmese": "my",
-    "valencian": "ca",
-    "flemish": "nl",
-    "haitian": "ht",
-    "letzeburgesch": "lb",
-    "pushto": "ps",
-    "panjabi": "pa",
-    "moldavian": "ro",
-    "moldovan": "ro",
-    "sinhalese": "si",
-    "castilian": "es",
     "Let the model analyze": "Let the model analyze"
 }
@@ -193,12 +194,16 @@ DeepL_language_codes_for_translation = {
 }
 transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
 source_language_list = [key[0] for key in source_languages.items()]
 translation_models_list = [key[0] for key in DeepL_language_codes_for_translation.items()]
 videos_out_path = Path("./videos_out")
 videos_out_path.mkdir(parents=True, exist_ok=True)
@@ -228,7 +233,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
     This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
     """
-    if(video_file_path == None):
         raise ValueError("Error no video input")
     print(video_file_path)
     try:
@@ -244,9 +249,12 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
         os.system(f'rm -f {srt_path}')
         if selected_source_lang == "Let the model analyze":
-            os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
         else:
-            os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
         print("starting whisper done with whisper")
     except Exception as e:
         raise RuntimeError("Error converting video to audio")
@@ -294,7 +302,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
 def translate_transcriptions(df, selected_translation_lang_2):
     if selected_translation_lang_2 is None:
-            selected_translation_lang_2 = 'english'
     df.reset_index(inplace=True)
     print("start_translation")
@@ -313,35 +321,61 @@ def translate_transcriptions(df, selected_translation_lang_2):
     'tag_spitting': 'xml',
     'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2)
            }
-    response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
-    # Print the response from the server
-    translated_sentences = json.loads(response.text)
-    translated_sentences = translated_sentences['translations'][0]['text'].split('\n')
-    df['translation'] = translated_sentences
     print("translations done")
-    return df
-def create_srt_and_burn(df, video_in):
-    print("Starting creation of video wit srt")
-    print("video in path is:")
-    print(video_in)
-    with open('testi.srt','w', encoding="utf-8") as file:
         for i in range(len(df)):
             file.write(str(i+1))
             file.write('\n')
             start = df.iloc[i]['start']
-            file.write(f"{start}")
             stop = df.iloc[i]['end']
@@ -353,30 +387,50 @@ def create_srt_and_burn(df, video_in):
             if int(i) != len(df)-1:
                 file.write('\n\n')
-    print("SRT DONE")
-    try:
-        file1 = open('./testi.srt', 'r', encoding="utf-8")
-        Lines = file1.readlines()
-        count = 0
-        # Strips the newline character
-        for line in Lines:
-            count += 1
-            print("{}".format(line))
-        print(type(video_in))
-        print(video_in)
-        video_out = video_in.replace('.mp4', '_out.mp4')
-        print("video_out_path")
-        print(video_out)
-        command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
-        print(command)
-        os.system(command)
-        return video_out
-    except Exception as e:
-        print(e)
-        return video_out
 # ---- Gradio Layout -----
@@ -386,7 +440,7 @@ video_out = gr.Video(label="Video Out", mirror_webcam=False)
-df_init = pd.DataFrame(columns=['start','end','text'])
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
 selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
@@ -395,6 +449,15 @@ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
 transcription_and_translation_df = gr.DataFrame(value=df_init,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
 demo = gr.Blocks(css='''
 #cut_btn, #reset_btn { align-self:stretch; }
@@ -464,21 +527,28 @@ with demo:
             ##### Here you will can translate transcriptions to 26 languages.
             ##### If spoken language is not in the list, translation might not work. In this case original transcriptions are used
             ##### ''')
-            selected_translation_lang_2.render()
             translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
-            translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], transcription_and_translation_df)
             transcription_and_translation_df.render()
     with gr.Row():
         with gr.Column():
             gr.Markdown('''
             ##### Now press the Step 4. Button to create output video with translated transcriptions
             ##### ''')
-            translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
             print(video_in)
-            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df,video_in], [
-                video_out])
-            video_out.render()
 demo.launch()

 import os
+import requests
+import json
+import base64
 os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
 os.system('make -C ./whisper.cpp')
 os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
 os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
 os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
 #print("MOI")
 import gradio as gr
 from pathlib import Path
 import pysrt
 import pandas as pd
 import re
 import time
 from pytube import YouTube
+#from transformers import MarianMTModel, MarianTokenizer
 import psutil
 num_cores = psutil.cpu_count()
 os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
 headers = {'Authorization': os.environ['DeepL_API_KEY']}
+import torch
 whisper_models = ["base", "small", "medium", "large", "base.en"]
+custom_models = ["belarus-small"]
+combined_models = []
+combined_models.extend(whisper_models)
+combined_models.extend(custom_models)
 LANGUAGES = {
+    "en": "English",
+    "zh": "Chinese",
+    "de": "German",
+    "es": "Spanish",
+    "ru": "Russian",
+    "ko": "Korean",
+    "fr": "French",
+    "ja": "Japanese",
+    "pt": "Portuguese",
+    "tr": "Turkish",
+    "pl": "Polish",
+    "ca": "Catalan",
+    "nl": "Dutch",
+    "ar": "Arabic",
+    "sv": "Swedish",
+    "it": "Italian",
+    "id": "Indonesian",
+    "hi": "Hindi",
+    "fi": "Finnish",
+    "vi": "Vietnamese",
+    "he": "Hebrew",
+    "uk": "Ukrainian",
+    "el": "Greek",
+    "ms": "Malay",
+    "cs": "Czech",
+    "ro": "Romanian",
+    "da": "Danish",
+    "hu": "Hungarian",
+    "ta": "Tamil",
+    "no": "Norwegian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "hr": "Croatian",
+    "bg": "Bulgarian",
+    "lt": "Lithuanian",
+    "la": "Latin",
+    "mi": "Maori",
+    "ml": "Malayalam",
+    "cy": "Welsh",
+    "sk": "Slovak",
+    "te": "Telugu",
+    "fa": "Persian",
+    "lv": "Latvian",
+    "bn": "Bengali",
+    "sr": "Serbian",
+    "az": "Azerbaijani",
+    "sl": "Slovenian",
+    "kn": "Kannada",
+    "et": "Estonian",
+    "mk": "Macedonian",
+    "br": "Breton",
+    "eu": "Basque",
+    "is": "Icelandic",
+    "hy": "Armenian",
+    "ne": "Nepali",
+    "mn": "Mongolian",
+    "bs": "Bosnian",
+    "kk": "Kazakh",
+    "sq": "Albanian",
+    "sw": "Swahili",
+    "gl": "Galician",
+    "mr": "Marathi",
+    "pa": "Punjabi",
+    "si": "Sinhala",
+    "km": "Khmer",
+    "sn": "Shona",
+    "yo": "Yoruba",
+    "so": "Somali",
+    "af": "Afrikaans",
+    "oc": "Occitan",
+    "ka": "Georgian",
+    "be": "Belarusian",
+    "tg": "Tajik",
+    "sd": "Sindhi",
+    "gu": "Gujarati",
+    "am": "Amharic",
+    "yi": "Yiddish",
+    "lo": "Lao",
+    "uz": "Uzbek",
+    "fo": "Faroese",
+    "ht": "Haitian creole",
+    "ps": "Pashto",
+    "tk": "Turkmen",
+    "nn": "Nynorsk",
+    "mt": "Maltese",
+    "sa": "Sanskrit",
+    "lb": "Luxembourgish",
+    "my": "Myanmar",
+    "bo": "Tibetan",
+    "tl": "Tagalog",
+    "mg": "Malagasy",
+    "as": "Assamese",
+    "tt": "Tatar",
+    "haw": "Hawaiian",
+    "ln": "Lingala",
+    "ha": "Hausa",
+    "ba": "Bashkir",
+    "jw": "Javanese",
+    "su": "Sundanese",
 }
 # language code lookup by name, with a few language aliases
 source_languages = {
     **{language: code for code, language in LANGUAGES.items()},
+    "Burmese": "my",
+    "Valencian": "ca",
+    "Flemish": "nl",
+    "Haitian": "ht",
+    "Letzeburgesch": "lb",
+    "Pushto": "ps",
+    "Panjabi": "pa",
+    "Moldavian": "ro",
+    "Moldovan": "ro",
+    "Sinhalese": "si",
+    "Castilian": "es",
     "Let the model analyze": "Let the model analyze"
 }
 }
 transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
 source_language_list = [key[0] for key in source_languages.items()]
 translation_models_list = [key[0] for key in DeepL_language_codes_for_translation.items()]
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE IS: ")
+print(device)
 videos_out_path = Path("./videos_out")
 videos_out_path.mkdir(parents=True, exist_ok=True)
     This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
     """
+if(video_file_path == None):
         raise ValueError("Error no video input")
     print(video_file_path)
     try:
         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
         os.system(f'rm -f {srt_path}')
         if selected_source_lang == "Let the model analyze":
+            os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l "auto" -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
         else:
+            if whisper_model in custom_models:
+                os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./converted_models/ggml-{whisper_model}.bin -osrt')
+            else:
+                os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
         print("starting whisper done with whisper")
     except Exception as e:
         raise RuntimeError("Error converting video to audio")
 def translate_transcriptions(df, selected_translation_lang_2):
     if selected_translation_lang_2 is None:
+            selected_translation_lang_2 = 'English'
     df.reset_index(inplace=True)
     print("start_translation")
     'tag_spitting': 'xml',
     'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2)
            }
+    try:
+        response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
+        # Parse the JSON response from the server
+        translated_sentences = json.loads(response.text)
+        translated_sentences = translated_sentences['translations'][0]['text'].split('\n')
+        df['translation'] = translated_sentences
+    except Exception as e:
+        print("EXCEPTION WITH DEEPL API")
+        print(e)
+        df['translation'] = df['text']
     print("translations done")
+    print("Starting SRT-file creation")
+    print(df.head())
+    df.reset_index(inplace=True)
+    with open('subtitles.vtt','w', encoding="utf-8") as file:
+        print("Starting WEBVTT-file creation")
+        for i in range(len(df)):
+            if i == 0:
+                file.write('WEBVTT')
+                file.write('\n')
+            else:
+                file.write(str(i+1))
+                file.write('\n')
+                start = df.iloc[i]['start']
+                file.write(f"{start.strip()}")
+                stop = df.iloc[i]['end']
+                file.write(' --> ')
+                file.write(f"{stop}")
+                file.write('\n')
+                file.writelines(df.iloc[i]['translation'])
+                if int(i) != len(df)-1:
+                    file.write('\n\n')
+    print("WEBVTT DONE")
+    with open('subtitles.srt','w', encoding="utf-8") as file:
+        print("Starting SRT-file creation")
         for i in range(len(df)):
             file.write(str(i+1))
             file.write('\n')
             start = df.iloc[i]['start']
+            file.write(f"{start.strip()}")
             stop = df.iloc[i]['end']
             if int(i) != len(df)-1:
                 file.write('\n\n')
+    print("SRT DONE")
+    subtitle_files = ['subtitles.vtt','subtitles.srt']
+    return df, subtitle_files
+# def burn_srt_to_video(srt_file, video_in):
+#     print("Starting creation of video wit srt")
+#     try:
+#         video_out = video_in.replace('.mp4', '_out.mp4')
+#         print(os.system('ls -lrth'))
+#         print(video_in)
+#         print(video_out)
+#         command = 'ffmpeg -i "{}" -y -vf subtitles=./subtitles.srt "{}"'.format(video_in, video_out)
+#         os.system(command)
+#         return video_out
+#     except Exception as e:
+#         print(e)
+#         return video_out
def create_video_player(subtitle_files, video_in):
    """Build an HTML5 ``<video>`` element with the video and subtitles inlined.

    The file at ``video_in`` and the WebVTT file ``./subtitles.vtt`` are read
    in full, base64-encoded, and embedded as ``data:`` URIs, so the returned
    markup carries both payloads itself.

    Args:
        subtitle_files: Passed in by the Gradio click handler but not read
            here; the subtitle track always comes from ``./subtitles.vtt``.
        video_in: Path of the video file to embed. It is declared to the
            browser as ``video/mp4`` regardless of its actual container.

    Returns:
        The player markup as a string.

    Raises:
        OSError: If ``video_in`` or ``./subtitles.vtt`` cannot be opened.
    """
    # base64 output is pure ASCII, so decode it directly instead of slicing
    # the ``b'...'`` repr of the bytes object.
    with open(video_in, "rb") as file:
        video_base64 = base64.b64encode(file.read()).decode("ascii")
    with open('./subtitles.vtt', "rb") as file:
        subtitle_base64 = base64.b64encode(file.read()).decode("ascii")
    # NOTE(review): the track label/srclang are fixed to English even though
    # the subtitles may be in another target language -- confirm if intended.
    video_player = f'''<video id="video" controls preload="metadata">
      <source src="data:video/mp4;base64,{video_base64}" type="video/mp4" />
      <track
        label="English"
        kind="subtitles"
        srclang="en"
        src="data:text/vtt;base64,{subtitle_base64}"
        default />
    </video>
    '''
    return video_player
 # ---- Gradio Layout -----
+df_init = pd.DataFrame(columns=['start','end','text', 'translation'])
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
 selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
 transcription_and_translation_df = gr.DataFrame(value=df_init,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+subtitle_files = gr.File(
+                label="Download srt-file",
+                file_count="multiple",
+                type="file",
+                interactive=False,
+            )
+video_player = gr.HTML('<p>video will be played here after you press the button at step 4')
 demo = gr.Blocks(css='''
 #cut_btn, #reset_btn { align-self:stretch; }
             ##### Here you will can translate transcriptions to 26 languages.
             ##### If spoken language is not in the list, translation might not work. In this case original transcriptions are used
             ##### ''')
+          selected_translation_lang_2.render()
             translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
+            translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], [transcription_and_translation_df, subtitle_files])
             transcription_and_translation_df.render()
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''##### From here you can download the srt-file ''')
+            subtitle_files.render()
     with gr.Row():
         with gr.Column():
             gr.Markdown('''
             ##### Now press the Step 4. Button to create output video with translated transcriptions
             ##### ''')
+            create_video_button = gr.Button("Step 4. Create and add subtitles to video")
             print(video_in)
+            create_video_button.click(create_video_player, [subtitle_files,video_in], [
+                video_player])
+            video_player.render()
 demo.launch()