Spaces:
Runtime error
Runtime error
import os | |
# Fetch and compile ggerganov/whisper.cpp -- kudos to its author for a
# wonderful Whisper implementation. The C++ port is what makes this fast.
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
os.system('make -C ./whisper.cpp')

# Download the ggml model files. Fine-tuned language models can be added later,
# once the Whisper fine-tuning event is ready. Models are fetched on the fly,
# so offering quite a few of them is possible.
for _model_name in ('small', 'base', 'medium', 'base.en'):
    os.system(f'bash ./whisper.cpp/models/download-ggml-model.sh {_model_name}')

# Manual smoke tests against the bundled sample, kept disabled:
#os.system('./whisper.cpp/main -m whisper.cpp/models/ggml-base.en.bin -f whisper.cpp/samples/jfk.wav')
#os.system('./whisper.cpp/main -m whisper.cpp/models/ggml-small.bin -f whisper.cpp/samples/jfk.wav')
import gradio as gr | |
from pathlib import Path | |
import pysrt | |
import pandas as pd | |
import re | |
import time | |
import os | |
import json | |
import requests | |
from pytube import YouTube | |
from transformers import MarianMTModel, MarianTokenizer | |
import psutil | |
# Export the CPU count reported by psutil as the OpenMP thread count.
num_cores = psutil.cpu_count()
os.environ["OMP_NUM_THREADS"] = str(num_cores)

# Headers sent with every DeepL request; the secret is read from the
# environment when the module is imported, so a missing variable fails here.
headers = {'Authorization': os.environ['DeepL_API_KEY']}

# Names of the Whisper models selectable in the UI.
whisper_models = ["base", "small", "medium", "base.en"]
# Display name -> language code. The display names populate the
# "Spoken language" dropdown and the code is what speech_to_text hands to
# whisper.cpp through its -l flag.
# NOTE(review): not every code listed here is necessarily known to Whisper
# (for example, Whisper's own table is believed to use "jw" for Javanese) --
# verify against the language list of the whisper.cpp version in use.
source_languages = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan; Valencian": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic; Scottish Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian; Haitian Creole": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
    "Igbo": "ig",
    # Was "lo", which is Lao's code: choosing Iloko silently selected Lao.
    "Iloko": "ilo",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Central Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Luxembourgish; Letzeburgesch": "lb",
    "Ganda": "lg",
    "Lingala": "ln",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Malagasy": "mg",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Burmese": "my",
    "Nepali": "ne",
    "Dutch; Flemish": "nl",
    "Norwegian": "no",
    "Northern Sotho": "ns",
    "Occitan (post 1500)": "oc",
    "Oriya": "or",
    "Panjabi; Punjabi": "pa",
    "Polish": "pl",
    "Pushto; Pashto": "ps",
    "Portuguese": "pt",
    "Romanian; Moldavian; Moldovan": "ro",
    "Russian": "ru",
    "Sindhi": "sd",
    "Sinhala; Sinhalese": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Swati": "ss",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Tswana": "tn",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Wolof": "wo",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Chinese": "zh",
    "Zulu": "zu",
    # Sentinel entry: speech_to_text compares against this exact string.
    "Let the model analyze": "Let the model analyze",
}
# Display name -> DeepL target-language code. The display names populate the
# translation dropdown; the code is sent as ``target_lang``.
DeepL_language_codes_for_translation = dict(
    Bulgarian="BG",
    Czech="CS",
    Danish="DA",
    German="DE",
    Greek="EL",
    English="EN",
    Spanish="ES",
    Estonian="ET",
    Finnish="FI",
    French="FR",
    Hungarian="HU",
    Indonesian="ID",
    Italian="IT",
    Japanese="JA",
    Lithuanian="LT",
    Latvian="LV",
    Dutch="NL",
    Polish="PL",
    Portuguese="PT",
    Romanian="RO",
    Russian="RU",
    Slovak="SK",
    Slovenian="SL",
    Swedish="SV",
    Turkish="TR",
    Ukrainian="UK",
    Chinese="ZH",
)
# Whisper decoding options (not referenced again in the visible code).
transcribe_options = {"beam_size": 3, "best_of": 3, "without_timestamps": False}

# Dropdown choices: the display names of both tables, in declaration order.
source_language_list = list(source_languages)
translation_models_list = list(DeepL_language_codes_for_translation)

# Make sure the output folder exists before the UI starts.
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)
def get_youtube(video_url):
    """Download the highest-resolution progressive MP4 stream of a video.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        Whatever path pytube's ``download()`` reports for the saved file.
    """
    mp4_streams = YouTube(video_url).streams.filter(progressive=True, file_extension='mp4')
    best_stream = mp4_streams.order_by('resolution').desc().first()
    saved_path = best_stream.download()
    # Log where the file ended up (the message is Finnish for "downloaded to path").
    print("LADATATTU POLKUUN")
    print(saved_path)
    return saved_path
def _format_timestamp(moment):
    """Render a pysrt time value as zero-padded ``HH:MM:SS.mmm``."""
    return (
        f"{int(moment.hours):02d}:{int(moment.minutes):02d}:"
        f"{int(moment.seconds):02d}.{int(moment.milliseconds):03d}"
    )


def _srt_to_dataframe(srt_path):
    """Load an SRT file into a DataFrame with 'start', 'end' and 'text' columns."""
    rows = [
        {
            'start': _format_timestamp(sub.start),
            'end': _format_timestamp(sub.end),
            'text': sub.text,
        }
        for sub in pysrt.open(srt_path)
    ]
    # Passing ``columns`` keeps the three columns present even with no subtitles.
    return pd.DataFrame(rows, columns=['start', 'end', 'text'])


def speech_to_text(video_file_path, selected_source_lang, whisper_model):
    """Transcribe a video with whisper.cpp and return the subtitles as a table.

    The audio track is converted with ffmpeg to a 16 kHz mono 16-bit PCM WAV
    file next to the input, whisper.cpp is run on it with SRT output enabled,
    and the resulting ``<name>.wav.srt`` file is parsed.

    Args:
        video_file_path: Path of the input video file.
        selected_source_lang: A key of ``source_languages``; the special value
            "Let the model analyze" requests language auto-detection.
        whisper_model: Model name used to build
            ``./whisper.cpp/models/ggml-<name>.bin``.

    Returns:
        A pandas DataFrame with string columns 'start', 'end' (formatted as
        ``HH:MM:SS.mmm``) and 'text', one row per subtitle.

    Raises:
        ValueError: No video path was given, or the language is unknown.
        RuntimeError: ffmpeg or whisper.cpp failed, or the SRT output could
            not be read.
    """
    import subprocess

    if video_file_path is None:
        raise ValueError("Error no video input")
    print(video_file_path)

    if selected_source_lang == "Let the model analyze":
        # NOTE(review): whisper.cpp's CLI is understood to fall back to its
        # default language when -l is omitted; "auto" asks it to detect the
        # language instead -- confirm against the checked-out version.
        language_code = "auto"
    else:
        language_code = source_languages.get(selected_source_lang)
        if language_code is None:
            raise ValueError(f"Unknown source language: {selected_source_lang!r}")

    video_file_path = str(video_file_path)
    # Swap only the real extension; str.replace() would also rewrite any
    # earlier occurrence of it in the path and misbehaves when it is empty.
    stem, file_ending = os.path.splitext(video_file_path)
    wav_path = stem + ".wav"
    srt_path = wav_path + ".srt"

    try:
        print(f'file enging is {file_ending}')
        print("starting conversion to wav")
        # ffmpeg cannot write onto its own input, so an input that already
        # has the target name is handed to whisper.cpp unchanged.
        if wav_path != video_file_path:
            # Argument lists avoid shell quoting problems with odd file names;
            # -y overwrites a WAV left over from an earlier run without prompting.
            subprocess.run(
                ['ffmpeg', '-y', '-i', video_file_path,
                 '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', wav_path],
                check=True,
            )
        print("conversion to wav ready")
        print("starting whisper c++")
        # Drop a stale transcript so an old result is never mistaken for a new one.
        try:
            os.remove(srt_path)
        except FileNotFoundError:
            pass
        subprocess.run(
            ['./whisper.cpp/main', wav_path,
             '-t', '4',
             '-l', language_code,
             '-m', f'./whisper.cpp/models/ggml-{whisper_model}.bin',
             '-osrt'],
            check=True,
        )
        print("starting whisper done with whisper")
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e

    try:
        return _srt_to_dataframe(srt_path)
    except Exception as e:
        raise RuntimeError("Error Running inference with local model", e) from e
def translate_transcriptions(df, selected_translation_lang_2):
    """Add a 'translation' column to a transcription table using DeepL.

    All subtitle texts are sent in a single request, one subtitle per line,
    and the translated lines are mapped back onto the rows.

    Args:
        df: DataFrame with a 'text' column (one subtitle per row). It is not
            modified; a new frame is returned.
        selected_translation_lang_2: A key of
            ``DeepL_language_codes_for_translation``; ``None`` means 'English'.

    Returns:
        A new DataFrame holding the former index as an 'index' column, the
        input columns and a 'translation' column. If the translated text
        cannot be aligned with the rows, the original texts are used.

    Raises:
        ValueError: The target language is not in the DeepL table.
        requests.HTTPError: DeepL answered with an error status.
    """
    if selected_translation_lang_2 is None:
        selected_translation_lang_2 = 'English'

    # reset_index() without inplace leaves the caller's frame untouched while
    # still producing the same 'index' column as before.
    df = df.reset_index()
    print("start_translation")

    if len(df) == 0:
        # Nothing to translate: skip the API call instead of sending empty text.
        df['translation'] = df['text']
        print("translations done")
        return df

    target_lang = DeepL_language_codes_for_translation.get(selected_translation_lang_2)
    if target_lang is None:
        raise ValueError(f"Unsupported translation language: {selected_translation_lang_2!r}")

    originals = [str(sentence) for sentence in df['text']]
    # Rows are matched to translated lines by position, so a subtitle that
    # itself spans several lines is flattened to one line first.
    text_combined = '\n'.join(' '.join(sentence.splitlines()) for sentence in originals)

    # NOTE(review): the former 'tag_spitting': 'xml' field was dropped because
    # it is not a DeepL parameter name; add 'tag_handling' if that was intended.
    data = {'text': text_combined, 'target_lang': target_lang}
    response = requests.post(
        'https://api-free.deepl.com/v2/translate',
        headers=headers,
        data=data,
        timeout=60,
    )
    # Fail with the HTTP status rather than an obscure JSON/KeyError later on.
    response.raise_for_status()
    translated_sentences = response.json()['translations'][0]['text'].split('\n')

    if len(translated_sentences) != len(df):
        # Assigning a list of the wrong length would raise; fall back to the
        # original transcriptions, as the UI text promises.
        print("translation line count mismatch, using original transcriptions")
        translated_sentences = originals

    df['translation'] = translated_sentences
    print("translations done")
    return df
def create_srt_and_burn(df, video_in):
    """Write the translations to ``testi.srt`` and burn them into the video.

    Args:
        df: DataFrame with 'start', 'end' and 'translation' columns, one
            subtitle per row.
        video_in: Path of the source video.

    Returns:
        Path of the output video: the input path with its extension replaced
        by ``_out.mp4``. The path is returned even when ffmpeg fails; the
        failure is printed, matching the previous best-effort behaviour.

    Raises:
        ValueError: ``video_in`` is ``None``.
    """
    import subprocess

    if video_in is None:
        raise ValueError("Error no video input")

    print("Starting creation of video wit srt")
    print("video in path is:")
    print(video_in)

    # One SRT entry per row: counter, time range, text. Entries are separated
    # by a blank line, with no trailing separator after the last one.
    entries = [
        f"{number}\n{start} --> {stop}\n{text}"
        for number, (start, stop, text) in enumerate(
            zip(df['start'], df['end'], df['translation']), start=1
        )
    ]
    srt_text = '\n\n'.join(entries)
    with open('./testi.srt', 'w', encoding="utf-8") as file:
        file.write(srt_text)
    print("SRT DONE")
    print(srt_text)

    video_in = str(video_in)
    # Derive the output from the real extension so non-mp4 inputs do not end
    # up with an output path identical to the input.
    stem, _ = os.path.splitext(video_in)
    video_out = f"{stem}_out.mp4"
    print("video_out_path")
    print(video_out)

    # Argument list instead of a shell string: no quoting problems.
    command = ['ffmpeg', '-i', video_in, '-y', '-vf', 'subtitles=./testi.srt', video_out]
    print(command)
    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        print(e)
    return video_out
# ---- Gradio Layout -----
# The components are created here, before the Blocks context exists, and are
# placed on the page later through their .render() calls.

# Video players: the source video and the subtitled result.
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_out = gr.Video(label="Video Out", mirror_webcam=False)

# Empty table used as the initial value of both dataframe components.
df_init = pd.DataFrame(columns=['start','end','text'])

# Dropdowns for spoken language, translation target and Whisper model.
selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)

# Tables showing the transcription and the transcription plus translation.
transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
transcription_and_translation_df = gr.DataFrame(value=df_init,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate')

# The Blocks app with its custom CSS.
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False
# Page layout and event wiring.
# NOTE(review): the nesting below was reconstructed from statement order
# because the source had lost its indentation -- confirm against the deployed app.
with demo:
    transcription_var = gr.Variable()  # not referenced again in the visible code

    # Introduction: what the space does, plus sample URLs.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
### This space allows you to:
##### 1. Download youtube video with a given URL
##### 2. Watch it in the first video component
##### 3. Run automatic speech recognition on the video using Whisper
##### 4. Translate the recognized transcriptions to 26 languages supported by deepL
##### 5. Burn the translations to the original video and watch the video in the 2nd video component
''')
        with gr.Column():
            gr.Markdown('''
### 1. Insert Youtube URL below. Some test videos below:
##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
''')

    # Step 1: the URL box; the button feeds get_youtube's result into video_in.
    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
            download_youtube_btn.click(get_youtube, [youtube_url_in], [
                video_in])
            print(video_in)

    # Step 2: source video next to the transcription controls; the button
    # sends speech_to_text's table to transcription_df.
    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
##### Here you can start the transcription and translation process.
##### Be aware that processing will last some time. With base model it is around 3x speed
##### Please select source language for better transcriptions. Using 'Let the model analyze' makes mistakes sometimes and may lead to bad transcriptions
''')
            selected_source_lang.render()
            selected_whisper_model.render()
            transcribe_btn = gr.Button("Step 2. Transcribe audio")
            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)

    # Transcription output table.
    with gr.Row():
        gr.Markdown('''
##### Here you will get transcription output
##### ''')
    with gr.Row():
        with gr.Column():
            transcription_df.render()

    # Step 3: translate the transcription table into the chosen language.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
##### PLEASE READ BELOW
##### Here you will can translate transcriptions to 26 languages.
##### If spoken language is not in the list, translation might not work. In this case original transcriptions are used
##### ''')
            selected_translation_lang_2.render()
            translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
            translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], transcription_and_translation_df)
            transcription_and_translation_df.render()

    # Step 4: burn the translated subtitles into the video and show the result.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
##### Now press the Step 4. Button to create output video with translated transcriptions
##### ''')
            translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
            print(video_in)
            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df,video_in], [
                video_out])
            video_out.render()

# Start the web app.
demo.launch()