import os
import time
from pathlib import Path

import psutil

# Let OpenMP-backed ops use all available CPU cores. This must happen before
# torch (pulled in by whisper) is imported, or the setting has no effect.
num_cores = psutil.cpu_count()
os.environ["OMP_NUM_THREADS"] = str(num_cores)

import gradio as gr
import pandas as pd
import torch
import whisper
from pytube import YouTube
from easynmt import EasyNMT

# EasyNMT M2M-100 (418M) multilingual translation model; the token limits keep
# per-segment translation latency bounded.
translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60, max_length=60)

# Whisper "base" model for speech recognition. Decoding options are set per
# call inside speech_to_text().
asr_model = whisper.load_model("base")
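
# For reference, a minimal sketch of the structure asr_model.transcribe()
# returns (openai-whisper): the full text plus timed segments, e.g.
#   {"text": "...",
#    "segments": [{"id": 0, "start": 0.0, "end": 3.2, "text": "..."}, ...],
#    "language": "en"}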

# Maps the UI language names to M2M-100 target-language codes used by EasyNMT.
translation_models = {
    "Finnish": "fi",
    "Swedish": "sv",
    "Danish": "da",
    "English": "en",
    "German": "de"
}
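
# Usage sketch of the EasyNMT API with the codes above:
#   translation_model.translate(["Hello world"], target_lang=translation_models["Finnish"])
# returns a list with one translated sentence per input sentence.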

# Informational only: whisper.load_model() already picks CUDA automatically
# when it is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"DEVICE IS: {device}")

videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)

def get_youtube(video_url):
    """Download the highest-resolution progressive (audio+video) mp4 stream of a YouTube video."""
    yt = YouTube(video_url)
    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
    print(f"Downloaded video to {abs_video_path}")
    return abs_video_path
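
# Usage sketch (the URL is a hypothetical placeholder):
#   local_mp4_path = get_youtube("https://www.youtube.com/watch?v=<video_id>")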

async def speech_to_text(video_file_path, selected_translation_lang):
    """
    Run automatic speech recognition on the video's audio track with OpenAI
    Whisper (https://github.com/openai/whisper) and translate the recognized
    segments to the selected language. Currently supports only English audio.

    Returns a DataFrame with columns: start, end, text, translation.
    """
    if video_file_path is None:
        raise ValueError("Error: no video input")
    print(video_file_path)

    try:
        audio = whisper.load_audio(video_file_path)
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e

    last_time = time.time()
    try:
        print('Transcribing via local model')
        transcribe_options = dict(beam_size=5, best_of=5, without_timestamps=False)
        transcription = asr_model.transcribe(audio, **transcribe_options)
        print(f'Transcription took {time.time() - last_time:.1f} s')

        # Build the DataFrame in one go; DataFrame.append is deprecated and
        # was removed in pandas 2.0.
        rows = [{'start': segment['start'],
                 'end': segment['end'],
                 'text': segment['text']}
                for segment in transcription['segments']]
        df = pd.DataFrame(rows, columns=['start', 'end', 'text'])

        if selected_translation_lang is None:
            selected_translation_lang = 'Finnish'

        sentences = df['text'].tolist()
        df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang))
        print('After translation to target language\n')
        return df
    except Exception as e:
        raise RuntimeError("Error running inference with local model") from e

def create_srt_and_burn(df, video_in):
    print("Starting creation of video with srt")

    with open('testi.srt', 'w', encoding="utf-8") as file:
        for i in range(len(df)):
            # Sequence number, "start --> end" timestamp line, then the text.
            file.write(str(i + 1))
            file.write('\n')
            file.write(format_srt_timestamp(df.iloc[i]['start']))
            file.write(' --> ')
            file.write(format_srt_timestamp(df.iloc[i]['end']))
            file.write('\n')
            file.write(df.iloc[i]['translation'])
            if i != len(df) - 1:
                file.write('\n\n')

    print("SRT DONE")

    try:
        # Echo the generated SRT file for debugging.
        with open('./testi.srt', 'r', encoding="utf-8") as srt_file:
            print(srt_file.read())

        print(video_in)
        video_out = video_in.replace('.mp4', '_out.mp4')
        print(video_out)
        command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
        print(command)
        os.system(command)
        return video_out
    except Exception as e:
        # Re-raise instead of returning: video_out may not be defined yet if
        # the failure happened before the ffmpeg step.
        raise RuntimeError("Error burning subtitles into video") from e

# ---- Gradio Layout -----
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_out = gr.Video(label="Video Out", mirror_webcam=False)
df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
selected_translation_lang = gr.Dropdown(choices=["English", "German", "Finnish", "Swedish", "Danish"], type="value", value="English", label="Language to translate transcriptions to", interactive=True)
transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10)

demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False

with demo:
    transcription_var = gr.Variable()
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
                ### This space allows you to:
                ##### 1. Download a YouTube video from a given URL
                ##### 2. Watch it in the first video component
                ##### 3. Run automatic speech recognition on the video using Whisper (please remember to select a translation language)
                ##### 4. Translate the recognized transcriptions to English, Finnish, Swedish, Danish or German
                ##### 5. Burn the translations into the original video and watch the result in the second video component
                ''')
        with gr.Column():
            gr.Markdown('''
                ### 1. Insert a YouTube URL below (a few suggested examples for first tests)
                ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
                ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
                ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
                ''')

    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])

    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
                ##### Here you can start the transcription and translation process.
                ##### Be aware that processing will take a while (a 35-second video took around 20 seconds in my testing)
                ''')
            transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")
            transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)

    with gr.Row():
        with gr.Column():
            selected_translation_lang.render()

    with gr.Row():
        gr.Markdown('''
            ##### Here you will get the transcription and translation output
            ##### If you see an error, please remember to select a translation language
            ''')

    with gr.Row():
        with gr.Column():
            transcription_df.render()

    with gr.Row():
        with gr.Column():
            translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df, video_in], [video_out])
            video_out.render()

if __name__ == "__main__":
    demo.launch(debug=True)