import os

# Fetch and build whisper.cpp, then download the ggml models used for transcription.
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
os.system('make -C ./whisper.cpp')

MODELS_TO_DOWNLOAD = ['small', 'base', 'tiny']

for model_name in MODELS_TO_DOWNLOAD:
    os.system(f'bash ./whisper.cpp/models/download-ggml-model.sh {model_name}')
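# download-ggml-model.sh stores each model as ./whisper.cpp/models/ggml-<name>.bin,
# which is the path the transcription command in speech_to_text() points at via -m.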
|
import base64

import gradio as gr
import torch
from pytube import YouTube
|
|
|
INTRO_MSG = '''
#### For many less widely spoken languages it is hard to find learning materials,
especially well-dubbed videos (target-language video with target-language subtitles).
This tool transcribes your videos and adds subtitles in the same language.
At least for me this is a nice way to practice both listening and reading skills.

Speech recognition is based on models from OpenAI Whisper - https://github.com/openai/whisper

This space uses the C++ implementation from https://github.com/ggerganov/whisper.cpp
'''
|
|
|
whisper_models = MODELS_TO_DOWNLOAD

custom_models = []
combined_models = whisper_models + custom_models

# Language code -> name; extend this dict to support more languages.
LANGUAGES = {
    "bg": "Bulgarian",
}

# Invert to name -> code: the dropdown shows names, whisper.cpp's -l flag wants codes.
source_languages = {name: code for code, name in LANGUAGES.items()}
source_language_list = list(source_languages.keys())
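# For example, source_languages["Bulgarian"] == "bg", which is the value passed to
# whisper.cpp via -l in speech_to_text() below.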
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"DEVICE IS: {device}")
|
|
|
def get_youtube(video_url):
    """Download the highest-resolution progressive mp4 stream of a YouTube video."""
    yt = YouTube(video_url)
    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
    print(f"Download complete - {abs_video_path}")
    return abs_video_path
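# Note: progressive streams carry audio and video in a single file, which avoids a
# separate muxing step; on YouTube they are generally capped at 720p.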
|
|
|
def speech_to_text(video_file_path, selected_source_lang, whisper_model):
    """
    Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
    This space uses the C++ implementation by https://github.com/ggerganov/whisper.cpp
    """
    if video_file_path is None:
        raise ValueError("Error: no video input")

    print(video_file_path)
    _, file_ending = os.path.splitext(f'{video_file_path}')
    input_wav_file = video_file_path.replace(file_ending, ".wav")
    # whisper.cpp writes the subtitles next to the wav as <wav>.srt / <wav>.vtt.
    srt_path = input_wav_file + ".srt"
    vtt_path = input_wav_file + ".vtt"

    try:
        print(f'file ending is {file_ending}, starting conversion to wav')
        if os.path.exists(input_wav_file):
            os.remove(input_wav_file)

        # whisper.cpp expects 16 kHz mono 16-bit PCM input.
        os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{input_wav_file}"')
        print("conversion to wav ready")
    except Exception as e:
        raise RuntimeError("Error converting video to wav") from e

    try:
        print("starting whisper c++")
        os.system(f'rm -f {srt_path}')
        print('Running regular model')
        os.system(f'./whisper.cpp/main "{input_wav_file}" -t {os.cpu_count()} -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt -ovtt')
        print("whisper c++ finished")
    except Exception as e:
        raise RuntimeError("Error running whisper.cpp model") from e

    print(f'Subtitles path {srt_path}, {vtt_path}')
    return [vtt_path, srt_path]
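# The underlying whisper.cpp call looks roughly like this (paths assume the build above):
#   ./whisper.cpp/main "movie.wav" -t 4 -l bg -m ./whisper.cpp/models/ggml-base.bin -osrt -ovtt
# -t sets the thread count, -l the spoken language, and -osrt/-ovtt write
# movie.wav.srt and movie.wav.vtt next to the input file.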
|
|
|
def create_video_player(subs_files, video_in):
    """Build an HTML5 player with the video and vtt subtitles embedded as base64 data URIs."""
    print(f"create_video_player - {subs_files}, {video_in}")

    with open(subs_files[0], "rb") as file:
        subtitle_base64 = base64.b64encode(file.read()).decode()

    with open(video_in, "rb") as file:
        video_base64 = base64.b64encode(file.read()).decode()

    video_player = f'''<video id="video" controls preload="metadata">
        <source src="data:video/mp4;base64,{video_base64}" type="video/mp4" />
        <track
            label="English"
            kind="subtitles"
            srclang="en"
            src="data:text/vtt;base64,{subtitle_base64}"
            default />
    </video>
    '''

    print('create_video_player - Done')
    return video_player
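# Note: base64 data URIs inflate the payload by roughly a third, so embedding the
# whole mp4 this way is only practical for fairly short videos.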
|
|
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_out = gr.Video(label="Video Out", mirror_webcam=False)

selected_source_lang = gr.Dropdown(choices=source_language_list,
                                   type="value",
                                   value=source_language_list[0],
                                   label="Spoken language in video",
                                   interactive=True)
selected_whisper_model = gr.Dropdown(choices=whisper_models,
                                     type="value",
                                     value=whisper_models[0],
                                     label="Selected Whisper model",
                                     interactive=True)

subtitle_files = gr.File(
    label="Download subtitles",
    file_count="multiple",
    type="file",
    interactive=False,
)

video_player = gr.HTML('<p>Video will be played here</p>')
eventslider = gr.Slider(visible=False)
status_msg = gr.Markdown('Status')

demo = gr.Blocks()
demo.encrypt = False  # legacy flag kept from older Gradio versions
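# The components above are created eagerly and placed into the page later with
# .render() calls inside the Blocks layout, so the click handler can reference
# them before the layout is declared.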
|
|
|
def set_app_msg(app_state, msg):
    app_state['status_msg'] = msg


def transcribe(app_state, youtube_url_in, selected_source_lang, selected_whisper_model):
    set_app_msg(app_state, 'Downloading the movie ...')
    video_file_path = get_youtube(youtube_url_in)
    set_app_msg(app_state, f'Running the speech to text model {selected_source_lang}/{selected_whisper_model}. This can take some time.')
    subtitle_files = speech_to_text(video_file_path, selected_source_lang, selected_whisper_model)
    set_app_msg(app_state, 'Transcribing done, creating the video player ...')
    video_player = create_video_player(subtitle_files, video_file_path)
    set_app_msg(app_state, 'Done.')
    return subtitle_files, video_player


def on_change_event(app_state):
    print('Running!')
    return app_state['status_msg']
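# Status reporting works by side effect: transcribe() writes messages into the shared
# app_state dict while it runs, and demo.load(..., every=10) at the bottom of the
# layout polls on_change_event to push the latest message into status_msg.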
|
|
|
with demo:
    app_state = gr.State({
        'running': False,
        'status_msg': ''
    })

    with gr.Row():
        with gr.Column():
            gr.Markdown(INTRO_MSG)
            gr.Markdown('''### Copy any non-private Youtube video URL into the box below or click one of the examples.''')
            examples = gr.Examples(examples=["https://www.youtube.com/watch?v=UjAn3Pza3qo", "https://www.youtube.com/watch?v=oOZivhYfPD4"],
                                   label="Examples", inputs=[youtube_url_in])

    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            selected_source_lang.render()
            selected_whisper_model.render()

            download_youtube_btn = gr.Button("Transcribe the video")
            download_youtube_btn.click(transcribe, [app_state, youtube_url_in, selected_source_lang, selected_whisper_model], [subtitle_files, video_player])

            eventslider.render()
            status_msg.render()
            subtitle_files.render()
            video_player.render()

    with gr.Row():
        gr.Markdown('This app is based on [this code](https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles/tree/main) by RASMUS.')

    dep = demo.load(on_change_event, inputs=[app_state], outputs=[status_msg], every=10)
|
|
is_kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE')
print(f'KAGGLE_KERNEL_RUN_TYPE: {is_kaggle}')

if is_kaggle:
    # On Kaggle a share link is needed to reach the app from the browser.
    demo.queue().launch(share=True, debug=True)
else:
    demo.queue().launch()
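# debug=True keeps the cell blocking and streams server logs, which makes it easier
# to follow progress when running inside a notebook environment like Kaggle.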
|
|