# Hugging Face Spaces page header (scrape residue) - status: Runtime error
import torch | |
import psutil | |
from pytube import YouTube | |
import time | |
import re | |
import pandas as pd | |
import pysrt | |
from pathlib import Path | |
import gradio as gr | |
import os | |
import requests | |
import json | |
import base64 | |
# --- Environment bootstrap (executed at import time) ---
# Clone and build whisper.cpp, then download the Thai ggml model into the
# working directory. The exit codes of these shell commands are not checked.
for _bootstrap_cmd in (
    'git clone https://github.com/ggerganov/whisper.cpp.git',
    'make -C ./whisper.cpp',
    'wget https://huggingface.co/datasets/tensorops/ggml-whisper-medium-th-combined/resolve/main/ggml-whisper-medium-th-combined.bin',
):
    os.system(_bootstrap_cmd)

# Publish the CPU count reported by psutil through OMP_NUM_THREADS.
num_cores = psutil.cpu_count()
os.environ["OMP_NUM_THREADS"] = str(num_cores)

# Decoding options; not referenced by the code visible in this file.
transcribe_options = {"beam_size": 3, "best_of": 3, "without_timestamps": False}

# Pick CUDA when torch reports it as available, otherwise CPU, and log it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("DEVICE IS: ")
print(device)

# Ensure the ./videos_out directory exists.
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)
def get_youtube(video_url):
    """Download the highest-resolution progressive MP4 stream of a YouTube video.

    Args:
        video_url: URL of the YouTube video to download.

    Returns:
        Path of the downloaded file, as returned by the stream's
        ``download()`` call.

    Raises:
        ValueError: If the video exposes no progressive MP4 stream.
    """
    yt = YouTube(video_url)
    mp4_streams = yt.streams.filter(progressive=True, file_extension='mp4')
    best_stream = mp4_streams.order_by('resolution').desc().first()
    if best_stream is None:
        # first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError(
            f"No progressive mp4 stream available for {video_url}")
    return best_stream.download()
def _format_cue_time(stamp, millis_sep="."):
    """Render a subtitle time as zero-padded ``HH:MM:SS<sep>mmm``."""
    return (
        f"{int(stamp.hours):02d}:{int(stamp.minutes):02d}:"
        f"{int(stamp.seconds):02d}{millis_sep}{int(stamp.milliseconds):03d}"
    )


def _render_cues(subs, millis_sep):
    """Join numbered cue blocks (index, timing line, text) with blank lines."""
    return "\n\n".join(
        f"{number}\n"
        f"{_format_cue_time(sub.start, millis_sep)} --> "
        f"{_format_cue_time(sub.end, millis_sep)}\n"
        f"{sub.text}"
        for number, sub in enumerate(subs, start=1)
    )


def speech_to_text(video_file_path):
    """Transcribe the Thai audio of a video with whisper.cpp.

    The video is converted to 16 kHz mono PCM WAV with ffmpeg, transcribed by
    the locally built ``./whisper.cpp/main`` binary using the
    ``ggml-whisper-medium-th-combined.bin`` model, and the resulting SRT is
    re-exported as ``subtitles.vtt``, ``subtitles.srt`` and ``subtitles.csv``
    in the current working directory.

    Args:
        video_file_path: Path of the input video file.

    Returns:
        A tuple ``(df, subtitle_files)``: a DataFrame with the columns
        ``start``, ``end`` and ``text`` (timestamps as ``HH:MM:SS.mmm``), and
        the list of the three subtitle file names written.

    Raises:
        ValueError: If ``video_file_path`` is None.
        RuntimeError: If the audio conversion, the whisper.cpp run, or the
            subtitle export fails.
    """
    # Local import: subprocess is not among the file-level imports.
    import subprocess

    if video_file_path is None:
        raise ValueError("Error no video input")
    video_file_path = str(video_file_path)
    print(video_file_path)

    # Swap only the final extension; str.replace would also rewrite earlier
    # occurrences and misbehaves when the extension is empty.
    stem, file_ending = os.path.splitext(video_file_path)
    print(f'file enging is {file_ending}')
    wav_path = stem + ".wav"
    if wav_path == video_file_path:
        # Input is already a .wav: never let ffmpeg write onto its own input.
        wav_path = stem + "_16k.wav"
    # Same "<wav path>.srt" location the original code read the output from.
    srt_path = wav_path + ".srt"

    try:
        print("starting conversion to wav")
        # Argument lists avoid shell quoting problems with spaces in titles;
        # -y overwrites a WAV left over from a previous run without prompting.
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_file_path, "-ar", "16000",
             "-ac", "1", "-c:a", "pcm_s16le", wav_path],
            check=True,
        )
        print("conversion to wav ready")
    except (OSError, subprocess.CalledProcessError) as err:
        raise RuntimeError("Error converting video to audio") from err

    try:
        print("starting whisper c++")
        # Drop a stale transcript so an old result is never picked up.
        try:
            os.remove(srt_path)
        except FileNotFoundError:
            pass
        subprocess.run(
            ["./whisper.cpp/main", wav_path, "-t", "4", "-l", "th",
             "-m", "./ggml-whisper-medium-th-combined.bin", "-osrt"],
            check=True,
        )
        print("starting whisper done with whisper")
    except (OSError, subprocess.CalledProcessError) as err:
        raise RuntimeError(
            "Error Running inference with local model", err) from err

    try:
        subs = pysrt.open(srt_path)
        # Build the frame in one go instead of concatenating row by row.
        df = pd.DataFrame(
            [(_format_cue_time(sub.start), _format_cue_time(sub.end), sub.text)
             for sub in subs],
            columns=['start', 'end', 'text'],
        )
        df.to_csv('subtitles.csv', index=False)
        print("Starting SRT-file creation")

        with open('subtitles.vtt', 'w', encoding="utf-8") as file:
            print("Starting WEBVTT-file creation")
            # The header line is followed by a blank line before the cues.
            file.write("WEBVTT\n\n")
            file.write(_render_cues(subs, "."))
        print("WEBVTT DONE")

        with open('subtitles.srt', 'w', encoding="utf-8") as file:
            print("Starting SRT-file creation")
            # SubRip separates seconds and milliseconds with a comma.
            file.write(_render_cues(subs, ","))
        print("SRT DONE")

        subtitle_files = ['subtitles.vtt', 'subtitles.srt', 'subtitles.csv']
        return df, subtitle_files
    except Exception as e:
        raise RuntimeError("Error Running inference with local model", e) from e
def burn_srt_to_video(srt_file, video_in):
    """Burn ``./subtitles.srt`` into a copy of ``video_in`` using ffmpeg.

    Args:
        srt_file: Not used; the subtitle file is always ``./subtitles.srt``.
            Kept so existing callers keep working.
        video_in: Path of the source video.

    Returns:
        Path of the output video, ``<stem>_out<ext>`` next to the input
        (``.mp4`` is used when the input has no extension). The path is
        returned even when ffmpeg fails; the failure is only printed.

    Raises:
        ValueError: If ``video_in`` is None.
    """
    # Local import: subprocess is not among the file-level imports.
    import subprocess

    print("Starting creation of video wit srt")
    if video_in is None:
        raise ValueError("Error no video input")

    source = str(video_in)
    # Derive the output from the final extension only, so it can never equal
    # the input (ffmpeg -y would otherwise overwrite the source file).
    stem, extension = os.path.splitext(source)
    video_out = f"{stem}_out{extension or '.mp4'}"
    print(video_in)
    print(video_out)
    try:
        subprocess.run(
            ["ffmpeg", "-i", source, "-y",
             "-vf", "subtitles=./subtitles.srt", video_out],
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # Best effort, as before: report the failure and still return the path.
        print(e)
    return video_out
def create_video_player(subtitle_files, video_in):
    """Build an HTML ``<video>`` snippet with video and subtitles inlined.

    Both the video file and ``./subtitles.vtt`` are read fully and embedded
    as base64 ``data:`` URIs.

    Args:
        subtitle_files: Not used by this function.
        video_in: Path of the video file to embed.

    Returns:
        The HTML markup as a string.
    """
    encoded = {}
    for key, path in (("video", video_in), ("subs", './subtitles.vtt')):
        with open(path, "rb") as handle:
            # Base64 output is pure ASCII, so decoding gives the bare payload.
            encoded[key] = base64.b64encode(handle.read()).decode("ascii")

    markup_lines = [
        '<video id="video" controls preload="metadata">',
        f'<source src="data:video/mp4;base64,{encoded["video"]}" type="video/mp4" />',
        '<track',
        'label="Thai"',
        'kind="subtitles"',
        'srclang="th"',
        f'src="data:text/vtt;base64,{encoded["subs"]}"',
        'default />',
        '</video>',
        '',
    ]
    return "\n".join(markup_lines)
# ---- Gradio Layout -----
# Components are constructed here and placed into the page further down via
# their .render() calls inside the Blocks context.
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
# NOTE(review): video_out is neither rendered nor wired to an event below.
video_out = gr.Video(label="Video Out", mirror_webcam=False)
df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(
    0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
# NOTE(review): transcription_and_translation_df is neither rendered nor
# wired to an event below.
transcription_and_translation_df = gr.DataFrame(
    value=df_init, label="Transcription and translation dataframe", max_rows=10, wrap=True, overflow_row_behaviour='paginate')
# Download widget that receives the subtitle file list from speech_to_text.
subtitle_files = gr.File(
    label="Download srt-file",
    file_count="multiple",
    type="file",
    interactive=False,
)
# Placeholder HTML, replaced by the output of create_video_player.
video_player = gr.HTML(
    '<p>video will be played here after you press the button at step 3')
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False
with demo:
    # NOTE(review): transcription_var is not referenced again in this section.
    transcription_var = gr.Variable()
    # Intro row: what the space does, plus sample URLs.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
### This space allows you to:
##### 1. Download youtube video with a given URL
##### 2. Watch it in the first video component
##### 3. Run automatic Thai speech recognition on the video using Whisper
##### 4. Burn the translations to the original video and watch the video in the 2nd video component
''')
        with gr.Column():
            gr.Markdown('''
### 1. Insert Youtube URL below. Some test videos below:
##### 1. https://www.youtube.com/watch?v=UIHPIESyIXM
##### 2. https://www.youtube.com/watch?v=YlfaFK7OFUo
''')
    # Step 1: the URL textbox feeds get_youtube; its result goes to video_in.
    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
            download_youtube_btn.click(get_youtube, [youtube_url_in], [
                video_in])
            print(video_in)
    # Step 2: video_in feeds speech_to_text; its two return values go to the
    # transcription table and the download widget.
    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
##### Here you can start the transcription process.
##### Be aware that processing will take some time.
''')
            transcribe_btn = gr.Button("Step 2. Transcribe audio")
            transcribe_btn.click(speech_to_text, [
                video_in], [transcription_df, subtitle_files])
    with gr.Row():
        gr.Markdown('''
##### Here you will get transcription output
##### ''')
    with gr.Row():
        with gr.Column():
            transcription_df.render()
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                '''##### From here, you can download the transcription output in different formats. ''')
            subtitle_files.render()
    # Step 3: create_video_player builds the HTML shown in video_player.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
##### Now press the Step 3. Button to create output video with translated transcriptions
##### ''')
            create_video_button = gr.Button(
                "Step 3. Create and add subtitles to video")
            print(video_in)
            create_video_button.click(create_video_player, [subtitle_files, video_in], [
                video_player])
            video_player.render()
# Starts the app as soon as the module is executed (no __main__ guard).
demo.launch()