|
from transformers import pipeline |
|
import gradio as gr |
|
import os |
|
import pytube as pt |
|
import subprocess |
|
import sys |
|
|
|
# File name under which audio fetched from YouTube is saved before transcription.
youtube_file_path = "youtube_audio"

# Hugging Face pipeline, created once at import time and reused by every
# transcription request handled by this module.
pipe = pipeline(model="ArtificialCoincidence/check_points")
|
|
|
def transcribe(audio):
    """Run the module-level speech-recognition pipeline on ``audio``.

    Args:
        audio: Input forwarded unchanged to ``pipe``; the callers in this
            file pass the path of an audio file.

    Returns:
        The value stored under the ``"text"`` key of the pipeline output.
    """
    prediction = pipe(audio)
    return prediction["text"]
|
|
|
def _return_yt_html_embed(yt_url): |
|
video_id = yt_url.split("?v=")[-1] |
|
HTML_str = ( |
|
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>' |
|
" </center>" |
|
) |
|
return HTML_str |
|
|
|
def transcribe_youtube(yt_url):
    """Download the audio of a YouTube video and transcribe it.

    The audio is saved into a private temporary directory that is removed
    once transcription finishes, so simultaneous requests never overwrite
    each other's download and no files accumulate in the working directory.

    Args:
        yt_url: URL of the YouTube video to transcribe.

    Returns:
        The transcription text produced by ``transcribe``.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    import tempfile

    yt = pt.YouTube(yt_url)
    # first() yields None instead of raising IndexError when nothing matches.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream available for {yt_url!r}")

    with tempfile.TemporaryDirectory() as download_dir:
        # Use the path reported by pytube rather than guessing where the
        # file was written.
        audio_path = stream.download(
            output_path=download_dir, filename=youtube_file_path
        )
        return transcribe(audio_path)
|
|
|
def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
    """Convert ``video_file`` to an audio file by invoking ``ffmpeg``.

    The output is written next to the input, with the input's extension
    replaced by ``output_ext``. An existing file at that path is overwritten
    because ffmpeg is run with ``-y``.

    Args:
        video_file: Path of the media file to convert.
        output_ext: Extension, without the leading dot, of the file to
            produce.

    Returns:
        Path of the produced audio file.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status; its captured stderr is available on the exception.
    """
    stem, _ = os.path.splitext(video_file)
    audio_file = f"{stem}.{output_ext}"
    # check=True surfaces a failed conversion here instead of handing the
    # caller a path to a file that was never (re)written.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_file, audio_file],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        check=True,
    )
    return audio_file
|
|
|
def transcribe_video(video):
    """Transcribe a video file.

    The video is first converted to an audio file with
    ``convert_video_to_audio_ffmpeg`` (using its default output extension),
    and that audio file is then passed to ``transcribe``.

    Args:
        video: Path of the video file.

    Returns:
        The transcription text returned by ``transcribe``.
    """
    return transcribe(convert_video_to_audio_ffmpeg(video))
|
|
|
# Root container that hosts the tabbed demo.
iface = gr.Blocks()

# All three tabs emit plain text and display the same heading and blurb.
_COMMON_INTERFACE_KWARGS = dict(
    outputs="text",
    title="Whisper Chinese",
    description="Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model.",
)

# Tab 1: audio recorded from the microphone, handed over as a file path.
microphone_trans = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    **_COMMON_INTERFACE_KWARGS,
)

# Tab 2: a video URL pasted into a single-line text box.
youtube_trans = gr.Interface(
    fn=transcribe_youtube,
    inputs=gr.Textbox(lines=1, placeholder="Paste the URL to a video here", label="video url"),
    **_COMMON_INTERFACE_KWARGS,
)

# Tab 3: an uploaded video file.
video_trans = gr.Interface(
    fn=transcribe_video,
    inputs=gr.Video(sources="upload"),
    **_COMMON_INTERFACE_KWARGS,
)

# Tab label -> interface, in display order.
_TABS = {
    "Transcribe Microphone": microphone_trans,
    "Transcribe Youtube Video": youtube_trans,
    "Transcribe Video": video_trans,
}

with iface:
    gr.TabbedInterface(list(_TABS.values()), list(_TABS))

iface.launch(debug=True)