whisper_chinese / app.py
ArtificialCoincidence's picture
Update app.py
8f0cdde
import html
import os
import subprocess
import sys
import tempfile
from urllib.parse import parse_qs, urlparse

import gradio as gr
import pytube as pt
from transformers import pipeline
# File name handed to pytube when saving audio downloaded from YouTube.
youtube_file_path = "youtube_audio"
# Hugging Face pipeline built once at import time; transcribe() calls it and
# reads the "text" entry of its result.
pipe = pipeline(model="ArtificialCoincidence/check_points") # change to "your-username/the-name-you-picked"
def transcribe(audio):
    """Run the module-level ``pipe`` on an audio input and return its text.

    Args:
        audio: Value forwarded unchanged to ``pipe``; the callers in this
            file pass a file path. ``None`` means there is nothing to
            transcribe.

    Returns:
        The ``"text"`` entry of the pipeline result, or ``""`` when
        ``audio`` is ``None``.
    """
    # NOTE(review): Gradio is expected to pass None when the user submits
    # without recording anything; forwarding that to the pipeline would raise.
    if audio is None:
        return ""
    return pipe(audio)["text"]
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def transcribe_youtube(yt_url):
    """Download the first audio-only stream of a YouTube video and transcribe it.

    Args:
        yt_url: URL string of the video. An empty or whitespace-only value
            (or ``None``) means there is nothing to transcribe.

    Returns:
        The text returned by ``transcribe`` for the downloaded audio, or
        ``""`` when no URL was supplied.

    Raises:
        IndexError: If the video exposes no audio-only stream.
    """
    if not yt_url or not yt_url.strip():
        return ""

    yt = pt.YouTube(yt_url.strip())
    stream = yt.streams.filter(only_audio=True)[0]

    # Each call downloads into its own temporary directory, so overlapping
    # requests cannot overwrite each other's audio and nothing is left
    # behind in the working directory afterwards.
    with tempfile.TemporaryDirectory() as download_dir:
        audio_path = stream.download(output_path=download_dir, filename=youtube_file_path)
        return transcribe(audio_path)
def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
    """Convert a media file to audio by invoking the ``ffmpeg`` executable.

    The output is written next to the input, with the input's extension
    replaced by ``output_ext``; an existing output file is overwritten
    (``-y``).

    Args:
        video_file: Path of the input file.
        output_ext: Extension (without the dot) of the file to produce.

    Returns:
        Path of the converted audio file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status; its ``stderr`` attribute holds ffmpeg's diagnostics.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    stem, _ = os.path.splitext(video_file)
    output_file = f"{stem}.{output_ext}"
    # check=True: a failed conversion must not hand back a path to a missing
    # (or stale, from an earlier run) file. stderr is captured rather than
    # discarded so the failure reason travels with the exception.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_file, output_file],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        check=True,
    )
    return output_file
def transcribe_video(video):
    """Extract the audio of a video file and transcribe it.

    Args:
        video: Path of the video file, or ``None`` when there is nothing to
            transcribe.

    Returns:
        The text returned by ``transcribe`` for the extracted audio, or
        ``""`` when ``video`` is ``None``.
    """
    # NOTE(review): Gradio is expected to pass None when the user submits
    # without uploading a file; os.path.splitext would raise on it.
    if video is None:
        return ""

    audio_filename = convert_video_to_audio_ffmpeg(video)
    try:
        return transcribe(audio_filename)
    finally:
        # Best-effort removal of the intermediate audio file; never delete
        # the input itself if both paths happen to coincide.
        if audio_filename != video:
            try:
                os.remove(audio_filename)
            except OSError:
                pass
# Title and description shown on every tab.
_TITLE = "Whisper Chinese"
_DESCRIPTION = "Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model."

# Outer container that hosts the tabbed view.
iface = gr.Blocks()

# Tab 1: transcribe audio recorded from the microphone.
microphone_trans = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title=_TITLE,
    description=_DESCRIPTION,
)

# Tab 2: transcribe the audio of a video given by URL.
youtube_trans = gr.Interface(
    fn=transcribe_youtube,
    inputs=gr.Textbox(lines=1, placeholder="Paste the URL to a video here", label="video url"),
    outputs="text",
    title=_TITLE,
    description=_DESCRIPTION,
)

# Tab 3: transcribe an uploaded video file.
video_trans = gr.Interface(
    fn=transcribe_video,
    inputs=gr.Video(sources="upload"),
    outputs="text",
    title=_TITLE,
    description=_DESCRIPTION,
)

# Each interface is kept next to the label of its tab.
_TABS = (
    (microphone_trans, "Transcribe Microphone"),
    (youtube_trans, "Transcribe Youtube Video"),
    (video_trans, "Transcribe Video"),
)

with iface:
    gr.TabbedInterface(
        [interface for interface, _ in _TABS],
        [label for _, label in _TABS],
    )

iface.launch(debug=True)