# Spaces: Paused
import os
import tempfile

# NOTE(review): `spaces` is kept ahead of torch, as in the original order;
# presumably this matters on ZeroGPU hardware — confirm before reordering.
import spaces
import gradio as gr
import torch
import torchaudio
from pytube import YouTube
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from transformers import pipeline
# Transformers pipeline shared by every transcription request. It runs on the
# first CUDA device when one is visible and falls back to the CPU (-1)
# otherwise, so the app still starts on hardware without a GPU.
pipe = pipeline(
    model="anzorq/w2v-bert-2.0-kbd",
    device=0 if torch.cuda.is_available() else -1,
)
def transcribe_speech(audio):
    """Transcribe an audio clip with the module-level ``pipe``.

    Args:
        audio: Audio input handed to the pipeline (the UI passes a file
            path), or ``None`` when no audio was provided.

    Returns:
        The ``'text'`` field of the pipeline result, or the message
        ``"No audio received."`` when ``audio`` is ``None``.
    """
    if audio is not None:
        # The pipeline is asked to work through the audio in 10-second chunks.
        result = pipe(audio, chunk_length_s=10)
        return result['text']
    # The microphone input can arrive as None; answer with a message instead
    # of passing it on to the pipeline.
    return "No audio received."
def transcribe_from_youtube(url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        url: Address of the YouTube video, as entered in the URL textbox.

    Returns:
        The transcription text, or a short message when no URL was given
        or the video offers no audio-only stream.
    """
    if not url or not url.strip():
        return "No URL received."

    yt = YouTube(url.strip())
    # first() yields None for an empty query, where indexing with [0] would
    # raise IndexError.
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        return "No audio stream found for this video."

    # A per-request temporary directory keeps overlapping requests from
    # overwriting each other's download, and the file is removed on exit
    # even when transcription raises.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = audio_stream.download(output_path=workdir, filename="tmp.mp4")
        return transcribe_speech(audio_path)
def populate_metadata(url):
    """Look up the thumbnail and title for the video preview.

    Args:
        url: Current contents of the YouTube URL textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when the
        textbox is empty so that no preview is shown.
    """
    # This handler is wired to the textbox's change event, so it also runs
    # when the box is cleared; skip the lookup rather than hand pytube an
    # empty URL.
    if not url or not url.strip():
        return None, None
    yt = YouTube(url.strip())
    return yt.thumbnail_url, yt.title
# Two-tab interface: one tab transcribes a microphone recording, the other
# transcribes the audio of a YouTube video.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Centered page header.
    gr.HTML(
        """
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
<div>
<h1>Youtube Speech Transcription</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Speech to text transcription of Youtube videos using Wav2Vec2-BERT
</p>
</div>
"""
    )

    # Tab 1: record from the microphone, then transcribe on button press.
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(
            sources="microphone",
            type="filepath",
            label="Speak into your microphone",
        )
        mic_button = gr.Button("Transcribe")
        mic_output = gr.Textbox(label="Transcription")
        mic_button.click(
            fn=transcribe_speech,
            inputs=mic_audio,
            outputs=mic_output,
        )

    # Tab 2: paste a YouTube URL, preview its title and thumbnail, then
    # transcribe on button press.
    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        title = gr.Label(label="Video Title")
        img = gr.Image(label="Thumbnail", height=120, width=120)
        yt_button = gr.Button("Transcribe")
        yt_output = gr.Textbox(
            label="Transcription",
            placeholder="Transcription Output",
            lines=10,
        )
        yt_button.click(
            fn=transcribe_from_youtube,
            inputs=youtube_url,
            outputs=yt_output,
        )
        # Refresh the preview whenever the URL text changes.
        youtube_url.change(
            populate_metadata,
            inputs=[youtube_url],
            outputs=[img, title],
        )

demo.launch()