Spaces:

anzorq
/

w2v-bert-2.0-kbd

Paused

App Files Files Community

w2v-bert-2.0-kbd / app.py

anzorq

Update app.py

eaed2c2 verified 7 months ago

raw

history blame

2.78 kB

	import os
	import tempfile

	# Third-party imports are left in their original order on purpose:
	# `spaces` is kept ahead of `torch` (ZeroGPU expects it to be imported
	# before anything initialises CUDA -- confirm against the Spaces docs
	# before alphabetising).
	import spaces
	import gradio as gr
	import torch
	import torchaudio
	from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
	import yt_dlp

	# One checkpoint id supplies both the CTC model and its processor.
	MODEL_ID = "anzorq/w2v-bert-2.0-kbd"

	model = AutoModelForCTC.from_pretrained(MODEL_ID)
	processor = Wav2Vec2BertProcessor.from_pretrained(MODEL_ID)

	# Run on the GPU when torch can see one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# nn.Module.to() moves the parameters in place and returns the module.
	model = model.to(device)

	@spaces.GPU
	def transcribe_speech(audio):
	    """Transcribe one audio file with the module-level CTC model.

	    The file is loaded, resampled to 16 kHz, averaged down to a single
	    channel, peak-normalised, turned into model features by ``processor``
	    and greedily decoded from the model's logits.

	    Args:
	        audio: Path of an audio file readable by ``torchaudio.load``.
	            ``None`` is rejected up front (presumably what the UI sends
	            when Transcribe is pressed without a recording -- confirm).

	    Returns:
	        The decoded transcription, or an empty string when the file
	        contains no samples.

	    Raises:
	        gr.Error: If ``audio`` is ``None``.
	    """
	    if audio is None:
	        # Fail with a readable message instead of an opaque loader error.
	        raise gr.Error("No audio provided.")

	    target_sr = 16000

	    # Load the audio file
	    waveform, sr = torchaudio.load(audio)

	    # Resample the audio if needed
	    if sr != target_sr:
	        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
	        waveform = resampler(waveform)

	    # Convert to mono if needed (average over the leading dimension)
	    if waveform.dim() > 1:
	        waveform = torch.mean(waveform, dim=0)

	    # torch.max() raises on an empty tensor; nothing to transcribe anyway.
	    if waveform.numel() == 0:
	        return ""

	    # Normalize the audio. A silent clip has a peak of 0, and dividing by
	    # it would fill the waveform with NaNs, so it is left untouched.
	    peak = torch.max(torch.abs(waveform))
	    if peak > 0:
	        waveform = waveform / peak

	    # Extract input features
	    input_features = processor(waveform.unsqueeze(0), sampling_rate=target_sr).input_features
	    input_features = torch.from_numpy(input_features).to(device)

	    # Generate logits using the model (inference only, no gradients)
	    with torch.no_grad():
	        logits = model(input_features).logits

	    # Greedy decoding: best id per step for the first batch item, then text
	    pred_ids = torch.argmax(logits, dim=-1)[0]
	    return processor.decode(pred_ids)

	@spaces.GPU
	def transcribe_from_youtube(url):
	    """Download the audio track behind a URL with yt-dlp and transcribe it.

	    Args:
	        url: Address of the video; handed to yt-dlp unchanged.

	    Returns:
	        Whatever ``transcribe_speech`` returns for the extracted WAV file.
	    """
	    # Every call works in its own scratch directory. A fixed output path
	    # is shared by concurrent requests, and yt-dlp does not re-download
	    # when the target file already exists, so a later request could end
	    # up transcribing audio left behind by an earlier one. The directory
	    # (and the audio in it) is removed when the block exits.
	    with tempfile.TemporaryDirectory() as workdir:
	        ydl_opts = {
	            'format': 'bestaudio/best',
	            # %(ext)s keeps the raw download's real extension; the
	            # FFmpegExtractAudio step then writes audio.wav beside it.
	            'outtmpl': os.path.join(workdir, 'audio.%(ext)s'),
	            'postprocessors': [{
	                'key': 'FFmpegExtractAudio',
	                'preferredcodec': 'wav',
	                'preferredquality': '192',
	            }],
	            'postprocessor_args': ['-ar', '16000'],  # Ensure audio is at 16000 Hz
	            'prefer_ffmpeg': True,
	        }

	        # Download audio from YouTube using yt-dlp
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            ydl.download([url])

	        # Transcribe while the temporary directory still exists
	        audio_path = os.path.join(workdir, 'audio.wav')
	        return transcribe_speech(audio_path)

	# Two-tab interface: one tab transcribes a microphone recording, the
	# other transcribes the audio of a video given by URL.
	with gr.Blocks() as demo:
	    with gr.Tab("Microphone Input"):
	        gr.Markdown("## Transcribe speech from microphone")
	        # NOTE(review): newer Gradio releases appear to take
	        # `sources=["microphone"]` instead of `source=` -- confirm against
	        # the Gradio version this Space pins before changing it.
	        mic_audio = gr.Audio(
	            source="microphone",
	            type="filepath",
	            label="Speak into your microphone",
	        )
	        mic_button = gr.Button("Transcribe")
	        mic_output = gr.Textbox(label="Transcription")

	        # The recording's file path goes in, the transcription comes out.
	        mic_button.click(
	            fn=transcribe_speech,
	            inputs=mic_audio,
	            outputs=mic_output,
	        )

	    with gr.Tab("YouTube URL"):
	        gr.Markdown("## Transcribe speech from YouTube video")
	        youtube_url = gr.Textbox(label="Enter YouTube video URL")
	        youtube_button = gr.Button("Transcribe")
	        youtube_output = gr.Textbox(label="Transcription")

	        # The URL text goes in, the transcription comes out.
	        youtube_button.click(
	            fn=transcribe_from_youtube,
	            inputs=youtube_url,
	            outputs=youtube_output,
	        )

	# Start serving the interface.
	demo.launch()