Spaces:

harshananddev
/

speech-to-text

Sleeping

speech-to-text / app.py

Update app.py

fb2299d verified 12 months ago

1.62 kB

	import gradio as gr
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
	import torch
	import torchaudio

	# Checkpoint identifier shared by the processor and the acoustic model.
	model_name = "facebook/wav2vec2-base-960h"
	# Both objects are built once at module level and reused by speech_to_text.
	processor = Wav2Vec2Processor.from_pretrained(pretrained_model_name_or_path=model_name)
	model = Wav2Vec2ForCTC.from_pretrained(pretrained_model_name_or_path=model_name)

	def speech_to_text(audio):
	    """Transcribe the audio file at ``audio`` using the module-level model.

	    Args:
	        audio: Path to an audio file readable by ``torchaudio.load``, or
	            ``None`` when no audio was supplied.

	    Returns:
	        The decoded transcription as a string. When there is no audio, the
	        file holds no samples, or processing fails, a human-readable message
	        is returned instead. Failures are reported as ``"Error: ..."`` so
	        they cannot be mistaken for transcribed speech.
	    """
	    # Sampling rate handed to the processor; audio is resampled to match it.
	    target_rate = 16000

	    if audio is None:
	        return "No audio file provided."

	    try:
	        waveform, rate = torchaudio.load(audio)

	        # An empty clip would otherwise surface as an obscure model error.
	        if waveform.numel() == 0:
	            return "The audio file contains no samples."

	        # Average multi-channel audio down to a single channel.
	        if waveform.shape[0] > 1:
	            waveform = torch.mean(waveform, dim=0, keepdim=True)

	        # Only build and run a resampler when the rate actually differs.
	        if rate != target_rate:
	            resampler = torchaudio.transforms.Resample(
	                orig_freq=rate, new_freq=target_rate
	            )
	            waveform = resampler(waveform)

	        # Drop only the leading channel axis: a bare squeeze() would also
	        # collapse a one-sample clip into a 0-d array.
	        samples = waveform.squeeze(0).numpy()
	        inputs = processor(
	            samples, return_tensors="pt", sampling_rate=target_rate
	        )

	        # Inference only; no gradients are needed.
	        with torch.no_grad():
	            logits = model(**inputs).logits

	        # Pick the highest-scoring token per frame and decode to text.
	        predicted_ids = torch.argmax(logits, dim=-1)
	        return processor.batch_decode(predicted_ids)[0]
	    except Exception as e:
	        # UI boundary: report the failure in the output box rather than
	        # crash, but label it (and include the exception type, since the
	        # message itself may be empty).
	        return f"Error: {type(e).__name__}: {e}"

	# Build the UI components first, then wire them to the transcription function.
	audio_input = gr.Audio(type="filepath", label="Input Audio")
	transcription_output = gr.Textbox(label="Transcription")
	iface = gr.Interface(
	    fn=speech_to_text,
	    inputs=audio_input,
	    outputs=transcription_output,
	    title="Speech to Text",
	    description="Speak into your microphone and get the transcribed text.",
	    live=True,
	)

	# Start serving the app.
	iface.launch()