# Voice_Assistant/app.py
import gradio as gr
import torch
import transformers
import librosa


class UltravoxInterface:
    def __init__(self):
        """Initialize the ASR pipeline with a small model footprint."""
        print("Initializing voice interface...")
        # Use the smaller Whisper checkpoint to fit CPU-only Spaces hardware
        self.model_name = "openai/whisper-small"
        self.pipe = transformers.pipeline(
            "automatic-speech-recognition",
            model=self.model_name,
            torch_dtype=torch.float32,  # float16 is poorly supported on CPU
            device="cpu",  # explicitly run on CPU
        )
        print("Model loaded successfully!")
    def process_audio(self, audio_path):
        """Transcribe a recorded audio file with modest memory usage."""
        try:
            if audio_path is None:
                return "Please provide an audio input."
            # Load and resample to 16 kHz mono, the rate Whisper expects
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
            # Transcribe long recordings in 30-second segments to bound memory
            max_length = 30 * sr
            if len(audio) > max_length:
                segments = []
                for i in range(0, len(audio), max_length):
                    segment = audio[i:i + max_length]
                    result = self.pipe(
                        {"raw": segment, "sampling_rate": sr}, batch_size=1
                    )
                    segments.append(result["text"])
                return " ".join(segments)
            # Shorter audio is transcribed in a single pass
            result = self.pipe({"raw": audio, "sampling_rate": sr}, batch_size=1)
            return result["text"]
        except Exception as e:
            return f"Error processing audio: {str(e)}"
    def create_interface(self):
        """Create and configure the Gradio interface."""
        interface = gr.Interface(
            fn=self.process_audio,
            inputs=[
                gr.Audio(
                    label="Speak here",
                    sources=["microphone"],
                    type="filepath",
                )
            ],
            outputs=[
                gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
            ],
            title="Voice Assistant",
            description="Speak into the microphone and get a text transcription!",
            theme=gr.themes.Soft(primary_hue="orange"),
        )
        return interface
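# A minimal sketch of exercising the transcriber without the UI, assuming a
# local recording at the hypothetical path "sample.wav":
#
#   demo = UltravoxInterface()
#   print(demo.process_audio("sample.wav"))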
# Create the interface
app = UltravoxInterface()
interface = app.create_interface()
# Launch the interface - this is crucial for Hugging Face Spaces
interface.launch()
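# On CPU-only hardware, concurrent requests can exhaust memory; Gradio's
# request queue can serialize them instead. A hedged sketch using the
# Blocks.queue() API:
#
#   interface.queue(max_size=8).launch()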