import gradio as gr
import torch
import transformers
import librosa


class UltravoxInterface:
    def __init__(self):
        """Initialize with a smaller model footprint."""
        print("Initializing voice interface...")
        # Use the smaller Whisper checkpoint to keep memory usage down.
        self.model_name = "openai/whisper-small"
        self.pipe = transformers.pipeline(
            "automatic-speech-recognition",
            model=self.model_name,
            # float16 is poorly supported on CPU and raises errors for many
            # ops, so use float32 when running on CPU.
            torch_dtype=torch.float32,
            device="cpu",  # Explicitly run on CPU
        )
        print("Model loaded successfully!")

    def process_audio(self, audio_path):
        """Transcribe an audio file, chunking long recordings to limit memory use."""
        try:
            if audio_path is None:
                return "Please provide an audio input."

            # Resample to 16 kHz mono, the rate Whisper expects.
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)

            # Split long recordings into 30-second segments. Note that hard
            # cuts can split a word across a segment boundary.
            max_length = 30 * sr
            if len(audio) > max_length:
                segments = []
                for i in range(0, len(audio), max_length):
                    segment = audio[i:i + max_length]
                    result = self.pipe(
                        {"raw": segment, "sampling_rate": sr}, batch_size=1
                    )
                    segments.append(result["text"].strip())
                return " ".join(segments)

            # Short audio is transcribed in one pass.
            result = self.pipe({"raw": audio, "sampling_rate": sr}, batch_size=1)
            return result["text"].strip()

        except Exception as e:
            return f"Error processing audio: {e}"

    def create_interface(self):
        """Create and configure the Gradio interface."""
        return gr.Interface(
            fn=self.process_audio,
            inputs=[
                gr.Audio(
                    label="Speak here",
                    sources=["microphone"],
                    type="filepath",
                )
            ],
            outputs=[
                gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
            ],
            title="Voice Assistant",
            description="Speak into the microphone and get a text transcription!",
            theme=gr.themes.Soft(primary_hue="orange"),
        )


# Build the interface.
app = UltravoxInterface()
interface = app.create_interface()

# Launching is required for the app to serve on Hugging Face Spaces.
interface.launch()
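
# --- Alternative long-audio handling (sketch, not wired into the app) ---
# The transformers ASR pipeline can also chunk long inputs itself via the
# chunk_length_s argument, which strides overlapping windows instead of
# hard-cutting at 30-second boundaries as done above. A minimal sketch,
# assuming the same whisper-small checkpoint; untested here and kept as
# comments so it does not run alongside the live app:
#
#   pipe = transformers.pipeline(
#       "automatic-speech-recognition",
#       model="openai/whisper-small",
#       chunk_length_s=30,  # built-in chunking with overlapping windows
#       device="cpu",
#   )
#   text = pipe(audio_path)["text"]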