import gradio as gr
import torch
import transformers
import librosa


class UltravoxInterface:
    def __init__(self):
        """Initialize with a smaller model footprint."""
        print("Initializing voice interface...")
        # Use the smaller Whisper checkpoint to keep memory usage down.
        self.model_name = "openai/whisper-small"
        self.pipe = transformers.pipeline(
            "automatic-speech-recognition",
            model=self.model_name,
            # float16 is poorly supported on CPU and raises errors for many
            # ops, so use float32 when running on CPU.
            torch_dtype=torch.float32,
            device="cpu",  # Explicitly run on CPU
        )
        print("Model loaded successfully!")

    def process_audio(self, audio_path):
        """Transcribe an audio file, chunking long recordings to limit memory use."""
        try:
            if audio_path is None:
                return "Please provide an audio input."

            # Resample to 16 kHz mono, the rate Whisper expects.
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)

            # Split long recordings into 30-second segments. Note that hard
            # cuts can split a word across a segment boundary.
            max_length = 30 * sr
            if len(audio) > max_length:
                segments = []
                for i in range(0, len(audio), max_length):
                    segment = audio[i:i + max_length]
                    result = self.pipe(
                        {"raw": segment, "sampling_rate": sr}, batch_size=1
                    )
                    segments.append(result["text"].strip())
                return " ".join(segments)

            # Short audio is transcribed in one pass.
            result = self.pipe({"raw": audio, "sampling_rate": sr}, batch_size=1)
            return result["text"].strip()

        except Exception as e:
            return f"Error processing audio: {e}"

    def create_interface(self):
        """Create and configure the Gradio interface."""
        return gr.Interface(
            fn=self.process_audio,
            inputs=[
                gr.Audio(
                    label="Speak here",
                    sources=["microphone"],
                    type="filepath",
                )
            ],
            outputs=[
                gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
            ],
            title="Voice Assistant",
            description="Speak into the microphone and get a text transcription!",
            theme=gr.themes.Soft(primary_hue="orange"),
        )


# Build the interface.
app = UltravoxInterface()
interface = app.create_interface()

# Launching is required for the app to serve on Hugging Face Spaces.
interface.launch()
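
# --- Alternative long-audio handling (sketch, not wired into the app) ---
# The transformers ASR pipeline can also chunk long inputs itself via the
# chunk_length_s argument, which strides overlapping windows instead of
# hard-cutting at 30-second boundaries as done above. A minimal sketch,
# assuming the same whisper-small checkpoint; untested here and kept as
# comments so it does not run alongside the live app:
#
#   pipe = transformers.pipeline(
#       "automatic-speech-recognition",
#       model="openai/whisper-small",
#       chunk_length_s=30,  # built-in chunking with overlapping windows
#       device="cpu",
#   )
#   text = pipe(audio_path)["text"]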