import torch
import librosa
import noisereduce as nr
import numpy as np
from transformers import (
    pipeline,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoTokenizer,
)


class AudioSpeechNERPipeline:
    def __init__(
        self,
        stt_model_name='abduaziz/whisper-small-uzbek',
        ner_model_name='abduaziz/roberta-ner-uzbek',
        stt_language='uz',
        chunk_duration=30,
    ):
        # Use lazy loading for pipelines: models are only fetched on first use
        self.stt_pipeline = None
        self.ner_pipeline = None
        self.stt_model_name = stt_model_name
        self.ner_model_name = ner_model_name
        self.stt_language = stt_language
        self.chunk_duration = chunk_duration

    def load_whisper_model(self, model_name='abduaziz/whisper-small-uzbek'):
        try:
            # Load the processor from the base checkpoint so the language/task
            # prompt tokens are configured for Uzbek transcription
            processor = WhisperProcessor.from_pretrained(
                "openai/whisper-small", language="Uzbek", task="transcribe"
            )
            # Load the fine-tuned model weights
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            return model, processor
        except Exception as e:
            print(f"Error loading Whisper model: {e}")
            raise

    def _load_pipelines(self):
        """Lazily load the STT and NER pipelines on first use."""
        if self.stt_pipeline is None:
            # Load Whisper model and processor explicitly, then hand the
            # pipeline its components; the tokenizer comes from the
            # fine-tuned checkpoint rather than a hard-coded name
            model, processor = self.load_whisper_model(self.stt_model_name)
            tokenizer = AutoTokenizer.from_pretrained(self.stt_model_name)
            self.stt_pipeline = pipeline(
                "automatic-speech-recognition",
                model=model,
                feature_extractor=processor.feature_extractor,
                tokenizer=tokenizer,
                return_timestamps=True,
            )
        if self.ner_pipeline is None:
            self.ner_pipeline = pipeline(task="ner", model=self.ner_model_name)

    def chunk_audio(self, audio, sample_rate):
        """Split audio into fixed-length chunks the STT pipeline can consume."""
        chunk_samples = self.chunk_duration * sample_rate
        return [
            {'array': audio[start:start + chunk_samples], 'sampling_rate': sample_rate}
            for start in range(0, len(audio), chunk_samples)
        ]

    def transcribe_audio(self, audio_path):
        """Transcribe an audio file, chunking long recordings."""
        self._load_pipelines()
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        preprocessed_audio = preprocess_audio(audio, sr=sample_rate)
        if preprocessed_audio is None:
            raise ValueError("Audio preprocessing failed")

        # Whisper handles roughly 30 s at a time; longer audio is chunked
        # and the partial transcriptions are stitched back together
        if len(preprocessed_audio) / sample_rate > self.chunk_duration:
            chunks = self.chunk_audio(preprocessed_audio, sample_rate)
            transcriptions = [self.stt_pipeline(chunk)['text'] for chunk in chunks]
            return " ".join(transcriptions)

        return self.stt_pipeline(
            {'array': preprocessed_audio, 'sampling_rate': sample_rate}
        )['text']

    def process_audio(self, audio_path):
        """Run the full pipeline: transcription followed by NER."""
        transcription = self.transcribe_audio(audio_path)
        self._load_pipelines()
        entities = self.ner_pipeline(transcription)
        return transcription, entities


def preprocess_audio(audio_array, sr=16000):
    """Denoise, normalize, and trim an audio array before transcription."""
    try:
        # Accept either a torch tensor or a numpy array
        if isinstance(audio_array, torch.Tensor):
            audio_array = audio_array.cpu().numpy()

        # Convert stereo (channels, samples) to mono
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=0)

        # Noise reduction followed by peak normalization
        noise_reduced = nr.reduce_noise(
            y=audio_array,
            sr=sr,
            prop_decrease=0.5,
            n_std_thresh_stationary=1.5,
        )
        normalized_audio = librosa.util.normalize(noise_reduced)

        # Trim leading/trailing silence below the 25 dB threshold
        trimmed_audio, _ = librosa.effects.trim(normalized_audio, top_db=25)
        return trimmed_audio.astype(np.float32)
    except Exception as e:
        print(f"Audio preprocessing error: {e}")
        return None
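
# Minimal usage sketch: 'example.wav' is a hypothetical path, stand in your
# own audio file (it is resampled to 16 kHz on load). Both model checkpoints
# are downloaded from the Hugging Face Hub on the first call.
if __name__ == "__main__":
    runner = AudioSpeechNERPipeline()
    transcription, entities = runner.process_audio("example.wav")
    print("Transcription:", transcription)
    # Token-level entities; each item includes 'entity', 'score', 'word',
    # 'start', and 'end' offsets into the transcription string
    for entity in entities:
        print(entity)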