"""Speech-to-text utilities with graceful fallbacks."""

from __future__ import annotations

import numpy as np

from backend.utils import device
import nemo.collections.asr as nemo_asr

try:
    import torch
    from transformers import pipeline
except ModuleNotFoundError:  # PyTorch or transformers not available on Python 3.13 wheels
    torch = None  # type: ignore
    pipeline = None  # type: ignore

try:
    from google.cloud import speech
except ModuleNotFoundError:
    speech = None  # type: ignore


# Generic ASR callable consumed by `_transcribe_with_pipeline`.
# NOTE(review): nothing in the visible code assigns a non-None value here, so
# that helper cannot succeed as written — confirm whether it is still needed.
_ASR_PIPELINE = None


def _huggingface_device() -> int | str | None:
    """Translate the configured backend ``device`` into a device selector.

    Returns:
        ``0`` when ``device`` equals ``"cuda"``, ``"mps"`` when it equals
        ``"mps"``, and ``None`` for anything else.
        NOTE(review): presumably meant for the ``device`` argument of a
        Hugging Face pipeline — no visible caller confirms this.
    """
    # Checked in order; the first matching backend name wins.
    backend_to_selector = (("cuda", 0), ("mps", "mps"))
    for backend_name, selector in backend_to_selector:
        if device == backend_name:
            return selector
    return None


def _initialize_typhoon_pipeline():
    """Load the Typhoon real-time ASR model through NeMo.

    The model is placed on CUDA when available, otherwise on Apple's MPS
    backend when available, otherwise on the CPU.

    Returns:
        The loaded NeMo ASR model, or ``None`` when the optional
        PyTorch / transformers imports at the top of the module failed.
    """
    if torch is None or pipeline is None:
        return None

    # Use a distinct local name so the module-level `device` import is not
    # shadowed, and only pick "mps" when that backend is actually usable —
    # machines with neither CUDA nor MPS must fall back to the CPU.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        target_device = "cuda"
    elif mps_backend is not None and mps_backend.is_available():
        target_device = "mps"
    else:
        target_device = "cpu"

    print(f"Using device: {target_device}")
    print("Initializing Typhoon ASR pipeline...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_name="scb10x/typhoon-asr-realtime",
        map_location=target_device,
    )
    print("Typhoon ASR pipeline initialized.")
    return asr_model

def _initialize_whisper_pipeline():
    """Build the Thai Whisper speech-recognition pipeline.

    Loads ``nectec/Pathumma-whisper-th-medium`` with bfloat16 weights on the
    configured ``device`` and forces the decoder prompt to Thai transcription.

    Returns:
        The configured pipeline, or ``None`` when the optional
        PyTorch / transformers imports at the top of the module failed.
    """
    # The imports are optional; without this guard the module-level call to
    # this function would raise at import time instead of letting callers
    # fall back to another backend.
    if torch is None or pipeline is None:
        return None

    pipe = pipeline(
        task="automatic-speech-recognition",
        model="nectec/Pathumma-whisper-th-medium",
        chunk_length_s=30,
        device=device,
        model_kwargs={"torch_dtype": torch.bfloat16},
    )
    # Pin language and task so the decoder does not auto-detect them.
    pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
        language="th",
        task="transcribe",
    )
    return pipe
# Typhoon ASR is currently disabled: the handle stays None and the
# initializer call below is commented out.
_ASR_TYPHOON = None
# _ASR_TYPHOON = _initialize_typhoon_pipeline()
# The Whisper pipeline is initialized eagerly, as a side effect of importing
# this module.
_ASR_WHISPER = _initialize_whisper_pipeline()


def _transcribe_with_pipeline(audio_array: np.ndarray) -> str:
    """Transcribe audio with the module-level ``_ASR_PIPELINE`` callable.

    Args:
        audio_array: Audio samples handed to the pipeline unchanged.

    Returns:
        The pipeline's ``"text"`` entry when it returns a dict (empty string
        if the key is missing), otherwise ``str(output)``, with the
        substitution described below applied.

    Raises:
        RuntimeError: If no pipeline has been configured.
    """
    # Fail with a clear message instead of "'NoneType' object is not
    # callable", mirroring how the Google backend reports unavailability.
    if _ASR_PIPELINE is None:
        raise RuntimeError("ASR pipeline is not available")

    output = _ASR_PIPELINE(audio_array)
    text = output.get("text", "") if isinstance(output, dict) else str(output)
    # NOTE(review): presumably corrects a frequent misrecognition of
    # "ทางรัฐ" as the similar-sounding "ทางลัด" — confirm with the owner.
    return text.replace("ทางลัด", "ทางรัฐ")


def _transcribe_with_google(audio_array: np.ndarray) -> str:
    """Transcribe audio with Google Cloud Speech-to-Text.

    The samples are converted to 16-bit linear PCM and sent in a single
    synchronous ``recognize`` request configured for Thai (with English as an
    alternative language) at 16 kHz using the ``telephony`` model.

    Args:
        audio_array: Audio samples.
            NOTE(review): the scaling assumes floats normalised to
            [-1.0, 1.0] and sampled at 16 kHz — confirm against callers.

    Returns:
        The top transcript of every result, joined with single spaces.

    Raises:
        RuntimeError: If ``google-cloud-speech`` is not installed.
    """
    if speech is None:
        raise RuntimeError("google-cloud-speech is not available")

    # Clip before scaling: samples outside [-1.0, 1.0] would otherwise exceed
    # the int16 range and come out wrapped/garbled after the cast.
    clipped = np.clip(audio_array, -1.0, 1.0)
    audio_bytes = (clipped * 32767.0).astype(np.int16).tobytes()

    client = speech.SpeechClient()
    audio_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="th-TH",
        alternative_language_codes=["en-US"],
        model="telephony",
    )
    audio_data = speech.RecognitionAudio(content=audio_bytes)
    response = client.recognize(config=audio_config, audio=audio_data)
    # Skip results without alternatives so one empty result does not discard
    # the rest of the transcription with an IndexError.
    return " ".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )


def transcribe_audio(audio_array: np.ndarray) -> str:
    """Transcribe user audio with the best available backend.

    The local Whisper pipeline is tried first when it was loaded; if it is
    unavailable or raises, Google Cloud Speech is used as the fallback.

    Args:
        audio_array: Audio samples, passed unchanged to whichever backend
            runs.

    Returns:
        The transcription, or an empty string when the input is ``None``,
        empty or all zeros, or when every backend fails.
    """
    # Nothing to transcribe: missing, empty, or completely silent input.
    if audio_array is None or not np.any(audio_array):
        return ""

    if _ASR_WHISPER is not None:
        try:
            return _ASR_WHISPER(audio_array)["text"]
        except Exception as exc:  # best effort: fall through to the fallback
            print(f"Whisper ASR pipeline failed: {exc}")

    try:
        return _transcribe_with_google(audio_array)
    except Exception as exc:  # last backend: report and return no text
        print(f"ASR fallback failed: {exc}")
        return ""