import os
import re
import traceback
import tempfile

from fastapi import FastAPI, UploadFile, File
from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import torchaudio

# ✅ Setup environment and backend: cache HF downloads under /tmp (writable in
# containerized hosts) and decode audio via the soundfile backend.
# NOTE(review): torchaudio.set_audio_backend is deprecated and removed in
# torchaudio >= 2.2 — confirm the pinned torchaudio version still provides it.
os.environ["HF_HOME"] = "/tmp"
torchaudio.set_audio_backend("soundfile")

app = FastAPI()

# ✅ Load the processor and model (kept as module-level names for importers;
# the endpoint below actually runs inference through the `asr` pipeline).
processor = Wav2Vec2Processor.from_pretrained("tacab/ASR_SOMALI")
model = Wav2Vec2ForCTC.from_pretrained("tacab/ASR_SOMALI")
model.to("cpu")

# ✅ Chunked ASR pipeline with word-level timestamps, forced to CPU (device=-1).
asr = pipeline(
    "automatic-speech-recognition",
    model="tacab/ASR_SOMALI",
    tokenizer="tacab/ASR_SOMALI",
    chunk_length_s=30,
    stride_length_s=6,
    return_timestamps="word",
    device=-1,
)


# ✅ Text auto-punctuation
def auto_punctuate(text):
    """Insert rough sentence breaks into raw ASR output.

    Splits the text into 10-word segments, terminates each with a period
    (unless it already ends in ``.``, ``?`` or ``!``), and capitalizes the
    first letter of every resulting sentence.
    """
    text = text.strip()

    def capitalize_sentences(text):
        sentences = re.split(r'(?<=[.?!])\s+', text)
        # Join with a single space: each sentence already carries its own
        # terminal punctuation (joining with '. ' produced double periods).
        return ' '.join(s.strip().capitalize() for s in sentences if s)

    if '.' not in text and len(text.split()) > 5:
        text += '.'

    words = text.split()
    segments = []
    for i in range(0, len(words), 10):
        segment = " ".join(words[i:i + 10]).strip().capitalize()
        # Only append a period when the chunk is not already punctuated,
        # so we never emit "word.." at a chunk boundary.
        if not segment.endswith(('.', '?', '!')):
            segment += '.'
        segments.append(segment)
    return capitalize_sentences(" ".join(segments))


@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file to punctuated Somali text.

    Returns a dict with the cleaned text, the raw transcript, word-level
    timestamps, and a status message; errors are reported in-band in the
    ``message`` field rather than as an HTTP 5xx.
    """
    tmp_path = None
    try:
        # ✅ Save audio file to disk — the pipeline accepts a file path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name

        # ✅ Run ASR
        result = asr(tmp_path)
        raw_text = result.get("text", "").strip()

        if not raw_text:
            return {"text": "", "message": "⚠️ Qoraal lama helin."}

        cleaned_text = auto_punctuate(raw_text)
        return {
            "text": cleaned_text,
            "raw": raw_text,
            "timestamps": result.get("chunks", []),
            "message": "✅ Turjumaad guul leh"
        }
    except Exception as e:
        traceback.print_exc()
        return {"text": "", "message": f"❌ Khalad dhacay: {str(e)}"}
    finally:
        # Fix: the temp file was previously leaked on every request —
        # always remove it, whether transcription succeeded or failed.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)