# backend.py
import os
import torch
import re  # Required for tokenization and splitting

from nemo.collections.asr.models import EncDecRNNTBPEModel


class KabyleASR:
    def __init__(self):
        self.device = "cpu"  # Free tier uses CPU
        self.model = None
        print("Loading NeMo ASR model for Kabyle (CPU mode)...")
        try:
            # Load the pre-trained Kabyle Conformer Transducer model
            self.model = EncDecRNNTBPEModel.from_pretrained(
                "nvidia/stt_kab_conformer_transducer_large"
            )
            self.model = self.model.to(self.device)
            # Optimize for CPU inference
            self.model.preprocessor.featurizer.dither = 0.0
            self.model.preprocessor.featurizer.pad_to = 0
            print("Model loaded successfully.")
        except Exception as e:
            raise RuntimeError(f"Failed to load NeMo model: {str(e)}")

    def post_process_kabyle_text(self, text):
        """
        Corrects annexation in Kabyle transcription by replacing spaces with dashes.

        Implements a structured set of rules based on the provided
        'Kabyle Transcription post script precessing rules v5.pdf' and other
        versions, incorporating the nuanced treatment of 'StPa' particles.

        Args:
            text (str): The raw transcribed text from the ASR model.

        Returns:
            str: The post-processed text with correct annexation dashes.
        """
        # Particle sets based on the rules document v5 (from kabyle_asr_optimized.py).
        CoPa = {'d', 'n', 's'}  # Conjunction particles
        PoPro = {'inu', 'inem', 'ines', 'nneɣ', 'nteɣ', 'nwen', 'nwent', 'nsen',
                 'nsent', 'iw', 'ik', 'im', 'is', 'w', 'k', 'm', 'tneɣ', 'tenteɣ',
                 'twen', 'twent', 'tsen', 'tsent'}  # Possessive pronouns
        SpWo = {'deg', 'gar', 'ɣer', 'ɣur', 'fell', 'yes', 'yis', 'ddaw', 'nnig',
                'yid', 'aql', 'sɣur', 'sennig', 'deffir', 'sdat'}  # Special words
        StPaSp = {'i', 'am', 'at', 's', 'neɣ', 'aɣ'}  # State particles (special)
        StPa = {'ak', 'as', 'aneɣ', 'anteɣ', 'awen', 'awent', 'asen', 'asent', 'k',
                'm', 'nteɣ', 'wen', 'went', 'sen', 'sent', 'atneɣ', 'atenteɣ',
                'atwen', 'atwent', 'atsen', 'atsent'}  # State particles
        DePa = {'a', 'agi', 'nni', 'ihin', 'nniḍen'}  # Demonstrative particles
        DiPa = {'id', 'in'}  # Direction particles
        FuPa = {'ad', 'ara', 'ur', 'i'}  # Future and negative particles
        DiObPa = {'yi', 'k', 'kem', 't', 'tt', 'aɣ', 'ken', 'kent', 'ten', 'tent',
                  'iyi', 'ak', 'am', 'as', 'awen', 'awent', 'asen', 'asent', 'ik',
                  'ikem', 'it', 'itt', 'iken', 'ikent', 'iten',
                  'itent'}  # Direct object particles
        InObPa = {'yi', 'yak', 'yam', 'yas', 'yaɣ', 'yawen', 'yawent', 'yasen',
                  'yasent'}  # Indirect object particles

        # Combined set for general lookup, used to define a "word"
        all_particles = (CoPa.union(PoPro).union(SpWo).union(StPa).union(StPaSp)
                         .union(DePa).union(DiPa).union(FuPa).union(DiObPa)
                         .union(InObPa))

        # Sets for specific rules
        rule_9_particles = DiObPa.union(InObPa).union(DiPa).union(StPa)
        full_stpa_set = StPa.union(StPaSp).union(DiObPa)
        rule_11_particles = all_particles

        # Tokenize the text by splitting on spaces and existing dashes
        tokens = re.split(r'[\s\-]+', text.lower().strip())
        processed_tokens = []

        i = 0
        while i < len(tokens):
            current_token = tokens[i]

            # --- Rule 13: Exception after FuPa ---
            # This rule modifies Rule 12's behavior when preceded by a FuPa particle.
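            # The chain of DiObPa/DiPa/CoPa particles following the FuPa token is
            # collected greedily; if it ends at a regular word of 2+ letters, the
            # whole chain is fused onto that word with dashes.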
            if i > 0 and tokens[i - 1] in FuPa:
                annexation_chain = []
                j = i
                while j < len(tokens) and tokens[j] in DiObPa.union(DiPa).union(CoPa):
                    annexation_chain.append(tokens[j])
                    j += 1
                if (annexation_chain and j < len(tokens)
                        and tokens[j] not in all_particles and len(tokens[j]) >= 2):
                    processed_tokens.append("-".join(annexation_chain) + "-" + tokens[j])
                    i = j + 1
                    continue

            # --- Rule 11: FuPa followed by an annexation chain tied to a word ---
            if current_token in FuPa:
                annexation_chain = []
                j = i + 1
                while j < len(tokens) and tokens[j] in rule_11_particles:
                    annexation_chain.append(tokens[j])
                    j += 1
                if (annexation_chain and j < len(tokens)
                        and tokens[j] not in all_particles and len(tokens[j]) >= 2):
                    # Keep the FuPa token separate; annex the chain to the word.
                    processed_tokens.append(current_token)
                    annexed_part = "-".join(annexation_chain) + "-" + tokens[j]
                    processed_tokens.append(annexed_part)
                    i = j + 1
                    continue

            # --- Rule 12: Annex a combination of particles before a regular word ---
            annexation_chain = []
            j = i
            while j < len(tokens) and tokens[j] in DiObPa.union(DiPa):
                annexation_chain.append(tokens[j])
                j += 1
            if (annexation_chain and j < len(tokens)
                    and tokens[j] not in all_particles and len(tokens[j]) >= 2):
                annexed_part = "-".join(annexation_chain) + "-" + tokens[j]
                processed_tokens.append(annexed_part)
                i = j + 1
                continue

            # --- Rule 5: Annex StPa to SpWo ---
            if current_token in SpWo and i + 1 < len(tokens) and tokens[i + 1] in full_stpa_set:
                processed_tokens.append(f"{current_token}-{tokens[i + 1]}")
                i += 2
                continue

            # A "regular word" is any token that is not a known particle.
            is_regular_word = current_token not in all_particles

            # --- Rule 7: Annex DePa to a word of 2+ letters ---
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens) and tokens[i + 1] in DePa:
                processed_tokens.append(f"{current_token}-{tokens[i + 1]}")
                i += 2
                continue

            # --- Rule 3: Annex PoPro to a word of 2+ letters ---
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens) and tokens[i + 1] in PoPro:
                processed_tokens.append(f"{current_token}-{tokens[i + 1]}")
                i += 2
                continue

            # --- Rule 9: Annex a combination of particles to a regular word ---
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens):
                annexation_chain = []
                j = i + 1
                while j < len(tokens) and tokens[j] in rule_9_particles:
                    annexation_chain.append(tokens[j])
                    j += 1
                if annexation_chain:
                    processed_tokens.append(f"{current_token}-" + "-".join(annexation_chain))
                    i = j
                    continue

            # --- All other tokens pass through unchanged (no annexation) ---
            processed_tokens.append(current_token)
            i += 1

        return " ".join(processed_tokens)

    def transcribe(self, audio_file):
        """
        Transcribe an audio file and apply Kabyle-specific post-processing.

        Args:
            audio_file (str): Path to the uploaded audio file.

        Returns:
            str: Clean, grammatically improved transcription.
        """
        if not os.path.exists(audio_file):
            return "Error: Audio file not found."

        try:
            # Transcribe using NeMo
            with torch.no_grad():
                result = self.model.transcribe([audio_file], batch_size=1, num_workers=0)

            # Extract text from the returned Hypothesis object
            hypothesis = result[0]
            if hasattr(hypothesis, 'text'):
                raw_text = hypothesis.text.strip()
            else:
                raw_text = str(hypothesis).strip()

            if not raw_text:
                return "Transcription returned no text."
            # Apply Kabyle grammar post-processing
            return self.post_process_kabyle_text(raw_text)

        except Exception as e:
            return f"Transcription error: {str(e)}"
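

# Minimal usage sketch. Assumptions: NeMo (nemo_toolkit[asr]) is installed, the
# checkpoint can be downloaded, and "sample.wav" is a hypothetical audio path
# (NeMo ASR models typically expect 16 kHz mono WAV input).
if __name__ == "__main__":
    asr = KabyleASR()
    # The post-processing step can be exercised on its own; with the particle
    # sets above, 'agi' is in DePa, so Rule 7 yields "argaz-agi".
    print(asr.post_process_kabyle_text("argaz agi"))
    print(asr.transcribe("sample.wav"))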