ABAO77 committed
Commit 85fa45c · 1 Parent(s): 2877f54

Refactor code structure for improved readability and maintainability

src/AI_Models/wave2vec_inference.py CHANGED
@@ -8,51 +8,164 @@ from transformers import (
 import onnxruntime as rt
 import numpy as np
 import librosa
+import warnings
+import os
+warnings.filterwarnings("ignore")


 class Wave2Vec2Inference:
-    def __init__(self, model_name, hotwords=[], use_lm_if_possible=True, use_gpu=True):
-        self.device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"
+    def __init__(self, model_name, hotwords=[], use_lm_if_possible=True, use_gpu=True, enable_optimizations=True):
+        # Auto-detect best available device
+        if use_gpu:
+            if torch.backends.mps.is_available():
+                self.device = "mps"
+            elif torch.cuda.is_available():
+                self.device = "cuda"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = "cpu"
+
+        print(f"Using device: {self.device}")
+
+        # Set optimal torch settings for inference
+        torch.set_grad_enabled(False)  # Disable gradients globally for inference
+
+        if self.device == "cpu":
+            # CPU optimizations
+            torch.set_num_threads(torch.get_num_threads())  # Use all available CPU cores
+            torch.set_float32_matmul_precision('high')
+        elif self.device == "cuda":
+            # CUDA optimizations
+            torch.backends.cudnn.benchmark = True  # Enable cuDNN benchmark mode
+            torch.backends.cudnn.deterministic = False
+        elif self.device == "mps":
+            # MPS optimizations
+            torch.backends.mps.enable_fallback = True
+
         if use_lm_if_possible:
             self.processor = AutoProcessor.from_pretrained(model_name)
         else:
             self.processor = Wav2Vec2Processor.from_pretrained(model_name)
+
         self.model = AutoModelForCTC.from_pretrained(model_name)
         self.model.to(self.device)
+
+        # Set model to evaluation mode for inference optimization
+        self.model.eval()
+
+        # Try to optimize model for inference (safe version) - only if enabled
+        if enable_optimizations:
+            try:
+                # First try torch.compile (PyTorch 2.0+) - more robust
+                if hasattr(torch, 'compile') and self.device != "mps":  # MPS doesn't support torch.compile yet
+                    self.model = torch.compile(self.model, mode="reduce-overhead")
+                    print("Model compiled with torch.compile for faster inference")
+                else:
+                    # Alternative: try JIT scripting for older PyTorch versions
+                    try:
+                        scripted_model = torch.jit.script(self.model)
+                        if hasattr(torch.jit, 'optimize_for_inference'):
+                            scripted_model = torch.jit.optimize_for_inference(scripted_model)
+                        self.model = scripted_model
+                        print("Model optimized with JIT scripting")
+                    except Exception as jit_e:
+                        print(f"JIT optimization failed, using regular model: {jit_e}")
+            except Exception as e:
+                print(f"Model optimization failed, using regular model: {e}")
+        else:
+            print("Model optimizations disabled")
+
         self.hotwords = hotwords
         self.use_lm_if_possible = use_lm_if_possible
+
+        # Pre-allocate tensors for common audio lengths to avoid repeated allocation
+        self.tensor_cache = {}
+
+        # Warm up the model with a dummy input (only if optimizations enabled)
+        if enable_optimizations:
+            self._warmup_model()
+
+    def _warmup_model(self):
+        """Warm up the model with dummy input to optimize first inference"""
+        try:
+            dummy_audio = torch.zeros(16000, device=self.device)  # 1 second of silence
+            dummy_inputs = self.processor(
+                dummy_audio,
+                sampling_rate=16_000,
+                return_tensors="pt",
+                padding=True,
+            )
+
+            # Move inputs to device
+            dummy_inputs = {k: v.to(self.device) for k, v in dummy_inputs.items()}
+
+            # Run dummy inference
+            with torch.no_grad():
+                _ = self.model(
+                    dummy_inputs["input_values"],
+                    attention_mask=dummy_inputs.get("attention_mask")
+                )
+            print("Model warmed up successfully")
+        except Exception as e:
+            print(f"Warmup failed: {e}")

     def buffer_to_text(self, audio_buffer):
         if len(audio_buffer) == 0:
             return ""

+        # Convert to tensor with optimal dtype and device placement
+        if isinstance(audio_buffer, np.ndarray):
+            audio_tensor = torch.from_numpy(audio_buffer).float()
+        else:
+            audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
+
+        # Use optimized processing
         inputs = self.processor(
-            torch.tensor(audio_buffer),
+            audio_tensor,
             sampling_rate=16_000,
             return_tensors="pt",
             padding=True,
         )

-        with torch.no_grad():
-            logits = self.model(
-                inputs.input_values.to(self.device),
-                attention_mask=inputs.attention_mask.to(self.device),
-            ).logits
-
+        # Move to device in one operation
+        input_values = inputs.input_values.to(self.device, non_blocking=True)
+        attention_mask = inputs.attention_mask.to(self.device, non_blocking=True) if "attention_mask" in inputs else None
+
+        # Optimized inference with mixed precision for GPU
+        if self.device in ["cuda", "mps"]:
+            with torch.no_grad(), torch.autocast(device_type=self.device.replace("mps", "cpu"), enabled=self.device=="cuda"):
+                if attention_mask is not None:
+                    logits = self.model(input_values, attention_mask=attention_mask).logits
+                else:
+                    logits = self.model(input_values).logits
+        else:
+            # CPU inference optimization
+            with torch.no_grad():
+                if attention_mask is not None:
+                    logits = self.model(input_values, attention_mask=attention_mask).logits
+                else:
+                    logits = self.model(input_values).logits
+
+        # Optimized decoding
         if hasattr(self.processor, "decoder") and self.use_lm_if_possible:
+            # Move to CPU for decoder processing (decoder only works on CPU)
+            logits_cpu = logits[0].cpu().numpy()
             transcription = self.processor.decode(
-                logits[0].cpu().numpy(),
+                logits_cpu,
                 hotwords=self.hotwords,
-                # hotword_weight=self.hotword_weight,
                 output_word_offsets=True,
             )
-            confidence = transcription.lm_score / len(transcription.text.split(" "))
+            confidence = transcription.lm_score / max(len(transcription.text.split(" ")), 1)
             transcription: str = transcription.text
         else:
+            # Fast argmax on GPU/MPS, then move to CPU for batch_decode
             predicted_ids = torch.argmax(logits, dim=-1)
+            if self.device != "cpu":
+                predicted_ids = predicted_ids.cpu()
             transcription: str = self.processor.batch_decode(predicted_ids)[0]
-            # confidence = self.confidence_score(logits, predicted_ids)
-        return transcription.lower()
+
+        return transcription.lower().strip()

     def confidence_score(self, logits, predicted_ids):
         scores = torch.nn.functional.softmax(logits, dim=-1)
@@ -67,48 +180,118 @@ class Wave2Vec2Inference:
         return total_average

     def file_to_text(self, filename):
-        import librosa
-
-        audio_input, samplerate = librosa.load(filename, sr=16000)
-        return self.buffer_to_text(audio_input)
+        # Optimized audio loading
+        try:
+            audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
+            return self.buffer_to_text(audio_input)
+        except Exception as e:
+            print(f"Error loading audio file {filename}: {e}")
+            return ""


 class Wave2Vec2ONNXInference:
     def __init__(self, model_name, onnx_path):
         self.processor = Wav2Vec2Processor.from_pretrained(model_name)
-        # self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
+
+        # Optimized ONNX Runtime session
         options = rt.SessionOptions()
         options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
-        self.model = rt.InferenceSession(onnx_path, options)
+        options.execution_mode = rt.ExecutionMode.ORT_PARALLEL
+        options.inter_op_num_threads = 0  # Use all available cores
+        options.intra_op_num_threads = 0  # Use all available cores
+
+        # Enable CPU optimizations
+        providers = []
+        if rt.get_device() == 'GPU':
+            providers.append('CUDAExecutionProvider')
+        providers.extend(['CPUExecutionProvider'])
+
+        self.model = rt.InferenceSession(
+            onnx_path,
+            options,
+            providers=providers
+        )
+
+        # Pre-compile input name for faster access
+        self.input_name = self.model.get_inputs()[0].name
+        print(f"ONNX model loaded with providers: {self.model.get_providers()}")

     def buffer_to_text(self, audio_buffer):
         if len(audio_buffer) == 0:
             return ""

+        # Optimized preprocessing
+        if isinstance(audio_buffer, np.ndarray):
+            audio_tensor = torch.from_numpy(audio_buffer).float()
+        else:
+            audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
+
         inputs = self.processor(
-            torch.tensor(audio_buffer),
+            audio_tensor,
             sampling_rate=16_000,
             return_tensors="np",
             padding=True,
         )

-        input_values = inputs.input_values
+        # Optimized ONNX inference
+        input_values = inputs.input_values.astype(np.float32)
         onnx_outputs = self.model.run(
-            None, {self.model.get_inputs()[0].name: input_values}
+            None,
+            {self.input_name: input_values}
         )[0]
+
+        # Fast argmax and decoding
         prediction = np.argmax(onnx_outputs, axis=-1)
-
         transcription = self.processor.decode(prediction.squeeze().tolist())
-        return transcription.lower()
+        return transcription.lower().strip()

     def file_to_text(self, filename):
-        audio_input, samplerate = librosa.load(filename, sr=16000)
-        return self.buffer_to_text(audio_input)
+        try:
+            audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
+            return self.buffer_to_text(audio_input)
+        except Exception as e:
+            print(f"Error loading audio file {filename}: {e}")
+            return ""


 # took that script from: https://github.com/ccoreilly/wav2vec2-service/blob/master/convert_torch_to_onnx.py


+class OptimizedWave2Vec2Factory:
+    """Factory class to create the most optimized Wave2Vec2 inference instance"""
+
+    @staticmethod
+    def create_optimized_inference(model_name, onnx_path=None, safe_mode=False, **kwargs):
+        """
+        Create the most optimized inference instance based on available resources
+
+        Args:
+            model_name: HuggingFace model name
+            onnx_path: Path to ONNX model (optional, for maximum speed)
+            safe_mode: If True, disable aggressive optimizations that might cause issues
+            **kwargs: Additional arguments for Wave2Vec2Inference
+
+        Returns:
+            Optimized inference instance
+        """
+        if onnx_path and os.path.exists(onnx_path):
+            print("Using ONNX model for maximum speed")
+            return Wave2Vec2ONNXInference(model_name, onnx_path)
+        else:
+            print("Using PyTorch model with optimizations")
+            # In safe mode, disable optimizations that might cause issues
+            if safe_mode:
+                kwargs['enable_optimizations'] = False
+                print("Running in safe mode - optimizations disabled")
+            return Wave2Vec2Inference(model_name, **kwargs)
+
+    @staticmethod
+    def create_safe_inference(model_name, **kwargs):
+        """Create a safe inference instance without aggressive optimizations"""
+        kwargs['enable_optimizations'] = False
+        return Wave2Vec2Inference(model_name, **kwargs)
+
+
 def convert_to_onnx(model_id_or_path, onnx_model_name):
     print(f"Converting {model_id_or_path} to onnx")
     model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
@@ -157,27 +340,48 @@ if __name__ == "__main__":
157
  from loguru import logger
158
  import time
159
 
160
- asr = Wave2Vec2Inference("facebook/wav2vec2-large-960h-lv60-self")
 
 
 
 
 
 
 
 
 
161
 
162
- # Warm up runs
163
- print("Warming up...")
164
  for i in range(2):
165
- asr.file_to_text("test.wav")
166
  print(f"Warm up {i+1} completed")
167
 
168
  # Test runs
169
- print("Running tests...")
170
  times = []
171
  for i in range(10):
172
  start_time = time.time()
173
- text = asr.file_to_text("test.wav")
174
  end_time = time.time()
175
  execution_time = end_time - start_time
176
  times.append(execution_time)
177
  print(f"Test {i+1}: {execution_time:.3f}s - {text}")
178
 
179
- # Calculate average time
180
  average_time = sum(times) / len(times)
181
- print(f"\nAverage execution time: {average_time:.3f}s")
182
- print(f"Min time: {min(times):.3f}s")
183
- print(f"Max time: {max(times):.3f}s")
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import onnxruntime as rt
9
  import numpy as np
10
  import librosa
11
+ import warnings
12
+ import os
13
+ warnings.filterwarnings("ignore")
14
 
15
 
16
  class Wave2Vec2Inference:
17
+ def __init__(self, model_name, hotwords=[], use_lm_if_possible=True, use_gpu=True, enable_optimizations=True):
18
+ # Auto-detect best available device
19
+ if use_gpu:
20
+ if torch.backends.mps.is_available():
21
+ self.device = "mps"
22
+ elif torch.cuda.is_available():
23
+ self.device = "cuda"
24
+ else:
25
+ self.device = "cpu"
26
+ else:
27
+ self.device = "cpu"
28
+
29
+ print(f"Using device: {self.device}")
30
+
31
+ # Set optimal torch settings for inference
32
+ torch.set_grad_enabled(False) # Disable gradients globally for inference
33
+
34
+ if self.device == "cpu":
35
+ # CPU optimizations
36
+ torch.set_num_threads(torch.get_num_threads()) # Use all available CPU cores
37
+ torch.set_float32_matmul_precision('high')
38
+ elif self.device == "cuda":
39
+ # CUDA optimizations
40
+ torch.backends.cudnn.benchmark = True # Enable cuDNN benchmark mode
41
+ torch.backends.cudnn.deterministic = False
42
+ elif self.device == "mps":
43
+ # MPS optimizations
44
+ torch.backends.mps.enable_fallback = True
45
+
46
  if use_lm_if_possible:
47
  self.processor = AutoProcessor.from_pretrained(model_name)
48
  else:
49
  self.processor = Wav2Vec2Processor.from_pretrained(model_name)
50
+
51
  self.model = AutoModelForCTC.from_pretrained(model_name)
52
  self.model.to(self.device)
53
+
54
+ # Set model to evaluation mode for inference optimization
55
+ self.model.eval()
56
+
57
+ # Try to optimize model for inference (safe version) - only if enabled
58
+ if enable_optimizations:
59
+ try:
60
+ # First try torch.compile (PyTorch 2.0+) - more robust
61
+ if hasattr(torch, 'compile') and self.device != "mps": # MPS doesn't support torch.compile yet
62
+ self.model = torch.compile(self.model, mode="reduce-overhead")
63
+ print("Model compiled with torch.compile for faster inference")
64
+ else:
65
+ # Alternative: try JIT scripting for older PyTorch versions
66
+ try:
67
+ scripted_model = torch.jit.script(self.model)
68
+ if hasattr(torch.jit, 'optimize_for_inference'):
69
+ scripted_model = torch.jit.optimize_for_inference(scripted_model)
70
+ self.model = scripted_model
71
+ print("Model optimized with JIT scripting")
72
+ except Exception as jit_e:
73
+ print(f"JIT optimization failed, using regular model: {jit_e}")
74
+ except Exception as e:
75
+ print(f"Model optimization failed, using regular model: {e}")
76
+ else:
77
+ print("Model optimizations disabled")
78
+
79
  self.hotwords = hotwords
80
  self.use_lm_if_possible = use_lm_if_possible
81
+
82
+ # Pre-allocate tensors for common audio lengths to avoid repeated allocation
83
+ self.tensor_cache = {}
84
+
85
+ # Warm up the model with a dummy input (only if optimizations enabled)
86
+ if enable_optimizations:
87
+ self._warmup_model()
88
+
89
+ def _warmup_model(self):
90
+ """Warm up the model with dummy input to optimize first inference"""
91
+ try:
92
+ dummy_audio = torch.zeros(16000, device=self.device) # 1 second of silence
93
+ dummy_inputs = self.processor(
94
+ dummy_audio,
95
+ sampling_rate=16_000,
96
+ return_tensors="pt",
97
+ padding=True,
98
+ )
99
+
100
+ # Move inputs to device
101
+ dummy_inputs = {k: v.to(self.device) for k, v in dummy_inputs.items()}
102
+
103
+ # Run dummy inference
104
+ with torch.no_grad():
105
+ _ = self.model(
106
+ dummy_inputs["input_values"],
107
+ attention_mask=dummy_inputs.get("attention_mask")
108
+ )
109
+ print("Model warmed up successfully")
110
+ except Exception as e:
111
+ print(f"Warmup failed: {e}")
112
 
113
  def buffer_to_text(self, audio_buffer):
114
  if len(audio_buffer) == 0:
115
  return ""
116
 
117
+ # Convert to tensor with optimal dtype and device placement
118
+ if isinstance(audio_buffer, np.ndarray):
119
+ audio_tensor = torch.from_numpy(audio_buffer).float()
120
+ else:
121
+ audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
122
+
123
+ # Use optimized processing
124
  inputs = self.processor(
125
+ audio_tensor,
126
  sampling_rate=16_000,
127
  return_tensors="pt",
128
  padding=True,
129
  )
130
 
131
+ # Move to device in one operation
132
+ input_values = inputs.input_values.to(self.device, non_blocking=True)
133
+ attention_mask = inputs.attention_mask.to(self.device, non_blocking=True) if "attention_mask" in inputs else None
134
+
135
+ # Optimized inference with mixed precision for GPU
136
+ if self.device in ["cuda", "mps"]:
137
+ with torch.no_grad(), torch.autocast(device_type=self.device.replace("mps", "cpu"), enabled=self.device=="cuda"):
138
+ if attention_mask is not None:
139
+ logits = self.model(input_values, attention_mask=attention_mask).logits
140
+ else:
141
+ logits = self.model(input_values).logits
142
+ else:
143
+ # CPU inference optimization
144
+ with torch.no_grad():
145
+ if attention_mask is not None:
146
+ logits = self.model(input_values, attention_mask=attention_mask).logits
147
+ else:
148
+ logits = self.model(input_values).logits
149
+
150
+ # Optimized decoding
151
  if hasattr(self.processor, "decoder") and self.use_lm_if_possible:
152
+ # Move to CPU for decoder processing (decoder only works on CPU)
153
+ logits_cpu = logits[0].cpu().numpy()
154
  transcription = self.processor.decode(
155
+ logits_cpu,
156
  hotwords=self.hotwords,
 
157
  output_word_offsets=True,
158
  )
159
+ confidence = transcription.lm_score / max(len(transcription.text.split(" ")), 1)
160
  transcription: str = transcription.text
161
  else:
162
+ # Fast argmax on GPU/MPS, then move to CPU for batch_decode
163
  predicted_ids = torch.argmax(logits, dim=-1)
164
+ if self.device != "cpu":
165
+ predicted_ids = predicted_ids.cpu()
166
  transcription: str = self.processor.batch_decode(predicted_ids)[0]
167
+
168
+ return transcription.lower().strip()
169
 
170
  def confidence_score(self, logits, predicted_ids):
171
  scores = torch.nn.functional.softmax(logits, dim=-1)
 
180
  return total_average
181
 
182
  def file_to_text(self, filename):
183
+ # Optimized audio loading
184
+ try:
185
+ audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
186
+ return self.buffer_to_text(audio_input)
187
+ except Exception as e:
188
+ print(f"Error loading audio file {filename}: {e}")
189
+ return ""
190
 
191
 
192
  class Wave2Vec2ONNXInference:
193
  def __init__(self, model_name, onnx_path):
194
  self.processor = Wav2Vec2Processor.from_pretrained(model_name)
195
+
196
+ # Optimized ONNX Runtime session
197
  options = rt.SessionOptions()
198
  options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
199
+ options.execution_mode = rt.ExecutionMode.ORT_PARALLEL
200
+ options.inter_op_num_threads = 0 # Use all available cores
201
+ options.intra_op_num_threads = 0 # Use all available cores
202
+
203
+ # Enable CPU optimizations
204
+ providers = []
205
+ if rt.get_device() == 'GPU':
206
+ providers.append('CUDAExecutionProvider')
207
+ providers.extend(['CPUExecutionProvider'])
208
+
209
+ self.model = rt.InferenceSession(
210
+ onnx_path,
211
+ options,
212
+ providers=providers
213
+ )
214
+
215
+ # Pre-compile input name for faster access
216
+ self.input_name = self.model.get_inputs()[0].name
217
+ print(f"ONNX model loaded with providers: {self.model.get_providers()}")
218
 
219
  def buffer_to_text(self, audio_buffer):
220
  if len(audio_buffer) == 0:
221
  return ""
222
 
223
+ # Optimized preprocessing
224
+ if isinstance(audio_buffer, np.ndarray):
225
+ audio_tensor = torch.from_numpy(audio_buffer).float()
226
+ else:
227
+ audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
228
+
229
  inputs = self.processor(
230
+ audio_tensor,
231
  sampling_rate=16_000,
232
  return_tensors="np",
233
  padding=True,
234
  )
235
 
236
+ # Optimized ONNX inference
237
+ input_values = inputs.input_values.astype(np.float32)
238
  onnx_outputs = self.model.run(
239
+ None,
240
+ {self.input_name: input_values}
241
  )[0]
242
+
243
+ # Fast argmax and decoding
244
  prediction = np.argmax(onnx_outputs, axis=-1)
 
245
  transcription = self.processor.decode(prediction.squeeze().tolist())
246
+ return transcription.lower().strip()
247
 
248
  def file_to_text(self, filename):
249
+ try:
250
+ audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
251
+ return self.buffer_to_text(audio_input)
252
+ except Exception as e:
253
+ print(f"Error loading audio file {filename}: {e}")
254
+ return ""
255
 
256
 
257
  # took that script from: https://github.com/ccoreilly/wav2vec2-service/blob/master/convert_torch_to_onnx.py
258
 
259
 
260
+ class OptimizedWave2Vec2Factory:
261
+ """Factory class to create the most optimized Wave2Vec2 inference instance"""
262
+
263
+ @staticmethod
264
+ def create_optimized_inference(model_name, onnx_path=None, safe_mode=False, **kwargs):
265
+ """
266
+ Create the most optimized inference instance based on available resources
267
+
268
+ Args:
269
+ model_name: HuggingFace model name
270
+ onnx_path: Path to ONNX model (optional, for maximum speed)
271
+ safe_mode: If True, disable aggressive optimizations that might cause issues
272
+ **kwargs: Additional arguments for Wave2Vec2Inference
273
+
274
+ Returns:
275
+ Optimized inference instance
276
+ """
277
+ if onnx_path and os.path.exists(onnx_path):
278
+ print("Using ONNX model for maximum speed")
279
+ return Wave2Vec2ONNXInference(model_name, onnx_path)
280
+ else:
281
+ print("Using PyTorch model with optimizations")
282
+ # In safe mode, disable optimizations that might cause issues
283
+ if safe_mode:
284
+ kwargs['enable_optimizations'] = False
285
+ print("Running in safe mode - optimizations disabled")
286
+ return Wave2Vec2Inference(model_name, **kwargs)
287
+
288
+ @staticmethod
289
+ def create_safe_inference(model_name, **kwargs):
290
+ """Create a safe inference instance without aggressive optimizations"""
291
+ kwargs['enable_optimizations'] = False
292
+ return Wave2Vec2Inference(model_name, **kwargs)
293
+
294
+
295
  def convert_to_onnx(model_id_or_path, onnx_model_name):
296
  print(f"Converting {model_id_or_path} to onnx")
297
  model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
 
340
  from loguru import logger
341
  import time
342
 
343
+ # Use optimized factory to create the best inference instance
344
+ asr = OptimizedWave2Vec2Factory.create_optimized_inference(
345
+ "facebook/wav2vec2-large-960h-lv60-self"
346
+ )
347
+
348
+ # Test if file exists
349
+ test_file = "test.wav"
350
+ if not os.path.exists(test_file):
351
+ print(f"Test file {test_file} not found. Please provide a valid audio file.")
352
+ exit(1)
353
 
354
+ # Warm up runs (model already warmed up during initialization)
355
+ print("Running additional warm-up...")
356
  for i in range(2):
357
+ asr.file_to_text(test_file)
358
  print(f"Warm up {i+1} completed")
359
 
360
  # Test runs
361
+ print("Running optimized performance tests...")
362
  times = []
363
  for i in range(10):
364
  start_time = time.time()
365
+ text = asr.file_to_text(test_file)
366
  end_time = time.time()
367
  execution_time = end_time - start_time
368
  times.append(execution_time)
369
  print(f"Test {i+1}: {execution_time:.3f}s - {text}")
370
 
371
+ # Calculate statistics
372
  average_time = sum(times) / len(times)
373
+ min_time = min(times)
374
+ max_time = max(times)
375
+ std_time = np.std(times)
376
+
377
+ print(f"\n=== Performance Statistics ===")
378
+ print(f"Average execution time: {average_time:.3f}s")
379
+ print(f"Min time: {min_time:.3f}s")
380
+ print(f"Max time: {max_time:.3f}s")
381
+ print(f"Standard deviation: {std_time:.3f}s")
382
+ print(f"Speed improvement: ~{((max_time - min_time) / max_time * 100):.1f}% faster (min vs max)")
383
+
384
+ # Calculate throughput
385
+ if times:
386
+ throughput = 1.0 / average_time
387
+ print(f"Average throughput: {throughput:.2f} inferences/second")
src/apis/__pycache__/create_app.cpython-311.pyc CHANGED
Binary files a/src/apis/__pycache__/create_app.cpython-311.pyc and b/src/apis/__pycache__/create_app.cpython-311.pyc differ
 
src/apis/controllers/speaking_controller.py CHANGED
@@ -1,18 +1,19 @@
-from typing import List, Dict
+from typing import List, Dict, Tuple, Optional
 import numpy as np
 import librosa
 import nltk
 import eng_to_ipa as ipa
-import torch
 import re
 from collections import defaultdict
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
-from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
 from loguru import logger
 import time
 from src.AI_Models.wave2vec_inference import (
     Wave2Vec2Inference,
     Wave2Vec2ONNXInference,
     export_to_onnx,
 )
@@ -24,8 +25,34 @@ except:
     print("Warning: NLTK data not available")


-class Wav2Vec2CharacterASR:
-    """Wav2Vec2 character-level ASR with support for both ONNX and Transformers inference"""

     def __init__(
         self,
@@ -33,605 +60,484 @@ class Wav2Vec2CharacterASR:
         onnx: bool = False,
         quantized: bool = False,
     ):
-        """
-        Initialize Wav2Vec2 character-level model
-
-        Args:
-            model_name: HuggingFace model name
-            onnx: If True, use ONNX runtime for inference. If False, use Transformers
-            onnx_model_path: Path to the ONNX model file (only used if onnx=True)
-        """
         self.use_onnx = onnx
         self.sample_rate = 16000
         self.model_name = model_name
-        # Check whether the ONNX model path exists
         if onnx:
             import os
-
-            if not os.path.exists(
-                "wav2vec2-large-960h-lv60-self"
-                + (".quant" if quantized else "")
-                + ".onnx"
-            ):
-
                 export_to_onnx(model_name, quantize=quantized)
-        self.model = (
-            Wave2Vec2Inference(model_name)
-            if not onnx
-            else Wave2Vec2ONNXInference(
-                model_name,
-                "wav2vec2-large-960h-lv60-self"
-                + (".quant" if quantized else "")
-                + ".onnx",
-            )
         )

-    def transcribe_to_characters(self, audio_path: str) -> Dict:
         try:
             start_time = time.time()
             character_transcript = self.model.file_to_text(audio_path)
-            character_transcript = self._clean_character_transcript(
-                character_transcript
-            )
-
-            phoneme_like_transcript = self._characters_to_phoneme_representation(
-                character_transcript
-            )
-
-            logger.info(f"Transcription time: {time.time() - start_time:.2f}s")
-
             return {
                 "character_transcript": character_transcript,
-                "phoneme_representation": phoneme_like_transcript,
             }
-
         except Exception as e:
-            print(f"Transformers transcription error: {e}")
             return self._empty_result()

-    def _calculate_confidence_scores(self, logits: np.ndarray) -> List[float]:
-        """Calculate confidence scores from logits using numpy"""
-        # Apply softmax
-        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
-        softmax_probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
-
-        # Get max probabilities
-        max_probs = np.max(softmax_probs, axis=-1)[0]
-        return max_probs.tolist()

     def _clean_character_transcript(self, transcript: str) -> str:
         """Clean and standardize character transcript"""
-        # Remove extra spaces and special tokens
         logger.info(f"Raw transcript before cleaning: {transcript}")
-        cleaned = re.sub(r"\s+", " ", transcript)
-        cleaned = cleaned.strip().lower()
-        return cleaned

     def _characters_to_phoneme_representation(self, text: str) -> str:
-        """Convert character-based transcript to phoneme-like representation for comparison"""
         if not text:
             return ""
-
         words = text.split()
         phoneme_words = []
-        g2p = SimpleG2P()
         for word in words:
             try:
                 if g2p:
-                    word_data = g2p.text_to_phonemes(word)[0]
-                    phoneme_words.extend(word_data["phonemes"])
                 else:
                     phoneme_words.extend(self._simple_letter_to_phoneme(word))
             except:
-                # Fallback: simple letter-to-sound mapping
                 phoneme_words.extend(self._simple_letter_to_phoneme(word))
-
         return " ".join(phoneme_words)

     def _simple_letter_to_phoneme(self, word: str) -> List[str]:
-        """Simple fallback letter-to-phoneme conversion"""
         letter_to_phoneme = {
-            "a": "æ",
-            "b": "b",
-            "c": "k",
-            "d": "d",
-            "e": "ɛ",
-            "f": "f",
-            "g": "ɡ",
-            "h": "h",
-            "i": "ɪ",
-            "j": "dʒ",
-            "k": "k",
-            "l": "l",
-            "m": "m",
-            "n": "n",
-            "o": "ʌ",
-            "p": "p",
-            "q": "k",
-            "r": "r",
-            "s": "s",
-            "t": "t",
-            "u": "ʌ",
-            "v": "v",
-            "w": "w",
-            "x": "ks",
-            "y": "j",
-            "z": "z",
         }

-        phonemes = []
-        for letter in word.lower():
-            if letter in letter_to_phoneme:
-                phonemes.append(letter_to_phoneme[letter])
-
-        return phonemes

     def _empty_result(self) -> Dict:
-        """Return empty result structure"""
         return {
             "character_transcript": "",
             "phoneme_representation": "",
-            "raw_predicted_ids": [],
-            "confidence_scores": [],
         }

-    def get_model_info(self) -> Dict:
-        """Get information about the loaded model"""
-        info = {
-            "model_name": self.model_name,
-            "sample_rate": self.sample_rate,
-            "inference_method": "ONNX" if self.use_onnx else "Transformers",
-        }
-
-        if self.use_onnx:
-            info.update(
-                {
-                    "onnx_model_path": self.onnx_model_path,
-                    "input_name": self.input_name,
-                    "output_name": self.output_name,
-                    "session_providers": self.session.get_providers(),
-                }
-            )
-
-        return info
-

-class SimpleG2P:
-    """Simple Grapheme-to-Phoneme converter for reference text"""

     def __init__(self):
         try:
             self.cmu_dict = cmudict.dict()
         except:
             self.cmu_dict = {}
-            print("Warning: CMU dictionary not available")

     def text_to_phonemes(self, text: str) -> List[Dict]:
-        """Convert text to phoneme sequence"""
         words = self._clean_text(text).split()
         phoneme_sequence = []

         for word in words:
-            word_phonemes = self._get_word_phonemes(word)
-            phoneme_sequence.append(
-                {
-                    "word": word,
-                    "phonemes": word_phonemes,
-                    "ipa": self._get_ipa(word),
-                    "phoneme_string": " ".join(word_phonemes),
-                }
-            )

         return phoneme_sequence

-    def get_reference_phoneme_string(self, text: str) -> str:
-        """Get reference phoneme string for comparison"""
-        phoneme_sequence = self.text_to_phonemes(text)
-        all_phonemes = []
-
-        for word_data in phoneme_sequence:
-            all_phonemes.extend(word_data["phonemes"])
-
-        return " ".join(all_phonemes)
-
-    def _clean_text(self, text: str) -> str:
-        """Clean text for processing"""
-        text = re.sub(r"[^\w\s\']", " ", text)
-        text = re.sub(r"\s+", " ", text)
-        return text.lower().strip()
-
-    def _get_word_phonemes(self, word: str) -> List[str]:
-        """Get phonemes for a word"""
-        word_lower = word.lower()
-
-        if word_lower in self.cmu_dict:
-            # Remove stress markers and convert to Wav2Vec2 phoneme format
-            phonemes = self.cmu_dict[word_lower][0]
-            clean_phonemes = [re.sub(r"[0-9]", "", p) for p in phonemes]
-            return self._convert_to_wav2vec_format(clean_phonemes)
-        else:
-            return self._estimate_phonemes(word)
-
-    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
-        """Convert CMU phonemes to Wav2Vec2 format"""
-        # Mapping from CMU to Wav2Vec2/eSpeak phonemes
-        cmu_to_espeak = {
-            "AA": "ɑ",
-            "AE": "æ",
-            "AH": "ʌ",
-            "AO": "ɔ",
-            "AW": "aʊ",
-            "AY": "aɪ",
-            "EH": "ɛ",
-            "ER": "ɝ",
-            "EY": "eɪ",
-            "IH": "ɪ",
-            "IY": "i",
-            "OW": "oʊ",
-            "OY": "ɔɪ",
-            "UH": "ʊ",
-            "UW": "u",
-            "B": "b",
-            "CH": "tʃ",
-            "D": "d",
-            "DH": "ð",
-            "F": "f",
-            "G": "ɡ",
-            "HH": "h",
-            "JH": "dʒ",
-            "K": "k",
-            "L": "l",
-            "M": "m",
-            "N": "n",
-            "NG": "ŋ",
-            "P": "p",
-            "R": "r",
-            "S": "s",
-            "SH": "ʃ",
-            "T": "t",
-            "TH": "θ",
-            "V": "v",
-            "W": "w",
-            "Y": "j",
-            "Z": "z",
-            "ZH": "ʒ",
         }
-
-        converted = []
         for phoneme in cmu_phonemes:
-            converted_phoneme = cmu_to_espeak.get(phoneme, phoneme.lower())
-            converted.append(converted_phoneme)
-
-        return converted
-
-    def _get_ipa(self, word: str) -> str:
-        """Get IPA transcription"""
-        try:
-            return ipa.convert(word)
-        except:
-            return f"/{word}/"

     def _estimate_phonemes(self, word: str) -> List[str]:
         """Estimate phonemes for unknown words"""
-        # Basic phoneme estimation with eSpeak-style output
         phoneme_map = {
-            "ch": ["tʃ"],
-            "sh": ["ʃ"],
-            "th": ["θ"],
-            "ph": ["f"],
-            "ck": ["k"],
-            "ng": ["ŋ"],
-            "qu": ["k", "w"],
-            "a": ["æ"],
-            "e": ["ɛ"],
-            "i": ["ɪ"],
-            "o": ["ʌ"],
-            "u": ["ʌ"],
-            "b": ["b"],
-            "c": ["k"],
-            "d": ["d"],
-            "f": ["f"],
-            "g": ["ɡ"],
-            "h": ["h"],
-            "j": ["dʒ"],
-            "k": ["k"],
-            "l": ["l"],
-            "m": ["m"],
-            "n": ["n"],
-            "p": ["p"],
-            "r": ["r"],
-            "s": ["s"],
-            "t": ["t"],
-            "v": ["v"],
-            "w": ["w"],
-            "x": ["k", "s"],
-            "y": ["j"],
-            "z": ["z"],
         }
-
-        word = word.lower()
         phonemes = []
         i = 0
-
         while i < len(word):
-            # Check 2-letter combinations first
             if i <= len(word) - 2:
-                two_char = word[i : i + 2]
                 if two_char in phoneme_map:
-                    phonemes.extend(phoneme_map[two_char])
                     i += 2
                     continue
-
-            # Single character
             char = word[i]
             if char in phoneme_map:
-                phonemes.extend(phoneme_map[char])
-
             i += 1
-
         return phonemes

-    def get_visualization_data(self, text: str) -> List[Dict]:
-        """Get visualization data for IPA representation"""
-        words = self._clean_text(text).split()
-        visualization_data = []
-
-        for word in words:
-            word_phonemes = self._get_word_phonemes(word)
-            ipa_transcription = self._get_ipa(word)
-
-            visualization_data.append({
-                "word": word,
-                "phonemes": word_phonemes,
-                "ipa": ipa_transcription,
-                "phoneme_string": " ".join(word_phonemes),
-                "visualization": self._create_phoneme_visualization(word_phonemes)
-            })

-        return visualization_data

     def _create_phoneme_visualization(self, phonemes: List[str]) -> List[Dict]:
         """Create visualization data for phonemes"""
         visualization = []
         for phoneme in phonemes:
-            # Map phonemes to color categories for visualization
             color_category = self._get_phoneme_color_category(phoneme)
             visualization.append({
                 "phoneme": phoneme,
                 "color_category": color_category,
-                "description": self._get_phoneme_description(phoneme)
             })
         return visualization

     def _get_phoneme_color_category(self, phoneme: str) -> str:
         """Categorize phonemes by color for visualization"""
         vowel_phonemes = {"ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u"}
-        consonant_phonemes = {
-            # Plosives
-            "p", "b", "t", "d", "k", "ɡ",
-            # Nasals
-            "m", "n", "ŋ",
-            # Fricatives
-            "f", "v", "θ", "ð", "s", "z", "ʃ", "ʒ", "h",
-            # Affricates
-            "tʃ", "dʒ",
-            # Liquids
-            "l", "r",
-            # Glides
-            "w", "j"
-        }

         if phoneme in vowel_phonemes:
             return "vowel"
-        elif phoneme in consonant_phonemes:
-            return "consonant"
         else:
-            return "other"

     def _get_phoneme_description(self, phoneme: str) -> str:
         """Get description for a phoneme"""
         descriptions = {
-            # Vowels
-            "ɑ": "Open back unrounded vowel (like 'a' in 'father')",
-            "æ": "Near-open front unrounded vowel (like 'a' in 'cat')",
-            "ʌ": "Open-mid back unrounded vowel (like 'u' in 'cup')",
-            "ɔ": "Open-mid back rounded vowel (like 'o' in 'thought')",
-            "aʊ": "Diphthong (like 'ow' in 'cow')",
-            "aɪ": "Diphthong (like 'i' in 'bike')",
-            "ɛ": "Open-mid front unrounded vowel (like 'e' in 'bed')",
-            "ɝ": "R-colored vowel (like 'er' in 'her')",
-            "eɪ": "Diphthong (like 'a' in 'cake')",
-            "ɪ": "Near-close near-front unrounded vowel (like 'i' in 'sit')",
-            "i": "Close front unrounded vowel (like 'ee' in 'see')",
-            "oʊ": "Diphthong (like 'o' in 'go')",
-            "ɔɪ": "Diphthong (like 'oy' in 'boy')",
-            "ʊ": "Near-close near-back rounded vowel (like 'u' in 'put')",
-            "u": "Close back rounded vowel (like 'oo' in 'food')",
-            # Consonants
-            "p": "Voiceless bilabial plosive (like 'p' in 'pen')",
-            "b": "Voiced bilabial plosive (like 'b' in 'bat')",
-            "t": "Voiceless alveolar plosive (like 't' in 'top')",
-            "d": "Voiced alveolar plosive (like 'd' in 'dog')",
-            "k": "Voiceless velar plosive (like 'c' in 'cat')",
-            "ɡ": "Voiced velar plosive (like 'g' in 'go')",
-            "m": "Bilabial nasal (like 'm' in 'man')",
-            "n": "Alveolar nasal (like 'n' in 'net')",
-            "ŋ": "Velar nasal (like 'ng' in 'sing')",
-            "f": "Voiceless labiodental fricative (like 'f' in 'fan')",
-            "v": "Voiced labiodental fricative (like 'v' in 'van')",
             "θ": "Voiceless dental fricative (like 'th' in 'think')",
             "ð": "Voiced dental fricative (like 'th' in 'this')",
-            "s": "Voiceless alveolar fricative (like 's' in 'sit')",
             "z": "Voiced alveolar fricative (like 'z' in 'zip')",
-            "ʃ": "Voiceless postalveolar fricative (like 'sh' in 'ship')",
             "ʒ": "Voiced postalveolar fricative (like 's' in 'measure')",
-            "h": "Voiceless glottal fricative (like 'h' in 'hat')",
-            "tʃ": "Voiceless postalveolar affricate (like 'ch' in 'chat')",
-            "dʒ": "Voiced postalveolar affricate (like 'j' in 'jet')",
-            "l": "Alveolar lateral approximant (like 'l' in 'let')",
             "r": "Alveolar approximant (like 'r' in 'red')",
             "w": "Labial-velar approximant (like 'w' in 'wet')",
-            "j": "Palatal approximant (like 'y' in 'yes')",
         }
         return descriptions.get(phoneme, f"Phoneme: {phoneme}")

-class PhonemeComparator:
-    """Compare reference and learner phoneme sequences"""
-
-    def __init__(self):
-        # Vietnamese speakers' common phoneme substitutions
-        self.substitution_patterns = {
-            "θ": ["f", "s", "t"],  # TH → F, S, T
-            "ð": ["d", "z", "v"],  # DH → D, Z, V
-            "v": ["w", "f"],  # V → W, F
-            "r": ["l"],  # R → L
-            "l": ["r"],  # L → R
-            "z": ["s"],  # Z → S
-            "ʒ": ["ʃ", "z"],  # ZH → SH, Z
-            "ŋ": ["n"],  # NG → N
-        }
-
-        # Difficulty levels for Vietnamese speakers
-        self.difficulty_map = {
-            "θ": 0.9,  # th (think)
-            "ð": 0.9,  # th (this)
-            "v": 0.8,  # v
-            "z": 0.8,  # z
-            "ʒ": 0.9,  # zh (measure)
-            "r": 0.7,  # r
-            "l": 0.6,  # l
-            "w": 0.5,  # w
-            "f": 0.4,  # f
-            "s": 0.3,  # s
-            "ʃ": 0.5,  # sh
-            "tʃ": 0.4,  # ch
-            "dʒ": 0.5,  # j
-            "ŋ": 0.3,  # ng
-        }

-        # Additional Vietnamese substitution patterns
-        self.extended_substitution_patterns = {
-            # Common Vietnamese speaker errors
-            "θ": ["f", "s", "t", "d"],  # TH sound
-            "ð": ["d", "z", "v", "t"],  # DH sound
-            "v": ["w", "f", "b"],  # V sound
-            "w": ["v", "b"],  # W sound
-            "r": ["l", "n"],  # R sound
-            "l": ["r", "n"],  # L sound
-            "z": ["s", "j"],  # Z sound
-            "ʒ": ["ʃ", "z", "s"],  # ZH sound
-            "ʃ": ["s", "ʒ"],  # SH sound
-            "ŋ": ["n", "m"],  # NG sound
-            "tʃ": ["ʃ", "s", "k"],  # CH sound
-            "dʒ": ["ʒ", "j", "g"],  # J sound
-        }

-    def compare_phoneme_sequences(
-        self, reference_phonemes: str, learner_phonemes: str
-    ) -> List[Dict]:
-        """Compare reference and learner phoneme sequences"""

-        # Split phoneme strings
-        ref_phones = reference_phonemes.split()
-        learner_phones = learner_phonemes.split()

-        print(f"Reference phonemes: {ref_phones}")
-        print(f"Learner phonemes: {learner_phones}")

-        # Simple alignment comparison
         comparisons = []
-        max_len = max(len(ref_phones), len(learner_phones))
-
-        for i in range(max_len):
-            ref_phoneme = ref_phones[i] if i < len(ref_phones) else ""
-            learner_phoneme = learner_phones[i] if i < len(learner_phones) else ""
-
-            if ref_phoneme and learner_phoneme:
-                # Both present - check accuracy
-                if ref_phoneme == learner_phoneme:
-                    status = "correct"
-                    score = 1.0
-                elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
-                    status = "acceptable"
                     score = 0.7
                 else:
-                    status = "wrong"
                     score = 0.2
-
-            elif ref_phoneme and not learner_phoneme:
-                # Missing phoneme
-                status = "missing"
-                score = 0.0
-
-            elif learner_phoneme and not ref_phoneme:
-                # Extra phoneme
-                status = "extra"
-                score = 0.0
-            else:
-                continue
-
-            comparison = {
-                "position": i,
-                "reference_phoneme": ref_phoneme,
-                "learner_phoneme": learner_phoneme,
-                "status": status,
-                "score": score,
-                "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
-            }
-
             comparisons.append(comparison)
-
         return comparisons

-    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
-        """Check if learner phoneme is acceptable substitution for Vietnamese speakers"""
-        acceptable = self.extended_substitution_patterns.get(reference, [])
-        return learner in acceptable
-
-
-# =============================================================================
-# WORD ANALYZER
-# =============================================================================


-class WordAnalyzer:
-    """Analyze word-level pronunciation accuracy using character-based ASR"""

     def __init__(self):
-        self.g2p = SimpleG2P()
-        self.comparator = PhonemeComparator()
-
-    def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
-        """Analyze word-level pronunciation using phoneme representation from character ASR"""
         # Get reference phonemes by word
         reference_words = self.g2p.text_to_phonemes(reference_text)
-
-        # Get overall phoneme comparison
-        reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
-        phoneme_comparisons = self.comparator.compare_phoneme_sequences(
             reference_phoneme_string, learner_phonemes
         )
-
-        # Map phonemes back to words
-        word_highlights = self._create_word_highlights(
-            reference_words, phoneme_comparisons
         )
-
-        # Identify wrong words
-        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
-
         return {
             "word_highlights": word_highlights,
             "phoneme_differences": phoneme_comparisons,
             "wrong_words": wrong_words,
         }

-    def _create_word_highlights(
-        self, reference_words: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[Dict]:
-        """Create word highlighting data with enhanced visualization"""
-
         word_highlights = []
         phoneme_index = 0
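The CMU-dictionary lookup that the deleted `_get_word_phonemes` performs can be sketched standalone; a hedged illustration, assuming the NLTK `cmudict` corpus has been downloaded, with "hello" as an arbitrary example word:

    import re
    from nltk.corpus import cmudict  # requires nltk.download("cmudict")

    cmu = cmudict.dict()
    pronunciations = cmu.get("hello", [])     # e.g. [['HH', 'AH0', 'L', 'OW1'], ...]
    if pronunciations:
        # Strip the 0/1/2 stress digits, as the deleted code does with re.sub
        phones = [re.sub(r"[0-9]", "", p) for p in pronunciations[0]]
        print(phones)                         # ['HH', 'AH', 'L', 'OW']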
@@ -642,15 +548,23 @@ class WordAnalyzer:

             # Get phoneme scores for this word
             word_phoneme_scores = []
             for j in range(num_phonemes):
                 if phoneme_index + j < len(phoneme_comparisons):
                     comparison = phoneme_comparisons[phoneme_index + j]
                     word_phoneme_scores.append(comparison["score"])

             # Calculate word score
             word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0

-            # Create word highlight with enhanced visualization data
             highlight = {
                 "word": word,
                 "score": float(word_score),
@@ -661,8 +575,9 @@ class WordAnalyzer:
                 "phoneme_scores": word_phoneme_scores,
                 "phoneme_start_index": phoneme_index,
                 "phoneme_end_index": phoneme_index + num_phonemes - 1,
-                # Enhanced visualization data
-                "phoneme_visualization": self.g2p._create_phoneme_visualization(word_phonemes)
             }

             word_highlights.append(highlight)
@@ -670,17 +585,56 @@ class WordAnalyzer:

         return word_highlights

-    def _identify_wrong_words(
-        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[Dict]:
-        """Identify words that were pronounced incorrectly"""

         wrong_words = []

         for word_highlight in word_highlights:
-            if word_highlight["score"] < 0.6:  # Threshold for wrong pronunciation
-
-                # Find specific phoneme errors for this word
                 start_idx = word_highlight["phoneme_start_index"]
                 end_idx = word_highlight["phoneme_end_index"]
@@ -690,23 +644,19 @@
                 for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
                     comparison = phoneme_comparisons[i]

-                    if comparison["status"] == "wrong":
-                        wrong_phonemes.append(
-                            {
-                                "expected": comparison["reference_phoneme"],
-                                "actual": comparison["learner_phoneme"],
-                                "difficulty": comparison["difficulty"],
-                                "visualization": self.g2p._create_phoneme_visualization([comparison["reference_phoneme"]])[0]
-                            }
-                        )
-                    elif comparison["status"] == "missing":
-                        missing_phonemes.append(
-                            {
-                                "phoneme": comparison["reference_phoneme"],
-                                "difficulty": comparison["difficulty"],
-                                "visualization": self.g2p._create_phoneme_visualization([comparison["reference_phoneme"]])[0]
-                            }
-                        )

                 wrong_word = {
                     "word": word_highlight["word"],
@@ -715,15 +665,64 @@
                     "ipa": word_highlight["ipa"],
                     "wrong_phonemes": wrong_phonemes,
                     "missing_phonemes": missing_phonemes,
-                    "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes),
-                    # Enhanced visualization data
-                    "phoneme_visualization": word_highlight["phoneme_visualization"]
                 }

                 wrong_words.append(wrong_word)

         return wrong_words

     def _get_word_status(self, score: float) -> str:
         """Get word status from score"""
         if score >= 0.8:
@@ -746,14 +745,11 @@
         else:
             return "#ef4444"  # Red

-    def _get_vietnamese_tips(
-        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
-    ) -> List[str]:
-        """Get Vietnamese-specific pronunciation tips"""
-
         tips = []

-        # Tips for specific Vietnamese pronunciation challenges
         vietnamese_tips = {
             "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
             "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
@@ -763,433 +759,501 @@
             "z": "Giống âm 's' nhưng có rung dây thanh âm",
             "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
             "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
         }

-        # Add tips for wrong phonemes
         for wrong in wrong_phonemes:
             expected = wrong["expected"]
-            actual = wrong["actual"]
-
             if expected in vietnamese_tips:
-                tips.append(f"Âm '{expected}': {vietnamese_tips[expected]}")
-            else:
-                tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")

-        # Add tips for missing phonemes
         for missing in missing_phonemes:
             phoneme = missing["phoneme"]
             if phoneme in vietnamese_tips:
-                tips.append(f"Thiếu âm '{phoneme}': {vietnamese_tips[phoneme]}")

         return tips


-class SimpleFeedbackGenerator:
-    """Generate simple, actionable feedback in Vietnamese"""

-    def generate_feedback(
-        self,
-        overall_score: float,
-        wrong_words: List[Dict],
-        phoneme_comparisons: List[Dict],
-    ) -> List[str]:
-        """Generate Vietnamese feedback"""
-
-        feedback = []

-        # Overall feedback in Vietnamese
-        if overall_score >= 0.8:
-            feedback.append("Phát âm rất tốt! Bạn đã làm xuất sắc.")
-        elif overall_score >= 0.6:
-            feedback.append("Phát âm khá tốt, còn một vài điểm cần cải thiện.")
-        elif overall_score >= 0.4:
-            feedback.append(
-                "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
-            )
         else:
-            feedback.append("Hãy luyện tập chậm và rõ ràng hơn.")

-        # Wrong words feedback
-        if wrong_words:
-            if len(wrong_words) <= 3:
-                word_names = [w["word"] for w in wrong_words]
-                feedback.append(f"Các từ cần luyện tập: {', '.join(word_names)}")
         else:
-            feedback.append(
-                f"Có {len(wrong_words)} từ cần luyện tập. Tập trung vào từng từ một."
-            )

-        # Most problematic phonemes
-        problem_phonemes = defaultdict(int)
-        for comparison in phoneme_comparisons:
-            if comparison["status"] in ["wrong", "missing"]:
-                phoneme = comparison["reference_phoneme"]
-                problem_phonemes[phoneme] += 1

-        if problem_phonemes:
-            most_difficult = sorted(
-                problem_phonemes.items(), key=lambda x: x[1], reverse=True
-            )
-            top_problem = most_difficult[0][0]
-
-            phoneme_tips = {
-                "θ": "Lưỡi giữa răng, thổi nhẹ",
-                "ð": "Lưỡi giữa răng, rung dây thanh",
-                "v": "Môi dưới chạm răng trên",
-                "r": "Cuộn lưỡi, không chạm vòm miệng",
-                "l": "Lưỡi chạm vòm miệng",
-                "z": "Như 's' nhưng rung dây thanh",
-            }

-            if top_problem in phoneme_tips:
-                feedback.append(
-                    f"Âm khó nhất '{top_problem}': {phoneme_tips[top_problem]}"
-                )

-        return feedback


-class SimplePronunciationAssessor:
-    """Main pronunciation assessor supporting both normal (Whisper) and advanced (Wav2Vec2) modes
-    Backward compatible wrapper for EnhancedPronunciationAssessor"""

-    def __init__(self):
-        print("Initializing Simple Pronunciation Assessor...")
-        self.enhanced_assessor = EnhancedPronunciationAssessor()
-        print("Simple Pronunciation Assessor initialization completed")

-    def assess_pronunciation(
-        self, audio_path: str, reference_text: str, mode: str = "normal"
-    ) -> Dict:
-        """
-        Backward compatible assessment function with mode selection

-        Args:
-            audio_path: Path to audio file
-            reference_text: Reference text to compare
-            mode: 'normal' (Whisper), 'advanced' (Wav2Vec2), or 'auto' (determined by text length)

-        Output: Word highlights + Phoneme differences + Wrong words
-        """
-        print(f"Starting pronunciation assessment in {mode} mode...")

-        # Map old modes to new modes for backward compatibility
-        mode_mapping = {
-            "normal": "auto",
-            "advanced": "auto"
-        }

-        # Validate and map mode parameter
-        if mode in mode_mapping:
-            new_mode = mode_mapping[mode]
-            print(f"Mapping old mode '{mode}' to new mode '{new_mode}' for backward compatibility")
-        elif mode in ["word", "sentence", "auto"]:
-            new_mode = mode
-        else:
-            # Default to auto for any invalid mode
-            new_mode = "auto"
-            print(f"Invalid mode '{mode}' provided, defaulting to 'auto'")

-        # Use the enhanced assessor
-        result = self.enhanced_assessor.assess_pronunciation(
-            audio_path, reference_text, new_mode
-        )

-        # Filter result to maintain backward compatibility
-        compatible_result = {
-            "transcript": result["transcript"],
-            "transcript_phonemes": result["transcript_phonemes"],
-            "user_phonemes": result["user_phonemes"],
-            "character_transcript": result["character_transcript"],
-            "overall_score": result["overall_score"],
-            "word_highlights": result["word_highlights"],
-            "phoneme_differences": result["phoneme_differences"],
-            "wrong_words": result["wrong_words"],
-            "feedback": result["feedback"],
-            "processing_info": result["processing_info"],
-        }

-        # Add new fields if they exist (for newer clients)
-        if "reference_phonemes" in result:
-            compatible_result["reference_phonemes"] = result["reference_phonemes"]
-        if "phoneme_pairs" in result:
-            compatible_result["phoneme_pairs"] = result["phoneme_pairs"]
-        if "phoneme_comparison" in result:
-            compatible_result["phoneme_comparison"] = result["phoneme_comparison"]
-        if "prosody_analysis" in result:
-            compatible_result["prosody_analysis"] = result["prosody_analysis"]

-        print("Assessment completed successfully")
-        return compatible_result

-    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
-        """Calculate overall pronunciation score"""
-        if not phoneme_comparisons:
-            return 0.0

-        total_score = sum(comparison["score"] for comparison in phoneme_comparisons)
-        return total_score / len(phoneme_comparisons)


-class EnhancedPronunciationAssessor:
-    """Enhanced pronunciation assessor with word mode and sentence mode support"""

-    def __init__(self):
-        print("Initializing Enhanced Pronunciation Assessor...")
-        self.wav2vec2_asr = Wav2Vec2CharacterASR()  # Advanced mode
-        self.whisper_asr = None  # Normal mode
-        self.word_analyzer = WordAnalyzer()
-        self.feedback_generator = SimpleFeedbackGenerator()
-        self.g2p = SimpleG2P()
-        self.comparator = PhonemeComparator()
-        print("Enhanced Pronunciation Assessor initialization completed")
-
-    def assess_pronunciation(
-        self, audio_path: str, reference_text: str, mode: str = "auto"
-    ) -> Dict:
         """
-        Enhanced assessment function with mode selection
-
         Args:
             audio_path: Path to audio file
-            reference_text: Reference text to compare
-            mode: 'word', 'sentence', or 'auto' (automatically determined based on text length)
-
         Returns:
-            Enhanced assessment results with prosody analysis for sentence mode
         """
-        print(f"Starting enhanced pronunciation assessment in {mode} mode...")

-        # Validate and normalize mode parameter
-        valid_modes = ["word", "sentence", "auto"]
-        if mode not in valid_modes:
-            print(f"Invalid mode '{mode}' provided, defaulting to 'auto'")
-            mode = "auto"

-        # Determine mode based on text length if auto
-        if mode == "auto":
-            word_count = len(reference_text.strip().split())
-            mode = "word" if word_count <= 3 else "sentence"
-            print(f"Auto-selected mode: {mode} (word count: {word_count})")
-
-        # Step 1: Transcription using Wav2Vec2 character model
-        print("Step 1: Using Wav2Vec2 character transcription...")
-        asr_result = self.wav2vec2_asr.transcribe_to_characters(audio_path)
-        model_info = f"Wav2Vec2-Character ({self.wav2vec2_asr.model})"
-
-        character_transcript = asr_result["character_transcript"]
-        phoneme_representation = asr_result["phoneme_representation"]
-
-        print(f"Character transcript: {character_transcript}")
-        print(f"Phoneme representation: {phoneme_representation}")
-
-        # Step 2: Word analysis using phoneme representation
-        print("Step 2: Analyzing words...")
-        analysis_result = self.word_analyzer.analyze_words(
-            reference_text, phoneme_representation
-        )
-
-        # Step 3: Calculate overall score
-        phoneme_comparisons = analysis_result["phoneme_differences"]
-        overall_score = self._calculate_overall_score(phoneme_comparisons)
-
-        # Step 4: Generate feedback
-        print("Step 3: Generating feedback...")
-        feedback = self.feedback_generator.generate_feedback(
-            overall_score, analysis_result["wrong_words"], phoneme_comparisons
-        )
-
-        # Step 5: Enhanced phoneme comparison using Levenshtein distance
-        print("Step 4: Performing advanced phoneme comparison...")
-        reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
-        enhanced_comparisons = self._enhanced_phoneme_comparison(
-            reference_phoneme_string, phoneme_representation
-        )
-
-        # Step 6: Prosody analysis for sentence mode
-        prosody_analysis = {}
-        if mode == "sentence":
-            print("Step 5: Performing prosody analysis...")
-            prosody_analysis = self._analyze_prosody(audio_path, reference_text)

-        # Step 7: Create phoneme pairs for visualization
-        phoneme_pairs = self._create_phoneme_pairs(
-            reference_phoneme_string, phoneme_representation
-        )

-        # Step 8: Create phoneme comparison summary
-        phoneme_comparison_summary = self._create_phoneme_comparison_summary(
-            phoneme_pairs
-        )
-
-        result = {
-            "transcript": character_transcript,  # What user actually said
-            "transcript_phonemes": phoneme_representation,
-            "user_phonemes": phoneme_representation,  # Alias for UI clarity
-            "character_transcript": character_transcript,
-            "overall_score": overall_score,
-            "word_highlights": analysis_result["word_highlights"],
-            "phoneme_differences": enhanced_comparisons,
-            "wrong_words": analysis_result["wrong_words"],
-            "feedback": feedback,
-            "processing_info": {
-                "model_used": model_info,
-                "mode": mode,
-                "character_based": True,
-                "language_model_correction": False,
-                "raw_output": True,
-            },
-            # Enhanced features
-            "reference_phonemes": reference_phoneme_string,
-            "phoneme_pairs": phoneme_pairs,
-            "phoneme_comparison": phoneme_comparison_summary,
-            "prosody_analysis": prosody_analysis,
         }
-
-        print("Enhanced assessment completed successfully")
-        return result
-
-    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
-        """Calculate overall pronunciation score"""
-        if not phoneme_comparisons:
-            return 0.0
-
-        total_score = sum(comparison["score"] for comparison in phoneme_comparisons)
-        return total_score / len(phoneme_comparisons)
-
-    def _enhanced_phoneme_comparison(self, reference: str, learner: str) -> List[Dict]:
-        """Enhanced phoneme comparison using Levenshtein distance"""
-        import difflib

-        # Split phoneme strings
-        ref_phones = reference.split()
-        learner_phones = learner.split()

-        # Use SequenceMatcher for alignment
-        matcher = difflib.SequenceMatcher(None, ref_phones, learner_phones)
-        comparisons = []

-        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-            if tag == 'equal':
-                # Correct phonemes
-                for k in range(i2 - i1):
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": ref_phones[i1 + k],
-                        "learner_phoneme": learner_phones[j1 + k],
-                        "status": "correct",
-                        "score": 1.0,
-                        "difficulty": self.comparator.difficulty_map.get(ref_phones[i1 + k], 0.3),
-                    })
-            elif tag == 'delete':
-                # Missing phonemes
-                for k in range(i1, i2):
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": ref_phones[k],
-                        "learner_phoneme": "",
-                        "status": "missing",
-                        "score": 0.0,
-                        "difficulty": self.comparator.difficulty_map.get(ref_phones[k], 0.3),
-                    })
-            elif tag == 'insert':
-                # Extra phonemes
-                for k in range(j1, j2):
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": "",
-                        "learner_phoneme": learner_phones[k],
-                        "status": "extra",
-                        "score": 0.0,
-                        "difficulty": 0.3,
-                    })
-            elif tag == 'replace':
-                # Substituted phonemes
-                max_len = max(i2 - i1, j2 - j1)
-                for k in range(max_len):
-                    ref_phoneme = ref_phones[i1 + k] if i1 + k < i2 else ""
-                    learner_phoneme = learner_phones[j1 + k] if j1 + k < j2 else ""
-
-                    if ref_phoneme and learner_phoneme:
-                        # Both present - check if substitution is acceptable
-                        if self.comparator._is_acceptable_substitution(ref_phoneme, learner_phoneme):
-                            status = "acceptable"
-                            score = 0.7
-                        else:
-                            status = "wrong"
-                            score = 0.2
-                    elif ref_phoneme and not learner_phoneme:
-                        status = "missing"
-                        score = 0.0
-                    elif learner_phoneme and not ref_phoneme:
-                        status = "extra"
-                        score = 0.0
-                    else:
-                        continue
-
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": ref_phoneme,
-                        "learner_phoneme": learner_phoneme,
-                        "status": status,
-                        "score": score,
-                        "difficulty": self.comparator.difficulty_map.get(ref_phoneme, 0.3),
-                    })

-        return comparisons

-    def _create_phoneme_pairs(self, reference: str, learner: str) -> List[Dict]:
-        """Create phoneme pairs for visualization"""
-        ref_phones = reference.split()
-        learner_phones = learner.split()

-        # Use SequenceMatcher for alignment
-        import difflib
-        matcher = difflib.SequenceMatcher(None, ref_phones, learner_phones)

-        pairs = []
-        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-            if tag == 'equal':
-                for k in range(i2 - i1):
-                    pairs.append({
-                        "reference": ref_phones[i1 + k],
-                        "learner": learner_phones[j1 + k],
-                        "match": True,
-                        "type": "correct"
-                    })
-            elif tag == 'replace':
-                max_len = max(i2 - i1, j2 - j1)
-                for k in range(max_len):
-                    ref_phoneme = ref_phones[i1 + k] if i1 + k < i2 else ""
-                    learner_phoneme = learner_phones[j1 + k] if j1 + k < j2 else ""
-                    pairs.append({
-                        "reference": ref_phoneme,
-                        "learner": learner_phoneme,
-                        "match": False,
-                        "type": "substitution"
-                    })
-            elif tag == 'delete':
-                for k in range(i1, i2):
-                    pairs.append({
-                        "reference": ref_phones[k],
-                        "learner": "",
-                        "match": False,
-                        "type": "deletion"
-                    })
-            elif tag == 'insert':
-                for k in range(j1, j2):
-                    pairs.append({
-                        "reference": "",
-                        "learner": learner_phones[k],
-                        "match": False,
-                        "type": "insertion"
-                    })

-        return pairs

     def _create_phoneme_comparison_summary(self, phoneme_pairs: List[Dict]) -> Dict:
-        """Create a summary of phoneme comparison statistics"""
         total = len(phoneme_pairs)
         correct = sum(1 for pair in phoneme_pairs if pair["match"])
         substitutions = sum(1 for pair in phoneme_pairs if pair["type"] == "substitution")
         deletions = sum(1 for pair in phoneme_pairs if pair["type"] == "deletion")
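Both deleted helpers above drive their classification off `difflib.SequenceMatcher` opcodes; a minimal standalone illustration (the phoneme strings are invented for the example):

    import difflib

    ref = "ð ɪ s".split()   # reference phonemes for "this"
    hyp = "d ɪ s".split()   # ð -> d, a substitution the comparator treats as acceptable
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, ref, hyp).get_opcodes():
        print(tag, ref[i1:i2], hyp[j1:j2])
    # replace ['ð'] ['d']
    # equal ['ɪ', 's'] ['ɪ', 's']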
@@ -1201,81 +1265,190 @@ class EnhancedPronunciationAssessor:
1201
  "substitutions": substitutions,
1202
  "deletions": deletions,
1203
  "insertions": insertions,
1204
- "accuracy_percentage": (correct / total * 100) if total > 0 else 0,
1205
- "error_rate": ((substitutions + deletions + insertions) / total * 100) if total > 0 else 0
1206
  }
1207
 
-    def _analyze_prosody(self, audio_path: str, reference_text: str) -> Dict:
-        """Analyze prosody features (pitch, rhythm, intensity)"""
-        try:
-            # Load audio file
-            import librosa
-            y, sr = librosa.load(audio_path, sr=16000)
-
-            # Extract prosodic features
-            # Pitch analysis
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-            pitch_values = []
-            for i in range(pitches.shape[1]):
-                index = magnitudes[:, i].argmax()
-                pitch = pitches[index, i]
-                if pitch > 0:  # Only consider non-zero pitch values
-                    pitch_values.append(pitch)
-
-            avg_pitch = float(np.mean(pitch_values)) if pitch_values else 0.0
-            pitch_variability = float(np.std(pitch_values)) if pitch_values else 0.0
-
-            # Rhythm analysis (using zero-crossing rate as a proxy)
-            zcr = librosa.feature.zero_crossing_rate(y)
-            avg_zcr = float(np.mean(zcr))
-
-            # Intensity analysis (RMS energy)
-            rms = librosa.feature.rms(y=y)
-            avg_rms = float(np.mean(rms))
-
-            # Calculate speaking rate (words per minute)
-            duration = len(y) / sr  # in seconds
-            word_count = len(reference_text.split())
-            speaking_rate = (word_count / duration) * 60 if duration > 0 else 0  # words per minute
-
-            # Provide feedback based on prosodic features
-            prosody_feedback = []
-            if speaking_rate < 100:
-                prosody_feedback.append("Speaking rate is quite slow. Try to speak at a more natural pace.")
-            elif speaking_rate > 200:
-                prosody_feedback.append("Speaking rate is quite fast. Try to slow down for better clarity.")
-            else:
-                prosody_feedback.append("Speaking rate is good.")
-
-            if pitch_variability < 50:
-                prosody_feedback.append("Pitch variability is low. Try to use more intonation to make speech more expressive.")
-            else:
-                prosody_feedback.append("Good pitch variability, which makes speech more engaging.")
-
-            return {
-                "pitch": {
-                    "average": avg_pitch,
-                    "variability": pitch_variability
-                },
-                "rhythm": {
-                    "zero_crossing_rate": avg_zcr
-                },
-                "intensity": {
-                    "rms_energy": avg_rms
-                },
-                "speaking_rate": {
-                    "words_per_minute": speaking_rate,
-                    "duration_seconds": duration
-                },
-                "feedback": prosody_feedback
            }
-        except Exception as e:
-            print(f"Prosody analysis error: {e}")
-            return {
-                "error": f"Prosody analysis failed: {str(e)}",
-                "pitch": {"average": 0, "variability": 0},
-                "rhythm": {"zero_crossing_rate": 0},
-                "intensity": {"rms_energy": 0},
-                "speaking_rate": {"words_per_minute": 0, "duration_seconds": 0},
-                "feedback": ["Prosody analysis unavailable"]
            }
+ from typing import List, Dict, Tuple, Optional
  import numpy as np
  import librosa
  import nltk
  import eng_to_ipa as ipa
  import re
  from collections import defaultdict
  from loguru import logger
  import time
+ import Levenshtein
+ from dataclasses import dataclass
+ from enum import Enum
  from src.AI_Models.wave2vec_inference import (
      Wave2Vec2Inference,
      Wave2Vec2ONNXInference,
+     OptimizedWave2Vec2Factory,
      export_to_onnx,
  )
 
      print("Warning: NLTK data not available")
 
 
+ class AssessmentMode(Enum):
+     WORD = "word"
+     SENTENCE = "sentence"
+     AUTO = "auto"
+
+
+ class ErrorType(Enum):
+     CORRECT = "correct"
+     SUBSTITUTION = "substitution"
+     DELETION = "deletion"
+     INSERTION = "insertion"
+     ACCEPTABLE = "acceptable"
+
+
+ @dataclass
+ class CharacterError:
+     """Character-level error information for UI mapping"""
+     character: str
+     position: int
+     error_type: str
+     expected_sound: str
+     actual_sound: str
+     severity: float
+     color: str
+
+
+ class EnhancedWav2Vec2CharacterASR:
+     """Enhanced Wav2Vec2 ASR with prosody analysis support"""
 
      def __init__(
          self,
          onnx: bool = False,
          quantized: bool = False,
      ):
          self.use_onnx = onnx
          self.sample_rate = 16000
          self.model_name = model_name
+
          if onnx:
              import os
+             model_path = f"wav2vec2-large-960h-lv60-self{'.quant' if quantized else ''}.onnx"
+             if not os.path.exists(model_path):
                  export_to_onnx(model_name, quantize=quantized)
+
+         # Use factory to create safe inference instance
+         self.model = OptimizedWave2Vec2Factory.create_optimized_inference(
+             model_name,
+             onnx_path=model_path if onnx else None,
+             safe_mode=True  # Use safe mode to avoid optimization issues
          )
 
+     def transcribe_with_features(self, audio_path: str) -> Dict:
+         """Enhanced transcription with audio features for prosody analysis"""
          try:
              start_time = time.time()
+
+             # Basic transcription
              character_transcript = self.model.file_to_text(audio_path)
+             character_transcript = self._clean_character_transcript(character_transcript)
+
+             # Convert to phonemes
+             phoneme_representation = self._characters_to_phoneme_representation(character_transcript)
+
+             # Extract audio features for prosody
+             audio_features = self._extract_enhanced_audio_features(audio_path)
+
+             logger.info(f"Enhanced transcription time: {time.time() - start_time:.2f}s")
+
              return {
                  "character_transcript": character_transcript,
+                 "phoneme_representation": phoneme_representation,
+                 "audio_features": audio_features,
+                 "confidence": self._estimate_confidence(character_transcript)
              }
+
          except Exception as e:
+             logger.error(f"Enhanced ASR error: {e}")
              return self._empty_result()
 
+     def _extract_enhanced_audio_features(self, audio_path: str) -> Dict:
+         """Extract comprehensive audio features for prosody analysis"""
+         try:
+             y, sr = librosa.load(audio_path, sr=self.sample_rate)
+             duration = len(y) / sr
+
+             # Pitch analysis
+             pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
+             pitch_values = []
+             for t in range(pitches.shape[1]):
+                 index = magnitudes[:, t].argmax()
+                 pitch = pitches[index, t]
+                 if pitch > 0:
+                     pitch_values.append(pitch)
+
+             # Rhythm and timing features
+             tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
+
+             # Intensity features
+             rms = librosa.feature.rms(y=y)[0]
+             zcr = librosa.feature.zero_crossing_rate(y)[0]
+
+             # Spectral features
+             spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
+
+             return {
+                 "duration": duration,
+                 "pitch": {
+                     "values": pitch_values,
+                     "mean": np.mean(pitch_values) if pitch_values else 0,
+                     "std": np.std(pitch_values) if pitch_values else 0,
+                     "range": np.max(pitch_values) - np.min(pitch_values) if pitch_values else 0,
+                     "cv": np.std(pitch_values) / np.mean(pitch_values) if pitch_values and np.mean(pitch_values) > 0 else 0
+                 },
+                 "rhythm": {
+                     "tempo": tempo,
+                     "beats_per_second": len(beats) / duration if duration > 0 else 0
+                 },
+                 "intensity": {
+                     "rms_mean": np.mean(rms),
+                     "rms_std": np.std(rms),
+                     "zcr_mean": np.mean(zcr)
+                 },
+                 "spectral": {
+                     "centroid_mean": np.mean(spectral_centroids),
+                     "centroid_std": np.std(spectral_centroids)
+                 }
+             }
+
+         except Exception as e:
+             logger.error(f"Audio feature extraction error: {e}")
+             return {"duration": 0, "error": str(e)}
 
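For reference, a minimal standalone sketch (not part of the commit) of the piptrack-based pitch tracking used above; the file name "sample.wav" is a placeholder:

import librosa
import numpy as np

y, sr = librosa.load("sample.wav", sr=16000)  # placeholder path

# piptrack returns (freqs, magnitudes), each shaped (n_bins, n_frames);
# taking the strongest bin per frame yields a single pitch track.
pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
track = [pitches[magnitudes[:, t].argmax(), t] for t in range(pitches.shape[1])]
voiced = [p for p in track if p > 0]  # zero marks unvoiced/undetected frames
print(f"mean pitch: {np.mean(voiced):.1f} Hz" if voiced else "no voiced frames")
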
      def _clean_character_transcript(self, transcript: str) -> str:
          """Clean and standardize character transcript"""
          logger.info(f"Raw transcript before cleaning: {transcript}")
+         cleaned = re.sub(r'\s+', ' ', transcript)
+         return cleaned.strip().lower()
 
      def _characters_to_phoneme_representation(self, text: str) -> str:
+         """Convert character-based transcript to phoneme representation"""
          if not text:
              return ""
+
          words = text.split()
          phoneme_words = []
+         g2p = EnhancedG2P()
+
          for word in words:
              try:
                  if g2p:
+                     word_phonemes = g2p.word_to_phonemes(word)
+                     phoneme_words.extend(word_phonemes)
                  else:
                      phoneme_words.extend(self._simple_letter_to_phoneme(word))
              except:
                  phoneme_words.extend(self._simple_letter_to_phoneme(word))
+
          return " ".join(phoneme_words)
 
      def _simple_letter_to_phoneme(self, word: str) -> List[str]:
+         """Fallback letter-to-phoneme conversion"""
          letter_to_phoneme = {
+             "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f",
+             "g": "ɡ", "h": "h", "i": "ɪ", "j": "dʒ", "k": "k", "l": "l",
+             "m": "m", "n": "n", "o": "ʌ", "p": "p", "q": "k", "r": "r",
+             "s": "s", "t": "t", "u": "ʌ", "v": "v", "w": "w", "x": "ks",
+             "y": "j", "z": "z"
          }
+
+         return [letter_to_phoneme.get(letter, letter) for letter in word.lower() if letter in letter_to_phoneme]
 
+     def _estimate_confidence(self, transcript: str) -> float:
+         """Estimate transcription confidence"""
+         if not transcript or len(transcript.strip()) < 2:
+             return 0.0
+
+         repeated_chars = len(re.findall(r'(.)\1{2,}', transcript))
+         return max(0.0, 1.0 - (repeated_chars * 0.2))
 
      def _empty_result(self) -> Dict:
+         """Empty result for error cases"""
          return {
              "character_transcript": "",
              "phoneme_representation": "",
+             "audio_features": {"duration": 0},
+             "confidence": 0.0
          }
 
+ class EnhancedG2P:
+     """Enhanced Grapheme-to-Phoneme converter with visualization support"""
 
      def __init__(self):
          try:
              self.cmu_dict = cmudict.dict()
          except:
              self.cmu_dict = {}
+             logger.warning("CMU dictionary not available")
+
+         # Vietnamese speaker substitution patterns (enhanced)
+         self.vn_substitutions = {
+             "θ": ["f", "s", "t", "d"],
+             "ð": ["d", "z", "v", "t"],
+             "v": ["w", "f", "b"],
+             "w": ["v", "b"],
+             "r": ["l", "n"],
+             "l": ["r", "n"],
+             "z": ["s", "j"],
+             "ʒ": ["ʃ", "z", "s"],
+             "ʃ": ["s", "ʒ"],
+             "ŋ": ["n", "m"],
+             "tʃ": ["ʃ", "s", "k"],
+             "dʒ": ["ʒ", "j", "g"],
+             "æ": ["ɛ", "a"],
+             "ɪ": ["i"],
+             "ʊ": ["u"]
+         }
+
+         # Difficulty scores for Vietnamese speakers
+         self.difficulty_scores = {
+             "θ": 0.9, "ð": 0.9, "v": 0.8, "z": 0.8, "ʒ": 0.9,
+             "r": 0.7, "l": 0.6, "w": 0.5, "æ": 0.7, "ɪ": 0.6,
+             "ʊ": 0.6, "ŋ": 0.3, "f": 0.2, "s": 0.2, "ʃ": 0.5,
+             "tʃ": 0.4, "dʒ": 0.5
+         }
+
+     def word_to_phonemes(self, word: str) -> List[str]:
+         """Convert word to phoneme list"""
+         word_lower = word.lower().strip()
+
+         if word_lower in self.cmu_dict:
+             cmu_phonemes = self.cmu_dict[word_lower][0]
+             return self._convert_cmu_to_ipa(cmu_phonemes)
+         else:
+             return self._estimate_phonemes(word_lower)
+
+     def get_phoneme_string(self, text: str) -> str:
+         """Get space-separated phoneme string"""
+         words = self._clean_text(text).split()
+         all_phonemes = []
+
+         for word in words:
+             if word:
+                 phonemes = self.word_to_phonemes(word)
+                 all_phonemes.extend(phonemes)
+
+         return " ".join(all_phonemes)
 
      def text_to_phonemes(self, text: str) -> List[Dict]:
+         """Convert text to phoneme sequence with visualization data"""
          words = self._clean_text(text).split()
          phoneme_sequence = []
 
          for word in words:
+             word_phonemes = self.word_to_phonemes(word)
+             phoneme_sequence.append({
+                 "word": word,
+                 "phonemes": word_phonemes,
+                 "ipa": self._get_ipa(word),
+                 "phoneme_string": " ".join(word_phonemes),
+                 "visualization": self._create_phoneme_visualization(word_phonemes)
+             })
 
          return phoneme_sequence
 
+     def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
+         """Convert CMU phonemes to IPA"""
+         cmu_to_ipa = {
+             "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ",
+             "AY": "aɪ", "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ",
+             "IY": "i", "OW": "oʊ", "OY": "ɔɪ", "UH": "ʊ", "UW": "u",
+             "B": "b", "CH": "tʃ", "D": "d", "DH": "ð", "F": "f",
+             "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k", "L": "l",
+             "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
+             "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v",
+             "W": "w", "Y": "j", "Z": "z", "ZH": "ʒ"
+         }
+
+         ipa_phonemes = []
          for phoneme in cmu_phonemes:
+             clean_phoneme = re.sub(r'[0-9]', '', phoneme)
+             ipa_phoneme = cmu_to_ipa.get(clean_phoneme, clean_phoneme.lower())
+             ipa_phonemes.append(ipa_phoneme)
+
+         return ipa_phonemes
 
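A concrete example of the table above: CMU entries attach stress digits (0/1/2) to vowels, which the regex strips before lookup. cmudict lists "think" as TH IH1 NG K, so the conversion proceeds like this (standalone sketch, excerpting the full mapping):

import re

cmu_to_ipa = {"TH": "θ", "IH": "ɪ", "NG": "ŋ", "K": "k"}  # excerpt of the table above

def to_ipa(cmu_phonemes):
    # Drop the stress digit, then map; unknown symbols fall back to lowercase.
    return [cmu_to_ipa.get(re.sub(r'[0-9]', '', p), p.lower()) for p in cmu_phonemes]

print(to_ipa(["TH", "IH1", "NG", "K"]))  # ['θ', 'ɪ', 'ŋ', 'k']
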
      def _estimate_phonemes(self, word: str) -> List[str]:
          """Estimate phonemes for unknown words"""
          phoneme_map = {
+             "ch": "tʃ", "sh": "ʃ", "th": "θ", "ph": "f", "ck": "k",
+             "ng": "ŋ", "qu": "kw", "a": "æ", "e": "ɛ", "i": "ɪ",
+             "o": "ʌ", "u": "ʌ", "b": "b", "c": "k", "d": "d",
+             "f": "f", "g": "ɡ", "h": "h", "j": "dʒ", "k": "k",
+             "l": "l", "m": "m", "n": "n", "p": "p", "r": "r",
+             "s": "s", "t": "t", "v": "v", "w": "w", "x": "ks",
+             "y": "j", "z": "z"
          }
+
          phonemes = []
          i = 0
          while i < len(word):
              if i <= len(word) - 2:
+                 two_char = word[i:i+2]
                  if two_char in phoneme_map:
+                     phonemes.append(phoneme_map[two_char])
                      i += 2
                      continue
+
              char = word[i]
              if char in phoneme_map:
+                 phonemes.append(phoneme_map[char])
              i += 1
+
          return phonemes
 
+     def _clean_text(self, text: str) -> str:
+         """Clean text for processing"""
+         text = re.sub(r"[^\w\s']", " ", text)
+         text = re.sub(r'\s+', ' ', text)
+         return text.lower().strip()
 
+     def _get_ipa(self, word: str) -> str:
+         """Get IPA transcription"""
+         try:
+             return ipa.convert(word)
+         except:
+             return f"/{word}/"
 
      def _create_phoneme_visualization(self, phonemes: List[str]) -> List[Dict]:
          """Create visualization data for phonemes"""
          visualization = []
          for phoneme in phonemes:
              color_category = self._get_phoneme_color_category(phoneme)
              visualization.append({
                  "phoneme": phoneme,
                  "color_category": color_category,
+                 "description": self._get_phoneme_description(phoneme),
+                 "difficulty": self.difficulty_scores.get(phoneme, 0.3)
              })
          return visualization
 
      def _get_phoneme_color_category(self, phoneme: str) -> str:
          """Categorize phonemes by color for visualization"""
          vowel_phonemes = {"ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u"}
+         difficult_consonants = {"θ", "ð", "v", "z", "ʒ", "r", "w"}
 
          if phoneme in vowel_phonemes:
              return "vowel"
+         elif phoneme in difficult_consonants:
+             return "difficult"
          else:
+             return "consonant"
 
      def _get_phoneme_description(self, phoneme: str) -> str:
          """Get description for a phoneme"""
          descriptions = {
              "θ": "Voiceless dental fricative (like 'th' in 'think')",
              "ð": "Voiced dental fricative (like 'th' in 'this')",
+             "v": "Voiced labiodental fricative (like 'v' in 'van')",
              "z": "Voiced alveolar fricative (like 'z' in 'zip')",
              "ʒ": "Voiced postalveolar fricative (like 's' in 'measure')",
              "r": "Alveolar approximant (like 'r' in 'red')",
              "w": "Labial-velar approximant (like 'w' in 'wet')",
+             "æ": "Near-open front unrounded vowel (like 'a' in 'cat')",
+             "ɪ": "Near-close near-front unrounded vowel (like 'i' in 'sit')",
+             "ʊ": "Near-close near-back rounded vowel (like 'u' in 'put')"
          }
          return descriptions.get(phoneme, f"Phoneme: {phoneme}")
 
+     def is_acceptable_substitution(self, reference: str, predicted: str) -> bool:
+         """Check if substitution is acceptable for Vietnamese speakers"""
+         acceptable = self.vn_substitutions.get(reference, [])
+         return predicted in acceptable
 
+     def get_difficulty_score(self, phoneme: str) -> float:
+         """Get difficulty score for phoneme"""
+         return self.difficulty_scores.get(phoneme, 0.3)
 
+ class AdvancedPhonemeComparator:
+     """Enhanced phoneme comparator using Levenshtein distance"""
 
+     def __init__(self):
+         self.g2p = EnhancedG2P()
 
+     def compare_with_levenshtein(self, reference: str, predicted: str) -> List[Dict]:
+         """Compare phonemes using Levenshtein distance for accurate alignment"""
+         ref_phones = reference.split() if reference else []
+         pred_phones = predicted.split() if predicted else []
+
+         if not ref_phones:
+             return []
+
+         # Use Levenshtein editops for precise alignment
+         ops = Levenshtein.editops(ref_phones, pred_phones)
+
          comparisons = []
+         ref_idx = 0
+         pred_idx = 0
+
+         # Process equal parts first
+         for op_type, ref_pos, pred_pos in ops:
+             # Add equal characters before this operation
+             while ref_idx < ref_pos and pred_idx < pred_pos:
+                 comparison = self._create_comparison(
+                     ref_phones[ref_idx], pred_phones[pred_idx],
+                     ErrorType.CORRECT, 1.0, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 ref_idx += 1
+                 pred_idx += 1
+
+             # Process the operation
+             if op_type == 'replace':
+                 ref_phoneme = ref_phones[ref_pos]
+                 pred_phoneme = pred_phones[pred_pos]
+
+                 if self.g2p.is_acceptable_substitution(ref_phoneme, pred_phoneme):
+                     error_type = ErrorType.ACCEPTABLE
                      score = 0.7
                  else:
+                     error_type = ErrorType.SUBSTITUTION
                      score = 0.2
+
+                 comparison = self._create_comparison(
+                     ref_phoneme, pred_phoneme, error_type, score, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 ref_idx = ref_pos + 1
+                 pred_idx = pred_pos + 1
+
+             elif op_type == 'delete':
+                 comparison = self._create_comparison(
+                     ref_phones[ref_pos], "", ErrorType.DELETION, 0.0, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 ref_idx = ref_pos + 1
+
+             elif op_type == 'insert':
+                 comparison = self._create_comparison(
+                     "", pred_phones[pred_pos], ErrorType.INSERTION, 0.0, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 pred_idx = pred_pos + 1
+
+         # Add remaining equal characters
+         while ref_idx < len(ref_phones) and pred_idx < len(pred_phones):
+             comparison = self._create_comparison(
+                 ref_phones[ref_idx], pred_phones[pred_idx],
+                 ErrorType.CORRECT, 1.0, len(comparisons)
+             )
              comparisons.append(comparison)
+             ref_idx += 1
+             pred_idx += 1
+
          return comparisons
 
+     def _create_comparison(self, ref_phoneme: str, pred_phoneme: str,
+                            error_type: ErrorType, score: float, position: int) -> Dict:
+         """Create comparison dictionary"""
+         return {
+             "position": position,
+             "reference_phoneme": ref_phoneme,
+             "learner_phoneme": pred_phoneme,
+             "status": error_type.value,
+             "score": score,
+             "difficulty": self.g2p.get_difficulty_score(ref_phoneme),
+             "error_type": error_type.value
+         }
 
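To see how editops drives the alignment above, here is a standalone sketch (not part of the commit). It assumes the rapidfuzz-backed Levenshtein package, whose editops accepts sequences such as lists of phoneme strings - the same assumption the comparator itself makes:

import Levenshtein

ref = "θ ɪ ŋ k".split()   # reference phonemes for "think"
pred = "t ɪ n k".split()  # a typical Vietnamese-accented attempt

# Each op is (op_type, ref_pos, pred_pos); equal spans are implicit between ops.
for op_type, ref_pos, pred_pos in Levenshtein.editops(ref, pred):
    ref_p = ref[ref_pos] if op_type != 'insert' else '-'
    pred_p = pred[pred_pos] if op_type != 'delete' else '-'
    print(op_type, ref_p, '->', pred_p)
# Two 'replace' ops (θ -> t, ŋ -> n); ɪ and k align as equal spans.

Both substitutions here are listed in vn_substitutions, so the comparator would mark them ACCEPTABLE at score 0.7 rather than SUBSTITUTION at 0.2.
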
+ class EnhancedWordAnalyzer:
+     """Enhanced word analyzer with character-level error mapping"""
 
      def __init__(self):
+         self.g2p = EnhancedG2P()
+         self.comparator = AdvancedPhonemeComparator()
 
+     def analyze_words_enhanced(self, reference_text: str, learner_phonemes: str,
+                                mode: AssessmentMode) -> Dict:
+         """Enhanced word analysis with character-level mapping"""
+
          # Get reference phonemes by word
          reference_words = self.g2p.text_to_phonemes(reference_text)
+
+         # Get overall phoneme comparison using Levenshtein
+         reference_phoneme_string = self.g2p.get_phoneme_string(reference_text)
+         phoneme_comparisons = self.comparator.compare_with_levenshtein(
              reference_phoneme_string, learner_phonemes
          )
+
+         # Create enhanced word highlights
+         word_highlights = self._create_enhanced_word_highlights(
+             reference_words, phoneme_comparisons, mode
          )
+
+         # Identify wrong words with character-level errors
+         wrong_words = self._identify_wrong_words_enhanced(word_highlights, phoneme_comparisons)
+
          return {
              "word_highlights": word_highlights,
              "phoneme_differences": phoneme_comparisons,
              "wrong_words": wrong_words,
+             "reference_phonemes": reference_phoneme_string,
+             "phoneme_pairs": self._create_phoneme_pairs(reference_phoneme_string, learner_phonemes)
          }
 
+     def _create_enhanced_word_highlights(self, reference_words: List[Dict],
+                                          phoneme_comparisons: List[Dict],
+                                          mode: AssessmentMode) -> List[Dict]:
+         """Create enhanced word highlights with character-level error mapping"""
+
          word_highlights = []
          phoneme_index = 0
 
          for word_data in reference_words:
              word = word_data["word"]
              num_phonemes = len(word_data["phonemes"])
 
              # Get phoneme scores for this word
              word_phoneme_scores = []
+             word_comparisons = []
+
              for j in range(num_phonemes):
                  if phoneme_index + j < len(phoneme_comparisons):
                      comparison = phoneme_comparisons[phoneme_index + j]
                      word_phoneme_scores.append(comparison["score"])
+                     word_comparisons.append(comparison)
 
              # Calculate word score
              word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0
 
+             # Map phoneme errors to character positions (enhanced for word mode)
+             character_errors = []
+             if mode == AssessmentMode.WORD:
+                 character_errors = self._map_phonemes_to_characters(word, word_comparisons)
+
+             # Create enhanced word highlight
              highlight = {
                  "word": word,
                  "score": float(word_score),
                  "phoneme_scores": word_phoneme_scores,
                  "phoneme_start_index": phoneme_index,
                  "phoneme_end_index": phoneme_index + num_phonemes - 1,
+                 "phoneme_visualization": word_data["visualization"],
+                 "character_errors": character_errors,  # New feature
+                 "detailed_analysis": mode == AssessmentMode.WORD  # Flag for UI
              }
 
              word_highlights.append(highlight)
              phoneme_index += num_phonemes
 
          return word_highlights
 
+     def _map_phonemes_to_characters(self, word: str, phoneme_comparisons: List[Dict]) -> List[CharacterError]:
+         """Map phoneme errors to character positions in word"""
+         character_errors = []
+
+         # Simple mapping strategy: distribute phonemes across characters
+         if not phoneme_comparisons or not word:
+             return character_errors
+
+         chars_per_phoneme = len(word) / len(phoneme_comparisons)
+
+         for i, comparison in enumerate(phoneme_comparisons):
+             if comparison["status"] in ["substitution", "deletion", "wrong"]:
+                 # Calculate character position
+                 char_pos = min(int(i * chars_per_phoneme), len(word) - 1)
+
+                 severity = 1.0 - comparison["score"]
+                 color = self._get_error_color(severity)
+
+                 error = CharacterError(
+                     character=word[char_pos],
+                     position=char_pos,
+                     error_type=comparison["status"],
+                     expected_sound=comparison["reference_phoneme"],
+                     actual_sound=comparison["learner_phoneme"],
+                     severity=severity,
+                     color=color
+                 )
+                 character_errors.append(error)
+
+         return character_errors
+
+     def _get_error_color(self, severity: float) -> str:
+         """Get color code for character errors"""
+         if severity >= 0.8:
+             return "#ef4444"  # Red - severe error
+         elif severity >= 0.6:
+             return "#f97316"  # Orange - moderate error
+         elif severity >= 0.4:
+             return "#eab308"  # Yellow - mild error
+         else:
+             return "#84cc16"  # Light green - minor error
 
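The proportional mapping above can be sanity-checked in isolation; this sketch (not in the commit) shows where phoneme indices land for "think" (5 letters, 4 phonemes):

word = "think"
num_phonemes = 4  # θ ɪ ŋ k
chars_per_phoneme = len(word) / num_phonemes  # 1.25

for i in range(num_phonemes):
    char_pos = min(int(i * chars_per_phoneme), len(word) - 1)
    print(i, "->", char_pos, repr(word[char_pos]))
# 0 -> 0 't', 1 -> 1 'h', 2 -> 2 'i', 3 -> 3 'n'

Note the approximation: an error on /ŋ/ highlights 'i' rather than 'nk'. The strategy trades phonetic precision for a simple, always-valid character index.
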
+     def _identify_wrong_words_enhanced(self, word_highlights: List[Dict],
+                                        phoneme_comparisons: List[Dict]) -> List[Dict]:
+         """Enhanced wrong word identification with detailed error analysis"""
+
          wrong_words = []
 
          for word_highlight in word_highlights:
+             if word_highlight["score"] < 0.6:
                  start_idx = word_highlight["phoneme_start_index"]
                  end_idx = word_highlight["phoneme_end_index"]
 
                  wrong_phonemes = []
                  missing_phonemes = []
 
                  for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
                      comparison = phoneme_comparisons[i]
 
+                     if comparison["status"] in ["wrong", "substitution"]:
+                         wrong_phonemes.append({
+                             "expected": comparison["reference_phoneme"],
+                             "actual": comparison["learner_phoneme"],
+                             "difficulty": comparison["difficulty"],
+                             "description": self.g2p._get_phoneme_description(comparison["reference_phoneme"])
+                         })
+                     elif comparison["status"] in ["missing", "deletion"]:
+                         missing_phonemes.append({
+                             "phoneme": comparison["reference_phoneme"],
+                             "difficulty": comparison["difficulty"],
+                             "description": self.g2p._get_phoneme_description(comparison["reference_phoneme"])
+                         })
 
                  wrong_word = {
                      "word": word_highlight["word"],
                      "ipa": word_highlight["ipa"],
                      "wrong_phonemes": wrong_phonemes,
                      "missing_phonemes": missing_phonemes,
+                     "tips": self._get_enhanced_vietnamese_tips(wrong_phonemes, missing_phonemes),
+                     "phoneme_visualization": word_highlight["phoneme_visualization"],
+                     "character_errors": word_highlight.get("character_errors", [])
                  }
 
                  wrong_words.append(wrong_word)
 
          return wrong_words
 
+     def _create_phoneme_pairs(self, reference: str, learner: str) -> List[Dict]:
+         """Create phoneme pairs for visualization"""
+         ref_phones = reference.split() if reference else []
+         learner_phones = learner.split() if learner else []
+
+         # Use difflib for alignment visualization
+         import difflib
+         matcher = difflib.SequenceMatcher(None, ref_phones, learner_phones)
+
+         pairs = []
+         for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+             if tag == 'equal':
+                 for k in range(i2 - i1):
+                     pairs.append({
+                         "reference": ref_phones[i1 + k],
+                         "learner": learner_phones[j1 + k],
+                         "match": True,
+                         "type": "correct"
+                     })
+             elif tag == 'replace':
+                 max_len = max(i2 - i1, j2 - j1)
+                 for k in range(max_len):
+                     ref_phoneme = ref_phones[i1 + k] if i1 + k < i2 else ""
+                     learner_phoneme = learner_phones[j1 + k] if j1 + k < j2 else ""
+                     pairs.append({
+                         "reference": ref_phoneme,
+                         "learner": learner_phoneme,
+                         "match": False,
+                         "type": "substitution"
+                     })
+             elif tag == 'delete':
+                 for k in range(i1, i2):
+                     pairs.append({
+                         "reference": ref_phones[k],
+                         "learner": "",
+                         "match": False,
+                         "type": "deletion"
+                     })
+             elif tag == 'insert':
+                 for k in range(j1, j2):
+                     pairs.append({
+                         "reference": "",
+                         "learner": learner_phones[k],
+                         "match": False,
+                         "type": "insertion"
+                     })
+
+         return pairs
+
      def _get_word_status(self, score: float) -> str:
          """Get word status from score"""
          if score >= 0.8:
 
          else:
              return "#ef4444"  # Red
 
+     def _get_enhanced_vietnamese_tips(self, wrong_phonemes: List[Dict],
+                                       missing_phonemes: List[Dict]) -> List[str]:
+         """Enhanced Vietnamese-specific pronunciation tips"""
          tips = []
 
          vietnamese_tips = {
              "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
              "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
              "z": "Giống âm 's' nhưng có rung dây thanh âm",
              "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
              "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
+             "æ": "Mở miệng rộng hơn khi phát âm 'a'",
+             "ɪ": "Âm 'i' ngắn, không kéo dài như tiếng Việt"
          }
 
          for wrong in wrong_phonemes:
              expected = wrong["expected"]
              if expected in vietnamese_tips:
+                 tips.append(f"Âm /{expected}/: {vietnamese_tips[expected]}")
 
          for missing in missing_phonemes:
              phoneme = missing["phoneme"]
              if phoneme in vietnamese_tips:
+                 tips.append(f"Thiếu âm /{phoneme}/: {vietnamese_tips[phoneme]}")
 
          return tips
 
+ class EnhancedProsodyAnalyzer:
+     """Enhanced prosody analyzer for sentence-level assessment"""
 
+     def __init__(self):
+         # Expected values for English prosody
+         self.expected_speech_rate = 4.0  # syllables per second
+         self.expected_pitch_range = 100  # Hz
+         self.expected_pitch_cv = 0.3  # coefficient of variation
 
+     def analyze_prosody_enhanced(self, audio_features: Dict, reference_text: str) -> Dict:
+         """Enhanced prosody analysis with detailed scoring"""
+
+         if "error" in audio_features:
+             return self._empty_prosody_result()
+
+         duration = audio_features.get("duration", 1)
+         pitch_data = audio_features.get("pitch", {})
+         rhythm_data = audio_features.get("rhythm", {})
+         intensity_data = audio_features.get("intensity", {})
+
+         # Calculate syllables
+         num_syllables = self._estimate_syllables(reference_text)
+         actual_speech_rate = num_syllables / duration if duration > 0 else 0
+
+         # Calculate individual prosody scores
+         pace_score = self._calculate_pace_score(actual_speech_rate)
+         intonation_score = self._calculate_intonation_score(pitch_data)
+         rhythm_score = self._calculate_rhythm_score(rhythm_data, intensity_data)
+         stress_score = self._calculate_stress_score(pitch_data, intensity_data)
+
+         # Overall prosody score
+         overall_prosody = (pace_score + intonation_score + rhythm_score + stress_score) / 4
+
+         # Generate prosody feedback
+         feedback = self._generate_prosody_feedback(
+             pace_score, intonation_score, rhythm_score, stress_score,
+             actual_speech_rate, pitch_data
+         )
+
+         return {
+             "pace_score": pace_score,
+             "intonation_score": intonation_score,
+             "rhythm_score": rhythm_score,
+             "stress_score": stress_score,
+             "overall_prosody": overall_prosody,
+             "details": {
+                 "speech_rate": actual_speech_rate,
+                 "expected_speech_rate": self.expected_speech_rate,
+                 "syllable_count": num_syllables,
+                 "duration": duration,
+                 "pitch_analysis": pitch_data,
+                 "rhythm_analysis": rhythm_data,
+                 "intensity_analysis": intensity_data
+             },
+             "feedback": feedback
+         }
 
836
+ def _calculate_pace_score(self, actual_rate: float) -> float:
837
+ """Calculate pace score based on speech rate"""
838
+ if self.expected_speech_rate == 0:
839
+ return 0.5
840
+
841
+ ratio = actual_rate / self.expected_speech_rate
842
+
843
+ if 0.8 <= ratio <= 1.2:
844
+ return 1.0
845
+ elif 0.6 <= ratio < 0.8 or 1.2 < ratio <= 1.5:
846
+ return 0.7
847
+ elif 0.4 <= ratio < 0.6 or 1.5 < ratio <= 2.0:
848
+ return 0.4
849
  else:
850
+ return 0.1
851
 
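As a worked example of the banding, with the 4.0 syllables/second target above: a 5-syllable utterance spoken in 1.5 s gives a rate of about 3.3, a ratio of 0.83, which lands in the 0.8-1.2 band and scores 1.0; the same utterance stretched to 3 s drops the ratio to about 0.42 and the score to 0.4.
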
+     def _calculate_intonation_score(self, pitch_data: Dict) -> float:
+         """Calculate intonation score based on pitch variation"""
+         pitch_range = pitch_data.get("range", 0)
+
+         if self.expected_pitch_range == 0:
+             return 0.5
+
+         ratio = pitch_range / self.expected_pitch_range
+
+         if 0.7 <= ratio <= 1.3:
+             return 1.0
+         elif 0.5 <= ratio < 0.7 or 1.3 < ratio <= 1.8:
+             return 0.7
+         elif 0.3 <= ratio < 0.5 or 1.8 < ratio <= 2.5:
+             return 0.4
+         else:
+             return 0.2
+
+     def _calculate_rhythm_score(self, rhythm_data: Dict, intensity_data: Dict) -> float:
+         """Calculate rhythm score based on tempo and intensity patterns"""
+         tempo = rhythm_data.get("tempo", 120)
+         intensity_std = intensity_data.get("rms_std", 0)
+         intensity_mean = intensity_data.get("rms_mean", 0)
+
+         # Tempo score (60-180 BPM is good for speech)
+         if 60 <= tempo <= 180:
+             tempo_score = 1.0
+         elif 40 <= tempo < 60 or 180 < tempo <= 220:
+             tempo_score = 0.6
+         else:
+             tempo_score = 0.3
+
+         # Intensity consistency score
+         if intensity_mean > 0:
+             intensity_consistency = max(0, 1.0 - (intensity_std / intensity_mean))
+         else:
+             intensity_consistency = 0.5
+
+         return (tempo_score + intensity_consistency) / 2
+
+     def _calculate_stress_score(self, pitch_data: Dict, intensity_data: Dict) -> float:
+         """Calculate stress score based on pitch and intensity variation"""
+         pitch_cv = pitch_data.get("cv", 0)
+         intensity_std = intensity_data.get("rms_std", 0)
+         intensity_mean = intensity_data.get("rms_mean", 0)
+
+         # Pitch coefficient of variation score
+         if 0.2 <= pitch_cv <= 0.4:
+             pitch_score = 1.0
+         elif 0.1 <= pitch_cv < 0.2 or 0.4 < pitch_cv <= 0.6:
+             pitch_score = 0.7
+         else:
+             pitch_score = 0.4
+
+         # Intensity variation score
+         if intensity_mean > 0:
+             intensity_cv = intensity_std / intensity_mean
+             if 0.1 <= intensity_cv <= 0.3:
+                 intensity_score = 1.0
+             elif 0.05 <= intensity_cv < 0.1 or 0.3 < intensity_cv <= 0.5:
+                 intensity_score = 0.7
+             else:
+                 intensity_score = 0.4
+         else:
+             intensity_score = 0.5
+
+         return (pitch_score + intensity_score) / 2
 
+ def _generate_prosody_feedback(self, pace_score: float, intonation_score: float,
921
+ rhythm_score: float, stress_score: float,
922
+ speech_rate: float, pitch_data: Dict) -> List[str]:
923
+ """Generate detailed prosody feedback"""
924
+ feedback = []
925
+
926
+ if pace_score < 0.5:
927
+ if speech_rate < self.expected_speech_rate * 0.8:
928
+ feedback.append("Tốc độ nói hơi chậm, thử nói nhanh hơn một chút")
929
+ else:
930
+ feedback.append("Tốc độ nói hơi nhanh, thử nói chậm lại để rõ ràng hơn")
931
+ elif pace_score >= 0.8:
932
+ feedback.append("Tốc độ nói rất tự nhiên")
933
+
934
+ if intonation_score < 0.5:
935
+ feedback.append("Cần cải thiện ngữ điệu - thay đổi cao độ giọng nhiều hơn")
936
+ elif intonation_score >= 0.8:
937
+ feedback.append("Ngữ điệu rất tự nhiên và sinh động")
938
+
939
+ if rhythm_score < 0.5:
940
+ feedback.append("Nhịp điệu cần đều hơn - chú ý đến trọng âm của từ")
941
+ elif rhythm_score >= 0.8:
942
+ feedback.append("Nhịp điệu rất tốt")
943
+
944
+ if stress_score < 0.5:
945
+ feedback.append("Cần nhấn mạnh trọng âm rõ ràng hơn")
946
+ elif stress_score >= 0.8:
947
+ feedback.append("Trọng âm được nhấn rất tốt")
948
+
949
+ return feedback
950
 
951
+ def _estimate_syllables(self, text: str) -> int:
952
+ """Estimate number of syllables in text"""
953
+ vowels = "aeiouy"
954
+ text = text.lower()
955
+ syllable_count = 0
956
+ prev_was_vowel = False
957
+
958
+ for char in text:
959
+ if char in vowels:
960
+ if not prev_was_vowel:
961
+ syllable_count += 1
962
+ prev_was_vowel = True
963
+ else:
964
+ prev_was_vowel = False
965
+
966
+ if text.endswith('e'):
967
+ syllable_count -= 1
968
+
969
+ return max(1, syllable_count)
970
 
971
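The vowel-group heuristic is easy to probe standalone (sketch, not in the commit); it is deliberately rough, e.g. adjacent vowels merge into one group:

def estimate_syllables(text: str) -> int:
    vowels = "aeiouy"
    text = text.lower()
    count, prev_was_vowel = 0, False
    for ch in text:
        if ch in vowels:
            if not prev_was_vowel:
                count += 1
            prev_was_vowel = True
        else:
            prev_was_vowel = False
    if text.endswith('e'):
        count -= 1  # crude silent-e correction
    return max(1, count)

print(estimate_syllables("hello world"))    # 3 (e, o, o) - matches the real count
print(estimate_syllables("pronunciation"))  # 4 (o, u, ia, io) - real count is 5
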
+ def _empty_prosody_result(self) -> Dict:
972
+ """Return empty prosody result for error cases"""
973
+ return {
974
+ "pace_score": 0.5,
975
+ "intonation_score": 0.5,
976
+ "rhythm_score": 0.5,
977
+ "stress_score": 0.5,
978
+ "overall_prosody": 0.5,
979
+ "details": {},
980
+ "feedback": ["Không thể phân tích ngữ điệu"]
981
+ }
982
 
 
983
 
984
+ class EnhancedFeedbackGenerator:
985
+ """Enhanced feedback generator with detailed analysis"""
986
 
987
+ def generate_enhanced_feedback(self, overall_score: float, wrong_words: List[Dict],
988
+ phoneme_comparisons: List[Dict], mode: AssessmentMode,
989
+ prosody_analysis: Dict = None) -> List[str]:
990
+ """Generate comprehensive feedback based on assessment mode"""
991
+
992
+ feedback = []
993
+
994
+ # Overall score feedback
995
+ if overall_score >= 0.9:
996
+ feedback.append("Phát âm xuất sắc! Bạn đã làm rất tốt.")
997
+ elif overall_score >= 0.8:
998
+ feedback.append("Phát âm rất tốt! Chỉ còn một vài điểm nhỏ cần cải thiện.")
999
+ elif overall_score >= 0.6:
1000
+ feedback.append("Phát âm khá tốt, còn một số điểm cần luyện tập thêm.")
1001
+ elif overall_score >= 0.4:
1002
+ feedback.append("Cần luyện tập thêm. Tập trung vào những từ được đánh dấu.")
1003
+ else:
1004
+ feedback.append("Hãy luyện tập chậm rãi và rõ ràng hơn.")
1005
 
1006
+ # Mode-specific feedback
1007
+ if mode == AssessmentMode.WORD:
1008
+ feedback.extend(self._generate_word_mode_feedback(wrong_words, phoneme_comparisons))
1009
+ elif mode == AssessmentMode.SENTENCE:
1010
+ feedback.extend(self._generate_sentence_mode_feedback(wrong_words, prosody_analysis))
1011
 
1012
+ # Common error patterns
1013
+ error_patterns = self._analyze_error_patterns(phoneme_comparisons)
1014
+ if error_patterns:
1015
+ feedback.extend(error_patterns)
 
1016
 
1017
+ return feedback
 
 
 
1018
 
1019
+     def _generate_word_mode_feedback(self, wrong_words: List[Dict],
+                                      phoneme_comparisons: List[Dict]) -> List[str]:
+         """Generate feedback specific to word mode"""
+         feedback = []
+
+         if wrong_words:
+             if len(wrong_words) == 1:
+                 word = wrong_words[0]["word"]
+                 feedback.append(f"Từ '{word}' cần luyện tập thêm")
+
+                 # Character-level feedback
+                 char_errors = wrong_words[0].get("character_errors", [])
+                 if char_errors:
+                     error_chars = [err.character for err in char_errors[:3]]
+                     feedback.append(f"Chú ý các âm: {', '.join(error_chars)}")
+             else:
+                 word_list = [w["word"] for w in wrong_words[:3]]
+                 feedback.append(f"Các từ cần luyện: {', '.join(word_list)}")
 
+         return feedback
 
+     def _generate_sentence_mode_feedback(self, wrong_words: List[Dict],
+                                          prosody_analysis: Dict) -> List[str]:
+         """Generate feedback specific to sentence mode"""
+         feedback = []
+
+         # Word-level feedback
+         if wrong_words:
+             if len(wrong_words) <= 2:
+                 word_list = [w["word"] for w in wrong_words]
+                 feedback.append(f"Cần cải thiện: {', '.join(word_list)}")
+             else:
+                 feedback.append(f"{len(wrong_words)} từ cần luyện tập")
+
+         # Prosody feedback
+         if prosody_analysis and "feedback" in prosody_analysis:
+             feedback.extend(prosody_analysis["feedback"][:2])  # Limit prosody feedback
 
+         return feedback
 
+     def _analyze_error_patterns(self, phoneme_comparisons: List[Dict]) -> List[str]:
+         """Analyze common error patterns across phonemes"""
+         feedback = []
+
+         # Count error types
+         error_counts = defaultdict(int)
+         difficult_phonemes = defaultdict(int)
+
+         for comparison in phoneme_comparisons:
+             if comparison["status"] in ["wrong", "substitution"]:
+                 phoneme = comparison["reference_phoneme"]
+                 difficult_phonemes[phoneme] += 1
+                 error_counts[comparison["status"]] += 1
+
+         # Most problematic phoneme
+         if difficult_phonemes:
+             most_difficult = max(difficult_phonemes.items(), key=lambda x: x[1])
+             if most_difficult[1] >= 2:
+                 phoneme = most_difficult[0]
+                 phoneme_tips = {
+                     "θ": "Lưỡi giữa răng, thổi nhẹ",
+                     "ð": "Lưỡi giữa răng, rung dây thanh",
+                     "v": "Môi dưới chạm răng trên",
+                     "r": "Cuộn lưỡi nhẹ",
+                     "z": "Như 's' nhưng rung dây thanh"
+                 }
+
+                 if phoneme in phoneme_tips:
+                     feedback.append(f"Âm khó nhất /{phoneme}/: {phoneme_tips[phoneme]}")
+
+         return feedback
 
+ class ProductionPronunciationAssessor:
+     """Production-ready pronunciation assessor - Enhanced version with singleton pattern"""
+
+     _instance = None
+     _initialized = False
+
+     def __new__(cls, onnx: bool = False, quantized: bool = False):
+         if cls._instance is None:
+             cls._instance = super(ProductionPronunciationAssessor, cls).__new__(cls)
+         return cls._instance
+
+     def __init__(self, onnx: bool = False, quantized: bool = False):
+         """Initialize the production-ready pronunciation assessment system (only once)"""
+         if self._initialized:
+             return
+
+         logger.info("Initializing Production Pronunciation Assessment System...")
+
+         self.asr = EnhancedWav2Vec2CharacterASR(onnx=onnx, quantized=quantized)
+         self.word_analyzer = EnhancedWordAnalyzer()
+         self.prosody_analyzer = EnhancedProsodyAnalyzer()
+         self.feedback_generator = EnhancedFeedbackGenerator()
+         self.g2p = EnhancedG2P()
+
+         ProductionPronunciationAssessor._initialized = True
+         logger.info("Production system initialization completed")
 
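A small sketch of what the __new__/_initialized pair guarantees (hypothetical usage, not in the commit): every construction returns the same object, and the heavy __init__ body runs only once:

a = ProductionPronunciationAssessor()
b = ProductionPronunciationAssessor(onnx=True)
assert a is b  # same instance

One trade-off of this style: because __init__ returns early once _initialized is set, the onnx=True request on the second call is silently ignored; the first caller's configuration wins for the process lifetime.
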
+     def assess_pronunciation(self, audio_path: str, reference_text: str,
+                              mode: str = "auto") -> Dict:
          """
+         Main assessment function with enhanced features
+
          Args:
              audio_path: Path to audio file
+             reference_text: Reference text to compare against
+             mode: Assessment mode ("word", "sentence", "auto", or legacy modes)
+
          Returns:
+             Enhanced assessment results with backward compatibility
          """
+         logger.info(f"Starting production assessment in {mode} mode...")
+         start_time = time.time()
 
+         try:
+             # Normalize and validate mode
+             assessment_mode = self._normalize_mode(mode, reference_text)
+             logger.info(f"Using assessment mode: {assessment_mode.value}")
+
+             # Step 1: Enhanced ASR transcription with features
+             asr_result = self.asr.transcribe_with_features(audio_path)
+
+             if not asr_result["character_transcript"]:
+                 return self._create_error_result("No speech detected in audio")
+
+             # Step 2: Enhanced word analysis
+             analysis_result = self.word_analyzer.analyze_words_enhanced(
+                 reference_text,
+                 asr_result["phoneme_representation"],
+                 assessment_mode
+             )
+
+             # Step 3: Calculate overall score
+             overall_score = self._calculate_overall_score(analysis_result["phoneme_differences"])
+
+             # Step 4: Prosody analysis for sentence mode
+             prosody_analysis = {}
+             if assessment_mode == AssessmentMode.SENTENCE:
+                 prosody_analysis = self.prosody_analyzer.analyze_prosody_enhanced(
+                     asr_result["audio_features"],
+                     reference_text
+                 )
+
+             # Step 5: Generate enhanced feedback
+             feedback = self.feedback_generator.generate_enhanced_feedback(
+                 overall_score,
+                 analysis_result["wrong_words"],
+                 analysis_result["phoneme_differences"],
+                 assessment_mode,
+                 prosody_analysis
+             )
+
+             # Step 6: Create phoneme comparison summary
+             phoneme_comparison_summary = self._create_phoneme_comparison_summary(
+                 analysis_result["phoneme_pairs"]
+             )
+
+             # Step 7: Assemble result with backward compatibility
+             result = self._create_enhanced_result(
+                 asr_result, analysis_result, overall_score, feedback,
+                 prosody_analysis, phoneme_comparison_summary, assessment_mode
+             )
+
+             # Add processing metadata
+             processing_time = time.time() - start_time
+             result["processing_info"] = {
+                 "processing_time": round(processing_time, 2),
+                 "mode": assessment_mode.value,
+                 "model_used": "Wav2Vec2-Enhanced",
+                 "onnx_enabled": self.asr.use_onnx,
+                 "confidence": asr_result["confidence"],
+                 "enhanced_features": True,
+                 "character_level_analysis": assessment_mode == AssessmentMode.WORD,
+                 "prosody_analysis": assessment_mode == AssessmentMode.SENTENCE
+             }
+
+             logger.info(f"Production assessment completed in {processing_time:.2f}s")
+             return result
+
+         except Exception as e:
+             logger.error(f"Production assessment error: {e}")
+             return self._create_error_result(f"Assessment failed: {str(e)}")
 
+     def _normalize_mode(self, mode: str, reference_text: str) -> AssessmentMode:
+         """Normalize mode parameter with backward compatibility"""
+
+         # Legacy mode mapping
+         legacy_mapping = {
+             "normal": AssessmentMode.AUTO,
+             "advanced": AssessmentMode.AUTO
+         }
+
+         if mode in legacy_mapping:
+             normalized_mode = legacy_mapping[mode]
+             logger.info(f"Mapped legacy mode '{mode}' to '{normalized_mode.value}'")
+             mode = normalized_mode.value
+
+         # Validate mode
+         try:
+             assessment_mode = AssessmentMode(mode)
+         except ValueError:
+             logger.warning(f"Invalid mode '{mode}', defaulting to AUTO")
+             assessment_mode = AssessmentMode.AUTO
+
+         # Auto-detect mode based on text length
+         if assessment_mode == AssessmentMode.AUTO:
+             word_count = len(reference_text.strip().split())
+             assessment_mode = AssessmentMode.WORD if word_count <= 3 else AssessmentMode.SENTENCE
+             logger.info(f"Auto-detected mode: {assessment_mode.value} (word count: {word_count})")
+
+         return assessment_mode
 
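To illustrate the rules above: mode="normal" is first mapped to AUTO, so with the reference "how are you today" (4 words) it resolves to SENTENCE, while with "hello" it resolves to WORD; an unrecognized string such as "fast" logs a warning and takes the same AUTO path.
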
+     def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
+         """Calculate weighted overall score"""
+         if not phoneme_comparisons:
+             return 0.0
+
+         total_weighted_score = 0.0
+         total_weight = 0.0
+
+         for comparison in phoneme_comparisons:
+             weight = comparison.get("difficulty", 0.5)  # Use difficulty as weight
+             score = comparison["score"]
+
+             total_weighted_score += score * weight
+             total_weight += weight
+
+         return total_weighted_score / total_weight if total_weight > 0 else 0.0
 
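A worked example of the weighting: a /θ/ scored 0.2 at difficulty 0.9 plus a /k/ scored 1.0 at difficulty 0.2 gives (0.2 * 0.9 + 1.0 * 0.2) / (0.9 + 0.2) ≈ 0.35, noticeably lower than the unweighted mean of 0.6 - errors on phonemes that are hard for Vietnamese speakers pull the overall score down more.
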
      def _create_phoneme_comparison_summary(self, phoneme_pairs: List[Dict]) -> Dict:
+         """Create phoneme comparison summary statistics"""
          total = len(phoneme_pairs)
+         if total == 0:
+             return {"total_phonemes": 0, "accuracy_percentage": 0}
+
          correct = sum(1 for pair in phoneme_pairs if pair["match"])
          substitutions = sum(1 for pair in phoneme_pairs if pair["type"] == "substitution")
          deletions = sum(1 for pair in phoneme_pairs if pair["type"] == "deletion")
 
              "substitutions": substitutions,
              "deletions": deletions,
              "insertions": insertions,
+             "accuracy_percentage": round((correct / total) * 100, 1),
+             "error_rate": round(((substitutions + deletions + insertions) / total) * 100, 1)
          }
 
+     def _create_enhanced_result(self, asr_result: Dict, analysis_result: Dict,
+                                 overall_score: float, feedback: List[str],
+                                 prosody_analysis: Dict, phoneme_summary: Dict,
+                                 assessment_mode: AssessmentMode) -> Dict:
+         """Create enhanced result with backward compatibility"""
+
+         # Base result structure (backward compatible)
+         result = {
+             "transcript": asr_result["character_transcript"],
+             "transcript_phonemes": asr_result["phoneme_representation"],
+             "user_phonemes": asr_result["phoneme_representation"],
+             "character_transcript": asr_result["character_transcript"],
+             "overall_score": overall_score,
+             "word_highlights": analysis_result["word_highlights"],
+             "phoneme_differences": analysis_result["phoneme_differences"],
+             "wrong_words": analysis_result["wrong_words"],
+             "feedback": feedback,
+         }
+
+         # Enhanced features
+         result.update({
+             "reference_phonemes": analysis_result["reference_phonemes"],
+             "phoneme_pairs": analysis_result["phoneme_pairs"],
+             "phoneme_comparison": phoneme_summary,
+             "assessment_mode": assessment_mode.value,
+         })
+
+         # Add prosody analysis for sentence mode
+         if prosody_analysis:
+             result["prosody_analysis"] = prosody_analysis
+
+         # Add character-level analysis for word mode
+         if assessment_mode == AssessmentMode.WORD:
+             result["character_level_analysis"] = True
 
+             # Add character errors to word highlights if available
+             for word_highlight in result["word_highlights"]:
+                 if "character_errors" in word_highlight:
+                     # Convert CharacterError objects to dicts for JSON serialization
+                     char_errors = []
+                     for error in word_highlight["character_errors"]:
+                         if isinstance(error, CharacterError):
+                             char_errors.append({
+                                 "character": error.character,
+                                 "position": error.position,
+                                 "error_type": error.error_type,
+                                 "expected_sound": error.expected_sound,
+                                 "actual_sound": error.actual_sound,
+                                 "severity": error.severity,
+                                 "color": error.color
+                             })
+                         else:
+                             char_errors.append(error)
+                     word_highlight["character_errors"] = char_errors
+
+         return result
 
+     def _create_error_result(self, error_message: str) -> Dict:
+         """Create error result structure"""
+         return {
+             "transcript": "",
+             "transcript_phonemes": "",
+             "user_phonemes": "",
+             "character_transcript": "",
+             "overall_score": 0.0,
+             "word_highlights": [],
+             "phoneme_differences": [],
+             "wrong_words": [],
+             "feedback": [f"Lỗi: {error_message}"],
+             "error": error_message,
+             "assessment_mode": "error",
+             "processing_info": {
+                 "processing_time": 0,
+                 "mode": "error",
+                 "model_used": "Wav2Vec2-Enhanced",
+                 "confidence": 0.0,
+                 "enhanced_features": False
+             }
+         }
+
+     def get_system_info(self) -> Dict:
+         """Get comprehensive system information"""
+         return {
+             "version": "2.1.0-production",
+             "name": "Production Pronunciation Assessment System",
+             "modes": [mode.value for mode in AssessmentMode],
+             "features": [
+                 "Enhanced Levenshtein distance phoneme alignment",
+                 "Character-level error detection (word mode)",
+                 "Advanced prosody analysis (sentence mode)",
+                 "Vietnamese speaker-specific error patterns",
+                 "Real-time confidence scoring",
+                 "IPA phonetic representation with visualization",
+                 "Backward compatibility with legacy APIs",
+                 "Production-ready error handling"
+             ],
+             "model_info": {
+                 "asr_model": self.asr.model_name,
+                 "onnx_enabled": self.asr.use_onnx,
+                 "sample_rate": self.asr.sample_rate
+             },
+             "assessment_modes": {
+                 "word": "Detailed character and phoneme level analysis for single words or short phrases",
+                 "sentence": "Word-level analysis with prosody evaluation for complete sentences",
+                 "auto": "Automatically selects mode based on text length (≤3 words = word mode)"
+             }
+         }
+
+
+ # Backward compatibility wrapper
+ class SimplePronunciationAssessor:
+     """Backward compatible wrapper for the enhanced system"""
+
+     def __init__(self):
+         print("Initializing Simple Pronunciation Assessor (Enhanced)...")
+         self.enhanced_assessor = ProductionPronunciationAssessor()
+         print("Enhanced Simple Pronunciation Assessor initialization completed")
+
+     def assess_pronunciation(self, audio_path: str, reference_text: str,
+                              mode: str = "normal") -> Dict:
+         """
+         Backward compatible assessment function
+
+         Args:
+             audio_path: Path to audio file
+             reference_text: Reference text to compare
+             mode: Assessment mode (supports legacy modes)
+         """
+         return self.enhanced_assessor.assess_pronunciation(audio_path, reference_text, mode)
+
+
+ # Example usage
+ if __name__ == "__main__":
+     # Initialize production system
+     system = ProductionPronunciationAssessor(onnx=False, quantized=False)
+
+     # Example word mode assessment
+     print("=== WORD MODE EXAMPLE ===")
+     word_result = system.assess_pronunciation(
+         audio_path="./hello_world.wav",
+         reference_text="hello",
+         mode="word"
+     )
+     # print(f"Word mode result keys: {list(word_result.keys())}")
+     print("Word result", word_result)
+
+     # Example sentence mode assessment
+     print("\n=== SENTENCE MODE EXAMPLE ===")
+     sentence_result = system.assess_pronunciation(
+         audio_path="./hello_how_are_you_today.wav",
+         reference_text="Hello, how are you today?",
+         mode="sentence"
+     )
+     print(f"Sentence mode result keys: {list(sentence_result.keys())}")
+     print("Sentence result", sentence_result)
+
+     # Example auto mode assessment
+     print("\n=== AUTO MODE EXAMPLE ===")
+     auto_result = system.assess_pronunciation(
+         audio_path="./hello_how_are_you_today.wav",
+         reference_text="world",  # Single word - should auto-select word mode
+         mode="auto"
+     )
+     print(f"Auto mode result: {auto_result['assessment_mode']}")
+     print("Auto result", auto_result)
+
+     # Backward compatibility test
+     print("\n=== BACKWARD COMPATIBILITY TEST ===")
+     legacy_assessor = SimplePronunciationAssessor()
+     legacy_result = legacy_assessor.assess_pronunciation(
+         audio_path="./hello_world.wav",
+         reference_text="pronunciation",
+         mode="normal"  # Legacy mode
+     )
+     print(f"Legacy mode result: {legacy_result}")
+     print(f"Legacy mode mapped to: {legacy_result.get('assessment_mode', 'N/A')}")
+
+     # System info
+     print(f"\n=== SYSTEM INFO ===")
+     system_info = system.get_system_info()
+     print(f"System version: {system_info['version']}")
+     print(f"Available modes: {system_info['modes']}")
+     print(f"Key features: {len(system_info['features'])} enhanced features")
src/apis/create_app.py CHANGED
@@ -6,6 +6,8 @@ from src.apis.routes.lesson_route import router as router_lesson
6
  from src.apis.routes.evaluation_route import router as router_evaluation
7
  from src.apis.routes.pronunciation_route import router as router_pronunciation
8
  from src.apis.routes.speaking_route import router as router_speaking
9
+ from src.apis.routes.ipa_route import router as router_ipa
10
+ from loguru import logger
11
 
12
  api_router = APIRouter(prefix="/api")
13
  api_router.include_router(router_user)
@@ -14,6 +16,7 @@ api_router.include_router(router_lesson)
16
  api_router.include_router(router_evaluation)
17
  api_router.include_router(router_pronunciation)
18
  api_router.include_router(router_speaking)
19
+ api_router.include_router(router_ipa)
20
 
21
 
22
  def create_app():
@@ -27,4 +30,19 @@ def create_app():
30
  allow_headers=["*"],
31
  )
32
 
33
+ @app.on_event("startup")
34
+ async def startup_event():
35
+ """Pre-initialize assessor on server startup for better performance"""
36
+ try:
37
+ logger.info("Pre-initializing ProductionPronunciationAssessor...")
38
+ from src.apis.routes.speaking_route import get_assessor
39
+ from src.apis.routes.ipa_route import get_assessor as get_ipa_assessor
40
+
41
+ # Pre-initialize both assessors (they share the same singleton)
42
+ get_assessor()
43
+ get_ipa_assessor()
44
+ logger.info("ProductionPronunciationAssessor pre-initialization completed!")
45
+ except Exception as e:
46
+ logger.error(f"Failed to pre-initialize assessor: {e}")
47
+
48
  return app
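
Note: @app.on_event("startup") is deprecated in recent FastAPI releases in favor of a lifespan handler. An equivalent sketch (not part of this commit) that keeps the same pre-initialization behavior:

    from contextlib import asynccontextmanager

    from fastapi import FastAPI
    from loguru import logger

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Pre-initialize the shared assessor before the app starts serving
        try:
            from src.apis.routes.speaking_route import get_assessor
            get_assessor()
            logger.info("Assessor pre-initialization completed")
        except Exception as e:
            logger.error(f"Failed to pre-initialize assessor: {e}")
        yield  # the application runs here; shutdown cleanup would go after

    app = FastAPI(lifespan=lifespan)
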
src/apis/routes/ipa_route.py ADDED
@@ -0,0 +1,1763 @@
1
+ from fastapi import APIRouter, HTTPException, Query, UploadFile, File, Form
2
+ from pydantic import BaseModel
3
+ from typing import List, Dict, Optional
5
+ import random
7
+ import tempfile
8
+ import os
11
+ from loguru import logger
12
+ from src.apis.controllers.speaking_controller import (
13
+ EnhancedG2P,
14
+ ProductionPronunciationAssessor,
15
+ )
16
+
17
+
18
+ class CharacterMapping(BaseModel):
19
+ ipa_symbol: Optional[str] = None
20
+ grapheme: Optional[str] = None
21
+ start_index: Optional[int] = None
22
+ end_index: Optional[int] = None
23
+ characters: Optional[str] = None
24
+ chars: Optional[str] = None
25
+ ipa: Optional[str] = None
26
+ start: Optional[int] = None
27
+ end: Optional[int] = None
28
+
29
+
30
+ router = APIRouter(prefix="/ipa", tags=["IPA Training"])
31
+
32
+ # Initialize G2P converter and assessment system once (singleton pattern)
33
+ g2p = EnhancedG2P()
34
+ # Global assessor instance - will be initialized once due to singleton pattern
35
+ global_assessor = None
36
+
37
+
38
+ def get_assessor():
39
+ """Get or create the global assessor instance"""
40
+ global global_assessor
41
+ if global_assessor is None:
42
+ logger.info("Creating global ProductionPronunciationAssessor instance...")
43
+ global_assessor = ProductionPronunciationAssessor()
44
+ return global_assessor
45
+
46
+
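
Note: this lazy singleton is not thread-safe; two requests arriving before the first initialization completes could construct the heavy model twice. A lock-guarded variant (a sketch, not part of the commit, reusing this module's global_assessor) would be:

    import threading

    _assessor_lock = threading.Lock()

    def get_assessor_threadsafe():
        """Double-checked locking so the model loads exactly once."""
        global global_assessor
        if global_assessor is None:
            with _assessor_lock:
                if global_assessor is None:
                    global_assessor = ProductionPronunciationAssessor()
        return global_assessor

In practice the startup hook in create_app.py sidesteps the race, since the instance already exists before traffic arrives.
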
47
+ def map_ipa_to_characters(word: str, ipa_symbol: str) -> List[CharacterMapping]:
48
+ """
49
+ Map IPA symbols to their corresponding characters in the word
50
+ Returns a list of character mappings for highlighting
51
+ """
52
+ # Common IPA to grapheme mappings
53
+ ipa_mappings = {
54
+ # Vowels
55
+ "i": [
56
+ "ee",
57
+ "ea",
58
+ "e",
59
+ "ie",
60
+ "ei",
61
+ "i",
62
+ ], # see, eat, me, piece, receive, machine
63
+ "ɪ": ["i", "y", "ui", "e"], # sit, gym, build, women
64
+ "u": ["oo", "u", "ou", "ue", "ui", "o"], # food, flu, soup, true, fruit, do
65
+ "ʊ": ["oo", "u", "ou"], # book, put, could
66
+ "ɛ": ["e", "ea", "ai", "a"], # bed, head, said, many
67
+ "ə": [
68
+ "a",
69
+ "e",
70
+ "i",
71
+ "o",
72
+ "u",
73
+ "ou",
74
+ "ar",
75
+ "er",
76
+ "or",
77
+ ], # about, taken, pencil, lemon, circus, famous, dollar, butter, doctor
78
+ "ʌ": ["u", "o", "ou", "oo"], # cup, love, country, blood
79
+ "ɑ": ["a", "o", "au"], # father, hot, aunt
80
+ "æ": ["a"], # cat, apple
81
+ "ɔ": ["o", "aw", "au", "a", "ou"], # saw, law, caught, all, thought
82
+ # Diphthongs
83
+ "eɪ": ["a", "ai", "ay", "ei", "ey", "ea"], # say, wait, day, eight, grey, break
84
+ "aɪ": ["i", "y", "ie", "uy", "ai", "igh"], # my, fly, pie, buy, aisle, night
85
+ "ɔɪ": ["oy", "oi"], # boy, coin
86
+ "aʊ": ["ou", "ow"], # how, house
87
+ "oʊ": ["o", "oa", "ow", "oe", "ou"], # go, boat, show, toe, soul
88
+ # Consonants
89
+ "p": ["p", "pp"], # pen, apple
90
+ "b": ["b", "bb"], # boy, rabbit
91
+ "t": ["t", "tt", "ed"], # top, butter, walked
92
+ "d": ["d", "dd", "ed"], # dog, ladder, played
93
+ "k": ["c", "k", "ck", "ch", "qu"], # cat, key, back, school, queen
94
+ "g": ["g", "gg", "gh", "gu"], # go, egg, ghost, guard
95
+ "f": ["f", "ff", "ph", "gh"], # fish, off, phone, laugh
96
+ "v": ["v", "ve"], # very, have
97
+ "θ": ["th"], # think
98
+ "ð": ["th"], # this
99
+ "s": ["s", "ss", "c", "sc", "ps"], # see, miss, city, scene, psychology
100
+ "z": ["z", "zz", "s", "se", "ze"], # zoo, buzz, is, rose, froze
101
+ "ʃ": [
102
+ "sh",
103
+ "s",
104
+ "ss",
105
+ "ch",
106
+ "ci",
107
+ "ti",
108
+ ], # ship, sure, mission, machine, special, nation
109
+ "ʒ": ["s", "si", "ge"], # measure, vision, garage
110
+ "tʃ": ["ch", "tch", "t"], # chair, watch, nature
111
+ "dʒ": ["j", "ge", "dge", "g"], # job, age, bridge, gym
112
+ "m": ["m", "mm", "mb"], # man, hammer, lamb
113
+ "n": ["n", "nn", "kn", "gn"], # no, dinner, knee, sign
114
+ "ŋ": ["ng", "n"], # sing, think
115
+ "l": ["l", "ll"], # love, hello
116
+ "r": ["r", "rr", "wr"], # red, sorry, write
117
+ "j": ["y", "i", "j"], # yes, onion, hallelujah
118
+ "w": ["w", "wh", "qu", "u"], # we, what, queen, language
119
+ "h": ["h", "wh"], # house, who
120
+ }
121
+
122
+ # Get possible grapheme representations for the IPA symbol
123
+ possible_graphemes = ipa_mappings.get(ipa_symbol, [])
124
+
125
+ # Find the best match in the word
126
+ word_lower = word.lower()
127
+ mappings = []
128
+
129
+ for grapheme in possible_graphemes:
130
+ start_pos = word_lower.find(grapheme)
131
+ if start_pos != -1:
132
+ mappings.append(
133
+ CharacterMapping(
134
+ ipa_symbol=ipa_symbol,
135
+ grapheme=grapheme,
136
+ start_index=start_pos,
137
+ end_index=start_pos + len(grapheme),
138
+ characters=word[start_pos : start_pos + len(grapheme)],
139
+ )
140
+ )
141
+ break # Use the first match found
142
+
143
+ # If no direct match found, try to match individual characters
144
+ if not mappings and ipa_symbol in word_lower:
145
+ start_pos = word_lower.find(ipa_symbol)
146
+ if start_pos != -1:
147
+ mappings.append(
148
+ CharacterMapping(
149
+ ipa_symbol=ipa_symbol,
150
+ grapheme=ipa_symbol,
151
+ start_index=start_pos,
152
+ end_index=start_pos + len(ipa_symbol),
153
+ characters=word[start_pos : start_pos + len(ipa_symbol)],
154
+ )
155
+ )
156
+
157
+ return mappings
158
+
159
+
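
For reference, given the table above, map_ipa_to_characters("see", "i") takes the first grapheme candidate that occurs in the word ("ee" at index 1):

    m = map_ipa_to_characters("see", "i")[0]
    assert (m.grapheme, m.start_index, m.end_index, m.characters) == ("ee", 1, 3, "ee")
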
160
+ def map_word_to_phonemes(word: str, ipa_transcription: str) -> List[CharacterMapping]:
161
+ """
162
+ Map an entire word to its phoneme sequence
163
+ Returns detailed character to IPA mappings for the whole word
164
+ """
165
+ # Clean the IPA transcription
166
+ clean_ipa = ipa_transcription.strip("/").replace("ˈ", "").replace("ˌ", "")
167
+
168
+ # Common word-to-IPA mappings for better accuracy
169
+ word_mappings = {
170
+ # Easy words
171
+ "cat": [
172
+ CharacterMapping(
173
+ characters="c", ipa_symbol="k", start_index=0, end_index=1
174
+ ),
175
+ CharacterMapping(
176
+ characters="a", ipa_symbol="æ", start_index=1, end_index=2
177
+ ),
178
+ CharacterMapping(
179
+ characters="t", ipa_symbol="t", start_index=2, end_index=3
180
+ ),
181
+ ],
182
+ "dog": [
183
+ CharacterMapping(
184
+ characters="d", ipa_symbol="d", start_index=0, end_index=1
185
+ ),
186
+ CharacterMapping(
187
+ characters="o", ipa_symbol="ɔ", start_index=1, end_index=2
188
+ ),
189
+ CharacterMapping(
190
+ characters="g", ipa_symbol="g", start_index=2, end_index=3
191
+ ),
192
+ ],
193
+ "pen": [
194
+ CharacterMapping(
195
+ characters="p", ipa_symbol="p", start_index=0, end_index=1
196
+ ),
197
+ CharacterMapping(
198
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
199
+ ),
200
+ CharacterMapping(
201
+ characters="n", ipa_symbol="n", start_index=2, end_index=3
202
+ ),
203
+ ],
204
+ "see": [
205
+ CharacterMapping(
206
+ characters="s", ipa_symbol="s", start_index=0, end_index=1
207
+ ),
208
+ CharacterMapping(
209
+ characters="ee", ipa_symbol="i", start_index=1, end_index=3
210
+ ),
211
+ ],
212
+ "bed": [
213
+ CharacterMapping(
214
+ characters="b", ipa_symbol="b", start_index=0, end_index=1
215
+ ),
216
+ CharacterMapping(
217
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
218
+ ),
219
+ CharacterMapping(
220
+ characters="d", ipa_symbol="d", start_index=2, end_index=3
221
+ ),
222
+ ],
223
+ "fish": [
224
+ CharacterMapping(
225
+ characters="f", ipa_symbol="f", start_index=0, end_index=1
226
+ ),
227
+ CharacterMapping(
228
+ characters="i", ipa_symbol="ɪ", start_index=1, end_index=2
229
+ ),
230
+ CharacterMapping(
231
+ characters="sh", ipa_symbol="ʃ", start_index=2, end_index=4
232
+ ),
233
+ ],
234
+ "book": [
235
+ CharacterMapping(
236
+ characters="b", ipa_symbol="b", start_index=0, end_index=1
237
+ ),
238
+ CharacterMapping(
239
+ characters="oo", ipa_symbol="ʊ", start_index=1, end_index=3
240
+ ),
241
+ CharacterMapping(
242
+ characters="k", ipa_symbol="k", start_index=3, end_index=4
243
+ ),
244
+ ],
245
+ "food": [
246
+ CharacterMapping(
247
+ characters="f", ipa_symbol="f", start_index=0, end_index=1
248
+ ),
249
+ CharacterMapping(
250
+ characters="oo", ipa_symbol="u", start_index=1, end_index=3
251
+ ),
252
+ CharacterMapping(
253
+ characters="d", ipa_symbol="d", start_index=3, end_index=4
254
+ ),
255
+ ],
256
+ "man": [
257
+ CharacterMapping(
258
+ characters="m", ipa_symbol="m", start_index=0, end_index=1
259
+ ),
260
+ CharacterMapping(
261
+ characters="a", ipa_symbol="æ", start_index=1, end_index=2
262
+ ),
263
+ CharacterMapping(
264
+ characters="n", ipa_symbol="n", start_index=2, end_index=3
265
+ ),
266
+ ],
267
+ "sun": [
268
+ CharacterMapping(
269
+ characters="s", ipa_symbol="s", start_index=0, end_index=1
270
+ ),
271
+ CharacterMapping(
272
+ characters="u", ipa_symbol="ʌ", start_index=1, end_index=2
273
+ ),
274
+ CharacterMapping(
275
+ characters="n", ipa_symbol="n", start_index=2, end_index=3
276
+ ),
277
+ ],
278
+ # Medium words
279
+ "chair": [
280
+ CharacterMapping(
281
+ characters="ch", ipa_symbol="tʃ", start_index=0, end_index=2
282
+ ),
283
+ CharacterMapping(
284
+ characters="ai", ipa_symbol="ɛ", start_index=2, end_index=4
285
+ ),
286
+ CharacterMapping(
287
+ characters="r", ipa_symbol="r", start_index=4, end_index=5
288
+ ),
289
+ ],
290
+ "water": [
291
+ CharacterMapping(
292
+ characters="w", ipa_symbol="w", start_index=0, end_index=1
293
+ ),
294
+ CharacterMapping(
295
+ characters="a", ipa_symbol="ɔ", start_index=1, end_index=2
296
+ ),
297
+ CharacterMapping(
298
+ characters="t", ipa_symbol="t", start_index=2, end_index=3
299
+ ),
300
+ CharacterMapping(
301
+ characters="er", ipa_symbol="ər", start_index=3, end_index=5
302
+ ),
303
+ ],
304
+ "house": [
305
+ CharacterMapping(
306
+ characters="h", ipa_symbol="h", start_index=0, end_index=1
307
+ ),
308
+ CharacterMapping(
309
+ characters="ou", ipa_symbol="aʊ", start_index=1, end_index=3
310
+ ),
311
+ CharacterMapping(
312
+ characters="se", ipa_symbol="s", start_index=3, end_index=5
313
+ ),
314
+ ],
315
+ "yellow": [
316
+ CharacterMapping(
317
+ characters="y", ipa_symbol="j", start_index=0, end_index=1
318
+ ),
319
+ CharacterMapping(
320
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
321
+ ),
322
+ CharacterMapping(
323
+ characters="ll", ipa_symbol="l", start_index=2, end_index=4
324
+ ),
325
+ CharacterMapping(
326
+ characters="ow", ipa_symbol="oʊ", start_index=4, end_index=6
327
+ ),
328
+ ],
329
+ "about": [
330
+ CharacterMapping(
331
+ characters="a", ipa_symbol="ə", start_index=0, end_index=1
332
+ ),
333
+ CharacterMapping(
334
+ characters="b", ipa_symbol="b", start_index=1, end_index=2
335
+ ),
336
+ CharacterMapping(
337
+ characters="ou", ipa_symbol="aʊ", start_index=2, end_index=4
338
+ ),
339
+ CharacterMapping(
340
+ characters="t", ipa_symbol="t", start_index=4, end_index=5
341
+ ),
342
+ ],
343
+ # Hard words
344
+ "think": [
345
+ CharacterMapping(
346
+ characters="th", ipa_symbol="θ", start_index=0, end_index=2
347
+ ),
348
+ CharacterMapping(
349
+ characters="i", ipa_symbol="ɪ", start_index=2, end_index=3
350
+ ),
351
+ CharacterMapping(
352
+ characters="nk", ipa_symbol="ŋk", start_index=3, end_index=5
353
+ ),
354
+ ],
355
+ "this": [
356
+ CharacterMapping(
357
+ characters="th", ipa_symbol="ð", start_index=0, end_index=2
358
+ ),
359
+ CharacterMapping(
360
+ characters="i", ipa_symbol="ɪ", start_index=2, end_index=3
361
+ ),
362
+ CharacterMapping(
363
+ characters="s", ipa_symbol="s", start_index=3, end_index=4
364
+ ),
365
+ ],
366
+ "very": [
367
+ CharacterMapping(
368
+ characters="v", ipa_symbol="v", start_index=0, end_index=1
369
+ ),
370
+ CharacterMapping(
371
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
372
+ ),
373
+ CharacterMapping(
374
+ characters="r", ipa_symbol="r", start_index=2, end_index=3
375
+ ),
376
+ CharacterMapping(
377
+ characters="y", ipa_symbol="i", start_index=3, end_index=4
378
+ ),
379
+ ],
380
+ "through": [
381
+ CharacterMapping(
382
+ characters="th", ipa_symbol="θ", start_index=0, end_index=2
383
+ ),
384
+ CharacterMapping(
385
+ characters="r", ipa_symbol="r", start_index=2, end_index=3
386
+ ),
387
+ CharacterMapping(
388
+ characters="ough", ipa_symbol="u", start_index=3, end_index=7
389
+ ),
390
+ ],
391
+ "measure": [
392
+ CharacterMapping(
393
+ characters="m", ipa_symbol="m", start_index=0, end_index=1
394
+ ),
395
+ CharacterMapping(
396
+ characters="ea", ipa_symbol="ɛ", start_index=1, end_index=3
397
+ ),
398
+ CharacterMapping(
399
+ characters="s", ipa_symbol="ʒ", start_index=3, end_index=4
400
+ ),
401
+ CharacterMapping(
402
+ characters="ure", ipa_symbol="ər", start_index=4, end_index=7
403
+ ),
404
+ ],
405
+ }
406
+
407
+ # Check if we have a predefined mapping
408
+ if word.lower() in word_mappings:
409
+ return word_mappings[word.lower()]
410
+
411
+ # If no predefined mapping, try to create a basic mapping
412
+ # This is a simplified approach - in production, you'd use a more sophisticated G2P system
413
+ mappings = []
415
+
416
+ # Basic character-by-character mapping (fallback)
417
+ for i, char in enumerate(word.lower()):
418
+ if char.isalpha():
419
+ mappings.append(
420
+ CharacterMapping(
421
+ characters=word[i],
422
+ ipa_symbol=char, # Simplified - would need actual phoneme mapping
423
+ start_index=i,
424
+ end_index=i + 1,
425
+ )
426
+ )
427
+
428
+ return mappings
429
+
430
+
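
Usage note: curated words return hand-written mappings, while the fallback maps each letter to itself, so digraphs get split; both paths are visible here:

    fish = map_word_to_phonemes("fish", "/fɪʃ/")
    assert [(m.characters, m.ipa_symbol) for m in fish] == [("f", "f"), ("i", "ɪ"), ("sh", "ʃ")]

    # "myth" has no curated entry, so the letter-for-letter fallback fires
    # and /θ/ never appears -- "th" comes back as two pseudo-phonemes.
    myth = map_word_to_phonemes("myth", "/mɪθ/")
    assert [m.ipa_symbol for m in myth] == ["m", "y", "t", "h"]
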
431
+ class IPASymbol(BaseModel):
432
+ symbol: str
433
+ description: str
434
+ example_word: str
435
+ audio_example: Optional[str] = None
436
+ category: str # vowel, consonant, diphthong
437
+ difficulty_level: str # easy, medium, hard
438
+ vietnamese_tip: str
439
+ character_mapping: Optional[List[CharacterMapping]] = None
440
+
441
+
442
+ class IPALesson(BaseModel):
443
+ id: str
444
+ title: str
445
+ description: str
446
+ symbols: List[IPASymbol]
447
+ difficulty: str
448
+ estimated_time: int # minutes
449
+
450
+
451
+ class IPAWord(BaseModel):
452
+ word: str
453
+ ipa: str
454
+ phonemes: List[str]
455
+ difficulty: str
456
+ meaning: str
457
+ example_sentence: str
458
+ character_mapping: Optional[List[CharacterMapping]] = None
459
+
460
+
461
+ class IPAExercise(BaseModel):
462
+ word: str
463
+ ipa: str
464
+ phonemes: List[str]
465
+ hints: List[str]
466
+ difficulty: str
467
+
468
+
469
+ # IPA Symbol data for Vietnamese learners
470
+ IPA_SYMBOLS_DATA = {
471
+ # Vowels - Easy
472
+ "i": {
473
+ "desc": "High front unrounded vowel",
474
+ "word": "see",
475
+ "tip": "Như âm 'i' trong tiếng Việt nhưng dài hơn",
476
+ "category": "vowel",
477
+ "difficulty": "easy",
478
+ },
479
+ "u": {
480
+ "desc": "High back rounded vowel",
481
+ "word": "food",
482
+ "tip": "Như âm 'u' trong tiếng Việt nhưng dài hơn",
483
+ "category": "vowel",
484
+ "difficulty": "easy",
485
+ },
486
+ "ɑ": {
487
+ "desc": "Low back unrounded vowel",
488
+ "word": "father",
489
+ "tip": "Mở miệng rộng, âm 'a' sâu",
490
+ "category": "vowel",
491
+ "difficulty": "easy",
492
+ },
493
+ "ɛ": {
494
+ "desc": "Mid front unrounded vowel",
495
+ "word": "bed",
496
+ "tip": "Giống âm 'e' trong 'đẹp'",
497
+ "category": "vowel",
498
+ "difficulty": "easy",
499
+ },
500
+ "ɔ": {
501
+ "desc": "Mid back rounded vowel",
502
+ "word": "saw",
503
+ "tip": "Âm 'o' tròn môi",
504
+ "category": "vowel",
505
+ "difficulty": "easy",
506
+ },
507
+ # Vowels - Medium
508
+ "ɪ": {
509
+ "desc": "Near-close near-front unrounded vowel",
510
+ "word": "sit",
511
+ "tip": "Âm 'i' ngắn, không kéo dài",
512
+ "category": "vowel",
513
+ "difficulty": "medium",
514
+ },
515
+ "ʊ": {
516
+ "desc": "Near-close near-back rounded vowel",
517
+ "word": "put",
518
+ "tip": "Âm 'u' ngắn, tròn môi nhẹ",
519
+ "category": "vowel",
520
+ "difficulty": "medium",
521
+ },
522
+ "ʌ": {
523
+ "desc": "Mid central unrounded vowel",
524
+ "word": "cup",
525
+ "tip": "Âm 'ơ' nhưng mở miệng hơn",
526
+ "category": "vowel",
527
+ "difficulty": "medium",
528
+ },
529
+ "æ": {
530
+ "desc": "Near-open front unrounded vowel",
531
+ "word": "cat",
532
+ "tip": "Mở miệng rộng, âm 'a' phẳng",
533
+ "category": "vowel",
534
+ "difficulty": "medium",
535
+ },
536
+ "ə": {
537
+ "desc": "Schwa - mid central vowel",
538
+ "word": "about",
539
+ "tip": "Âm yếu 'ơ', thư giãn cơ miệng",
540
+ "category": "vowel",
541
+ "difficulty": "medium",
542
+ },
543
+ # Diphthongs
544
+ "eɪ": {
545
+ "desc": "Diphthong from e to i",
546
+ "word": "say",
547
+ "tip": "Từ 'e' trượt lên 'i'",
548
+ "category": "diphthong",
549
+ "difficulty": "medium",
550
+ },
551
+ "aɪ": {
552
+ "desc": "Diphthong from a to i",
553
+ "word": "my",
554
+ "tip": "Từ 'a' trượt lên 'i'",
555
+ "category": "diphthong",
556
+ "difficulty": "medium",
557
+ },
558
+ "ɔɪ": {
559
+ "desc": "Diphthong from o to i",
560
+ "word": "boy",
561
+ "tip": "Từ 'o' trượt lên 'i'",
562
+ "category": "diphthong",
563
+ "difficulty": "medium",
564
+ },
565
+ "aʊ": {
566
+ "desc": "Diphthong from a to u",
567
+ "word": "how",
568
+ "tip": "Từ 'a' trượt lên 'u'",
569
+ "category": "diphthong",
570
+ "difficulty": "medium",
571
+ },
572
+ "oʊ": {
573
+ "desc": "Diphthong from o to u",
574
+ "word": "go",
575
+ "tip": "Từ 'o' trượt lên 'u'",
576
+ "category": "diphthong",
577
+ "difficulty": "medium",
578
+ },
579
+ # Consonants - Easy
580
+ "p": {
581
+ "desc": "Voiceless bilabial plosive",
582
+ "word": "pen",
583
+ "tip": "Âm 'p' không thở ra",
584
+ "category": "consonant",
585
+ "difficulty": "easy",
586
+ },
587
+ "b": {
588
+ "desc": "Voiced bilabial plosive",
589
+ "word": "boy",
590
+ "tip": "Âm 'b' có rung dây thanh",
591
+ "category": "consonant",
592
+ "difficulty": "easy",
593
+ },
594
+ "t": {
595
+ "desc": "Voiceless alveolar plosive",
596
+ "word": "top",
597
+ "tip": "Âm 't' lưỡi chạm nướu",
598
+ "category": "consonant",
599
+ "difficulty": "easy",
600
+ },
601
+ "d": {
602
+ "desc": "Voiced alveolar plosive",
603
+ "word": "dog",
604
+ "tip": "Âm 'd' có rung d��y thanh",
605
+ "category": "consonant",
606
+ "difficulty": "easy",
607
+ },
608
+ "k": {
609
+ "desc": "Voiceless velar plosive",
610
+ "word": "cat",
611
+ "tip": "Âm 'k' cuống họng",
612
+ "category": "consonant",
613
+ "difficulty": "easy",
614
+ },
615
+ "g": {
616
+ "desc": "Voiced velar plosive",
617
+ "word": "go",
618
+ "tip": "Âm 'g' có rung dây thanh",
619
+ "category": "consonant",
620
+ "difficulty": "easy",
621
+ },
622
+ "m": {
623
+ "desc": "Bilabial nasal",
624
+ "word": "man",
625
+ "tip": "Âm 'm' qua mũi",
626
+ "category": "consonant",
627
+ "difficulty": "easy",
628
+ },
629
+ "n": {
630
+ "desc": "Alveolar nasal",
631
+ "word": "no",
632
+ "tip": "Âm 'n' lưỡi chạm nướu",
633
+ "category": "consonant",
634
+ "difficulty": "easy",
635
+ },
636
+ "s": {
637
+ "desc": "Voiceless alveolar fricative",
638
+ "word": "see",
639
+ "tip": "Âm 's' rít",
640
+ "category": "consonant",
641
+ "difficulty": "easy",
642
+ },
643
+ "f": {
644
+ "desc": "Voiceless labiodental fricative",
645
+ "word": "fish",
646
+ "tip": "Môi dưới chạm răng trên",
647
+ "category": "consonant",
648
+ "difficulty": "easy",
649
+ },
650
+ # Consonants - Medium
651
+ "ʃ": {
652
+ "desc": "Voiceless postalveolar fricative",
653
+ "word": "ship",
654
+ "tip": "Âm 'sh', lưỡi cong",
655
+ "category": "consonant",
656
+ "difficulty": "medium",
657
+ },
658
+ "ʒ": {
659
+ "desc": "Voiced postalveolar fricative",
660
+ "word": "measure",
661
+ "tip": "Như 'ʃ' nhưng có rung dây thanh",
662
+ "category": "consonant",
663
+ "difficulty": "medium",
664
+ },
665
+ "tʃ": {
666
+ "desc": "Voiceless postalveolar affricate",
667
+ "word": "chair",
668
+ "tip": "Âm 'ch', từ 't' + 'ʃ'",
669
+ "category": "consonant",
670
+ "difficulty": "medium",
671
+ },
672
+ "dʒ": {
673
+ "desc": "Voiced postalveolar affricate",
674
+ "word": "job",
675
+ "tip": "Từ 'd' + 'ʒ'",
676
+ "category": "consonant",
677
+ "difficulty": "medium",
678
+ },
679
+ "l": {
680
+ "desc": "Lateral approximant",
681
+ "word": "love",
682
+ "tip": "Lưỡi chạm nướu, âm thoát hai bên",
683
+ "category": "consonant",
684
+ "difficulty": "medium",
685
+ },
686
+ "r": {
687
+ "desc": "Approximant",
688
+ "word": "red",
689
+ "tip": "Cuộn lưỡi nhẹ, không chạm vòm",
690
+ "category": "consonant",
691
+ "difficulty": "medium",
692
+ },
693
+ "j": {
694
+ "desc": "Palatal approximant",
695
+ "word": "yes",
696
+ "tip": "Âm 'y', lưỡi gần vòm miệng",
697
+ "category": "consonant",
698
+ "difficulty": "medium",
699
+ },
700
+ "w": {
701
+ "desc": "Labial-velar approximant",
702
+ "word": "we",
703
+ "tip": "Tròn môi như 'u', không dùng răng",
704
+ "category": "consonant",
705
+ "difficulty": "medium",
706
+ },
707
+ "h": {
708
+ "desc": "Glottal fricative",
709
+ "word": "house",
710
+ "tip": "Thở ra nhẹ từ họng",
711
+ "category": "consonant",
712
+ "difficulty": "medium",
713
+ },
714
+ "z": {
715
+ "desc": "Voiced alveolar fricative",
716
+ "word": "zoo",
717
+ "tip": "Như 's' nhưng có rung dây thanh",
718
+ "category": "consonant",
719
+ "difficulty": "medium",
720
+ },
721
+ # Consonants - Hard (for Vietnamese speakers)
722
+ "θ": {
723
+ "desc": "Voiceless dental fricative",
724
+ "word": "think",
725
+ "tip": "Lưỡi giữa răng, thổi nhẹ",
726
+ "category": "consonant",
727
+ "difficulty": "hard",
728
+ },
729
+ "ð": {
730
+ "desc": "Voiced dental fricative",
731
+ "word": "this",
732
+ "tip": "Lưỡi giữa răng, rung dây thanh",
733
+ "category": "consonant",
734
+ "difficulty": "hard",
735
+ },
736
+ "v": {
737
+ "desc": "Voiced labiodental fricative",
738
+ "word": "very",
739
+ "tip": "Môi dưới chạm răng trên, rung dây thanh",
740
+ "category": "consonant",
741
+ "difficulty": "hard",
742
+ },
743
+ "ŋ": {
744
+ "desc": "Velar nasal",
745
+ "word": "sing",
746
+ "tip": "Âm 'ng' cuối từ",
747
+ "category": "consonant",
748
+ "difficulty": "hard",
749
+ },
750
+ }
751
+
752
+ # Sample word database for each difficulty level
753
+ SAMPLE_WORDS = {
754
+ "easy": [
755
+ {
756
+ "word": "cat",
757
+ "ipa": "/kæt/",
758
+ "meaning": "con mèo",
759
+ "sentence": "The cat is sleeping.",
760
+ },
761
+ {
762
+ "word": "dog",
763
+ "ipa": "/dɔg/",
764
+ "meaning": "con chó",
765
+ "sentence": "I love my dog.",
766
+ },
767
+ {
768
+ "word": "man",
769
+ "ipa": "/mæn/",
770
+ "meaning": "người đàn ông",
771
+ "sentence": "The man is tall.",
772
+ },
773
+ {
774
+ "word": "pen",
775
+ "ipa": "/pɛn/",
776
+ "meaning": "cái bút",
777
+ "sentence": "I need a pen.",
778
+ },
779
+ {
780
+ "word": "sun",
781
+ "ipa": "/sʌn/",
782
+ "meaning": "mặt trời",
783
+ "sentence": "The sun is bright.",
784
+ },
785
+ {
786
+ "word": "fish",
787
+ "ipa": "/fɪʃ/",
788
+ "meaning": "con cá",
789
+ "sentence": "Fish live in water.",
790
+ },
791
+ {
792
+ "word": "book",
793
+ "ipa": "/bʊk/",
794
+ "meaning": "quyển sách",
795
+ "sentence": "I read a book.",
796
+ },
797
+ {
798
+ "word": "food",
799
+ "ipa": "/fud/",
800
+ "meaning": "thức ăn",
801
+ "sentence": "I like good food.",
802
+ },
803
+ {
804
+ "word": "see",
805
+ "ipa": "/si/",
806
+ "meaning": "nhìn thấy",
807
+ "sentence": "I can see you.",
808
+ },
809
+ {
810
+ "word": "bed",
811
+ "ipa": "/bɛd/",
812
+ "meaning": "giường",
813
+ "sentence": "I sleep in my bed.",
814
+ },
815
+ ],
816
+ "medium": [
817
+ {
818
+ "word": "water",
819
+ "ipa": "/ˈwɔtər/",
820
+ "meaning": "nước",
821
+ "sentence": "I drink water every day.",
822
+ },
823
+ {
824
+ "word": "chair",
825
+ "ipa": "/tʃɛr/",
826
+ "meaning": "cái ghế",
827
+ "sentence": "Please sit on the chair.",
828
+ },
829
+ {
830
+ "word": "school",
831
+ "ipa": "/skul/",
832
+ "meaning": "trường học",
833
+ "sentence": "Children go to school.",
834
+ },
835
+ {
836
+ "word": "mother",
837
+ "ipa": "/ˈmʌðər/",
838
+ "meaning": "mẹ",
839
+ "sentence": "My mother is kind.",
840
+ },
841
+ {
842
+ "word": "house",
843
+ "ipa": "/haʊs/",
844
+ "meaning": "ngôi nhà",
845
+ "sentence": "I live in a big house.",
846
+ },
847
+ {
848
+ "word": "yellow",
849
+ "ipa": "/ˈjɛloʊ/",
850
+ "meaning": "màu vàng",
851
+ "sentence": "The sun is yellow.",
852
+ },
853
+ {
854
+ "word": "measure",
855
+ "ipa": "/ˈmɛʒər/",
856
+ "meaning": "đo lường",
857
+ "sentence": "Please measure the length.",
858
+ },
859
+ {
860
+ "word": "pleasure",
861
+ "ipa": "/ˈplɛʒər/",
862
+ "meaning": "niềm vui",
863
+ "sentence": "It's a pleasure to meet you.",
864
+ },
865
+ {
866
+ "word": "about",
867
+ "ipa": "/əˈbaʊt/",
868
+ "meaning": "về",
869
+ "sentence": "Tell me about your day.",
870
+ },
871
+ {
872
+ "word": "family",
873
+ "ipa": "/ˈfæməli/",
874
+ "meaning": "gia đình",
875
+ "sentence": "I love my family.",
876
+ },
877
+ ],
878
+ "hard": [
879
+ {
880
+ "word": "think",
881
+ "ipa": "/θɪŋk/",
882
+ "meaning": "suy nghĩ",
883
+ "sentence": "I think you are right.",
884
+ },
885
+ {
886
+ "word": "this",
887
+ "ipa": "/ðɪs/",
888
+ "meaning": "cái này",
889
+ "sentence": "This is my book.",
890
+ },
891
+ {
892
+ "word": "very",
893
+ "ipa": "/ˈvɛri/",
894
+ "meaning": "rất",
895
+ "sentence": "You are very smart.",
896
+ },
897
+ {
898
+ "word": "through",
899
+ "ipa": "/θru/",
900
+ "meaning": "qua",
901
+ "sentence": "Walk through the door.",
902
+ },
903
+ {
904
+ "word": "weather",
905
+ "ipa": "/ˈwɛðər/",
906
+ "meaning": "thời tiết",
907
+ "sentence": "The weather is nice.",
908
+ },
909
+ {
910
+ "word": "voice",
911
+ "ipa": "/vɔɪs/",
912
+ "meaning": "giọng nói",
913
+ "sentence": "She has a beautiful voice.",
914
+ },
915
+ {
916
+ "word": "clothes",
917
+ "ipa": "/kloʊðz/",
918
+ "meaning": "quần áo",
919
+ "sentence": "I need new clothes.",
920
+ },
921
+ {
922
+ "word": "breathe",
923
+ "ipa": "/brið/",
924
+ "meaning": "thở",
925
+ "sentence": "Breathe slowly and deeply.",
926
+ },
927
+ {
928
+ "word": "although",
929
+ "ipa": "/ɔlˈðoʊ/",
930
+ "meaning": "mặc dù",
931
+ "sentence": "Although it's cold, I'm happy.",
932
+ },
933
+ {
934
+ "word": "rhythm",
935
+ "ipa": "/ˈrɪðəm/",
936
+ "meaning": "nhịp điệu",
937
+ "sentence": "Music has a good rhythm.",
938
+ },
939
+ ],
940
+ }
941
+
942
+
943
+ @router.get("/symbols", response_model=List[IPASymbol])
944
+ async def get_ipa_symbols(
945
+ category: Optional[str] = Query(
946
+ None, description="Filter by category: vowel, consonant, diphthong"
947
+ )
948
+ ):
949
+ """Get all IPA symbols with Vietnamese tips and character mappings"""
950
+ try:
951
+ symbols = []
952
+ for symbol, data in IPA_SYMBOLS_DATA.items():
953
+ if category and data["category"] != category:
954
+ continue
955
+
956
+ # Get character mapping for the example word
957
+ character_mapping = map_ipa_to_characters(data["word"], symbol)
958
+
959
+ symbols.append(
960
+ IPASymbol(
961
+ symbol=symbol,
962
+ description=data["desc"],
963
+ example_word=data["word"],
964
+ category=data["category"],
965
+ difficulty_level=data["difficulty"],
966
+ vietnamese_tip=data["tip"],
967
+ character_mapping=character_mapping,
968
+ )
969
+ )
970
+
971
+ # Sort by difficulty and then by symbol
972
+ difficulty_order = {"easy": 0, "medium": 1, "hard": 2}
973
+ symbols.sort(key=lambda x: (difficulty_order[x.difficulty_level], x.symbol))
974
+
975
+ return symbols
976
+ except Exception as e:
977
+ logger.error(f"Error getting IPA symbols: {e}")
978
+ raise HTTPException(status_code=500, detail=str(e))
979
+
980
+
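
Client-side, with the /api prefix from create_app.py, this endpoint resolves to /api/ipa/symbols. A sketch using httpx (host and port are assumptions):

    import httpx

    resp = httpx.get(
        "http://localhost:8000/api/ipa/symbols",
        params={"category": "vowel"},  # vowel | consonant | diphthong
    )
    resp.raise_for_status()
    for sym in resp.json():
        print(sym["symbol"], sym["example_word"], sym["vietnamese_tip"])
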
981
+ @router.get("/lessons", response_model=List[IPALesson])
982
+ async def get_ipa_lessons():
983
+ """Get structured IPA lessons for progressive learning"""
984
+ try:
985
+ lessons = [
986
+ {
987
+ "id": "vowels_basic",
988
+ "title": "Nguyên âm cơ bản (Basic Vowels)",
989
+ "description": "Học các nguyên âm đơn giản nhất trong tiếng Anh",
990
+ "symbols": [
991
+ s
992
+ for s in IPA_SYMBOLS_DATA.keys()
993
+ if IPA_SYMBOLS_DATA[s]["category"] == "vowel"
994
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "easy"
995
+ ],
996
+ "difficulty": "easy",
997
+ "estimated_time": 15,
998
+ },
999
+ {
1000
+ "id": "consonants_basic",
1001
+ "title": "Phụ âm cơ bản (Basic Consonants)",
1002
+ "description": "Các phụ âm dễ phát âm cho người Việt",
1003
+ "symbols": [
1004
+ s
1005
+ for s in IPA_SYMBOLS_DATA.keys()
1006
+ if IPA_SYMBOLS_DATA[s]["category"] == "consonant"
1007
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "easy"
1008
+ ],
1009
+ "difficulty": "easy",
1010
+ "estimated_time": 20,
1011
+ },
1012
+ {
1013
+ "id": "vowels_intermediate",
1014
+ "title": "Nguyên âm nâng cao (Intermediate Vowels)",
1015
+ "description": "Các nguyên âm khó hơn, cần luyện tập kỹ",
1016
+ "symbols": [
1017
+ s
1018
+ for s in IPA_SYMBOLS_DATA.keys()
1019
+ if IPA_SYMBOLS_DATA[s]["category"] == "vowel"
1020
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "medium"
1021
+ ],
1022
+ "difficulty": "medium",
1023
+ "estimated_time": 25,
1024
+ },
1025
+ {
1026
+ "id": "diphthongs",
1027
+ "title": "Nguyên âm đôi (Diphthongs)",
1028
+ "description": "Học cách phát âm nguyên âm đôi tự nhiên",
1029
+ "symbols": [
1030
+ s
1031
+ for s in IPA_SYMBOLS_DATA.keys()
1032
+ if IPA_SYMBOLS_DATA[s]["category"] == "diphthong"
1033
+ ],
1034
+ "difficulty": "medium",
1035
+ "estimated_time": 20,
1036
+ },
1037
+ {
1038
+ "id": "consonants_intermediate",
1039
+ "title": "Phụ âm trung cấp (Intermediate Consonants)",
1040
+ "description": "Các phụ âm cần luyện tập cho người Việt",
1041
+ "symbols": [
1042
+ s
1043
+ for s in IPA_SYMBOLS_DATA.keys()
1044
+ if IPA_SYMBOLS_DATA[s]["category"] == "consonant"
1045
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "medium"
1046
+ ],
1047
+ "difficulty": "medium",
1048
+ "estimated_time": 30,
1049
+ },
1050
+ {
1051
+ "id": "difficult_sounds",
1052
+ "title": "Âm khó (Difficult Sounds)",
1053
+ "description": "Những âm khó nhất cho người Việt: th, v, z",
1054
+ "symbols": [
1055
+ s
1056
+ for s in IPA_SYMBOLS_DATA.keys()
1057
+ if IPA_SYMBOLS_DATA[s]["difficulty"] == "hard"
1058
+ ],
1059
+ "difficulty": "hard",
1060
+ "estimated_time": 40,
1061
+ },
1062
+ ]
1063
+
1064
+ # Convert to proper lesson objects
1065
+ lesson_objects = []
1066
+ for lesson in lessons:
1067
+ symbol_objects = []
1068
+ for symbol_key in lesson["symbols"]:
1069
+ data = IPA_SYMBOLS_DATA[symbol_key]
1070
+ # Get character mapping for the example word
1071
+ character_mapping = map_ipa_to_characters(data["word"], symbol_key)
1072
+
1073
+ symbol_objects.append(
1074
+ IPASymbol(
1075
+ symbol=symbol_key,
1076
+ description=data["desc"],
1077
+ example_word=data["word"],
1078
+ category=data["category"],
1079
+ difficulty_level=data["difficulty"],
1080
+ vietnamese_tip=data["tip"],
1081
+ character_mapping=character_mapping,
1082
+ )
1083
+ )
1084
+
1085
+ lesson_objects.append(
1086
+ IPALesson(
1087
+ id=lesson["id"],
1088
+ title=lesson["title"],
1089
+ description=lesson["description"],
1090
+ symbols=symbol_objects,
1091
+ difficulty=lesson["difficulty"],
1092
+ estimated_time=lesson["estimated_time"],
1093
+ )
1094
+ )
1095
+
1096
+ return lesson_objects
1097
+ except Exception as e:
1098
+ logger.error(f"Error getting IPA lessons: {e}")
1099
+ raise HTTPException(status_code=500, detail=str(e))
1100
+
1101
+
1102
+ @router.get("/words", response_model=List[IPAWord])
1103
+ async def get_practice_words(
1104
+ difficulty: str = Query("easy", description="Difficulty level: easy, medium, hard")
1105
+ ):
1106
+ """Get practice words with IPA transcription and character mappings"""
1107
+ try:
1108
+ if difficulty not in ["easy", "medium", "hard"]:
1109
+ difficulty = "easy"
1110
+
1111
+ words_data = SAMPLE_WORDS.get(difficulty, SAMPLE_WORDS["easy"])
1112
+
1113
+ words = []
1114
+ for word_data in words_data:
1115
+ # Get phonemes using G2P
1116
+ try:
1117
+ phoneme_data = g2p.text_to_phonemes(word_data["word"])[0]
1118
+ phonemes = phoneme_data["phonemes"]
1119
+ except Exception:
1120
+ # Fallback to simple conversion
1121
+ phonemes = list(word_data["word"].lower())
1122
+
1123
+ # Calculate difficulty
1124
+ difficulty_score = 0.0
1125
+ for phoneme in phonemes:
1126
+ difficulty_score += g2p.get_difficulty_score(phoneme)
1127
+ avg_difficulty = difficulty_score / len(phonemes) if phonemes else 0.3
1128
+
1129
+ word_difficulty = (
1130
+ "hard"
1131
+ if avg_difficulty > 0.6
1132
+ else "medium" if avg_difficulty > 0.4 else "easy"
1133
+ )
1134
+
1135
+ # Get character mapping for the word
1136
+ character_mapping = map_word_to_phonemes(
1137
+ word_data["word"], word_data["ipa"]
1138
+ )
1139
+
1140
+ words.append(
1141
+ IPAWord(
1142
+ word=word_data["word"],
1143
+ ipa=word_data["ipa"],
1144
+ phonemes=phonemes,
1145
+ difficulty=word_difficulty,
1146
+ meaning=word_data["meaning"],
1147
+ example_sentence=word_data["sentence"],
1148
+ character_mapping=character_mapping,
1149
+ )
1150
+ )
1151
+
1152
+ return words
1153
+ except Exception as e:
1154
+ logger.error(f"Error getting practice words: {e}")
1155
+ raise HTTPException(status_code=500, detail=str(e))
1156
+
1157
+
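
The bucketing above averages per-phoneme difficulty and cuts at 0.6 and 0.4. A worked toy example (the scores are made up, standing in for g2p.get_difficulty_score):

    scores = [0.7, 0.2, 0.5]          # e.g. one hard, one easy, one middling phoneme
    avg = sum(scores) / len(scores)   # ~0.467
    bucket = "hard" if avg > 0.6 else "medium" if avg > 0.4 else "easy"
    assert bucket == "medium"
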
1158
+ @router.get("/exercises", response_model=List[IPAExercise])
1159
+ async def get_ipa_exercises(
1160
+ count: int = Query(5, ge=1, le=20), difficulty: str = Query("mixed")
1161
+ ):
1162
+ """Generate random IPA pronunciation exercises"""
1163
+ try:
1164
+ exercises = []
1165
+
1166
+ # Select words based on difficulty
1167
+ if difficulty == "mixed":
1168
+ all_words = []
1169
+ for level in SAMPLE_WORDS.values():
1170
+ all_words.extend(level)
1171
+ selected_words = random.sample(all_words, min(count, len(all_words)))
1172
+ else:
1173
+ if difficulty not in SAMPLE_WORDS:
1174
+ difficulty = "easy"
1175
+ word_pool = SAMPLE_WORDS[difficulty]
1176
+ selected_words = random.sample(word_pool, min(count, len(word_pool)))
1177
+
1178
+ for word_data in selected_words:
1179
+ # Get phonemes
1180
+ try:
1181
+ phoneme_data = g2p.text_to_phonemes(word_data["word"])[0]
1182
+ phonemes = phoneme_data["phonemes"]
1183
+ except Exception:
1184
+ phonemes = list(word_data["word"].lower())
1185
+
1186
+ # Generate hints
1187
+ hints = [
1188
+ f"Nghĩa: {word_data['meaning']}",
1189
+ f"Ví dụ: {word_data['sentence']}",
1190
+ f"Số âm tiết: {len(phonemes)}",
1191
+ ]
1192
+
1193
+ # Add specific pronunciation hints for difficult sounds
1194
+ difficult_sounds = []
1195
+ for phoneme in phonemes:
1196
+ if phoneme in ["θ", "ð", "v", "z", "ʒ", "r", "w"]:
1197
+ difficult_sounds.append(phoneme)
1198
+
1199
+ if difficult_sounds:
1200
+ for sound in difficult_sounds:
1201
+ if sound in IPA_SYMBOLS_DATA:
1202
+ hints.append(f"Âm /{sound}/: {IPA_SYMBOLS_DATA[sound]['tip']}")
1203
+
1204
+ exercises.append(
1205
+ IPAExercise(
1206
+ word=word_data["word"],
1207
+ ipa=word_data["ipa"],
1208
+ phonemes=phonemes,
1209
+ hints=hints,
1210
+ difficulty=difficulty if difficulty != "mixed" else "easy",
1211
+ )
1212
+ )
1213
+
1214
+ return exercises
1215
+ except Exception as e:
1216
+ logger.error(f"Error generating IPA exercises: {e}")
1217
+ raise HTTPException(status_code=500, detail=str(e))
1218
+
1219
+
1220
+ @router.get("/symbol/{symbol}")
1221
+ async def get_symbol_details(symbol: str):
1222
+ """Get detailed information about a specific IPA symbol"""
1223
+ try:
1224
+ if symbol not in IPA_SYMBOLS_DATA:
1225
+ raise HTTPException(
1226
+ status_code=404, detail=f"IPA symbol '{symbol}' not found"
1227
+ )
1228
+
1229
+ data = IPA_SYMBOLS_DATA[symbol]
1230
+
1231
+ # Find words containing this symbol
1232
+ example_words = []
1233
+ for difficulty_level, words in SAMPLE_WORDS.items():
1234
+ for word_data in words:
1235
+ if symbol in word_data["ipa"]:
1236
+ example_words.append(
1237
+ {
1238
+ "word": word_data["word"],
1239
+ "ipa": word_data["ipa"],
1240
+ "meaning": word_data["meaning"],
1241
+ "difficulty": difficulty_level,
1242
+ }
1243
+ )
1244
+ if len(example_words) >= 5: # Limit to 5 examples
1245
+ break
1246
+ if len(example_words) >= 5:
1247
+ break
1248
+
1249
+ return {
1250
+ "symbol": symbol,
1251
+ "description": data["desc"],
1252
+ "example_word": data["word"],
1253
+ "category": data["category"],
1254
+ "difficulty_level": data["difficulty"],
1255
+ "vietnamese_tip": data["tip"],
1256
+ "difficulty_score": g2p.get_difficulty_score(symbol),
1257
+ "example_words": example_words,
1258
+ "practice_tips": _get_practice_tips(symbol),
1259
+ }
1260
+ except HTTPException:
1261
+ raise
1262
+ except Exception as e:
1263
+ logger.error(f"Error getting symbol details: {e}")
1264
+ raise HTTPException(status_code=500, detail=str(e))
1265
+
1266
+
1267
+ def _get_practice_tips(symbol: str) -> List[str]:
1268
+ """Get specific practice tips for a symbol"""
1269
+ tips_map = {
1270
+ "θ": [
1271
+ "Đặt đầu lưỡi giữa răng trên và răng dưới",
1272
+ "Thổi khí nhẹ qua kẽ răng",
1273
+ "Không rung dây thanh âm",
1274
+ "Luyện với từ: think, three, thank",
1275
+ ],
1276
+ "ð": [
1277
+ "Vị trí lưỡi giống như âm θ",
1278
+ "Nhưng phải rung dây thanh âm",
1279
+ "Cảm nhận rung động ở cổ họng",
1280
+ "Luyện với từ: this, that, brother",
1281
+ ],
1282
+ "v": [
1283
+ "Môi dưới chạm vào răng trên",
1284
+ "Không dùng cả hai môi như tiếng Việt",
1285
+ "Rung dây thanh âm",
1286
+ "Luyện với từ: very, voice, love",
1287
+ ],
1288
+ "r": [
1289
+ "Cuộn lưỡi nhẹ nhàng",
1290
+ "Không để lưỡi chạm vào vòm miệng",
1291
+ "Không lăn lưỡi như tiếng Việt",
1292
+ "Luyện với từ: red, run, car",
1293
+ ],
1294
+ "w": [
1295
+ "Tròn môi như phát âm 'u'",
1296
+ "Không dùng răng như âm 'v'",
1297
+ "Môi tròn rồi mở ra nhanh",
1298
+ "Luyện với từ: we, water, window",
1299
+ ],
1300
+ }
1301
+
1302
+ return tips_map.get(
1303
+ symbol,
1304
+ [
1305
+ f"Luyện phát âm âm /{symbol}/ thường xuyên",
1306
+ "Nghe và bắt chước người bản ngữ",
1307
+ "Tập trung vào vị trí lưỡi và môi",
1308
+ "Luyện tập với từ đơn giản trước",
1309
+ ],
1310
+ )
1311
+
1312
+
1313
+ @router.get("/word-analysis/{word}")
1314
+ async def get_word_analysis(word: str):
1315
+ """Get comprehensive analysis of a word for IPA learning"""
1316
+ try:
1317
+ # Get phoneme data
1318
+ phoneme_data = g2p.text_to_phonemes(word)[0]
1319
+
1320
+ # Calculate difficulty
1321
+ difficulty_scores = [
1322
+ g2p.get_difficulty_score(p) for p in phoneme_data["phonemes"]
1323
+ ]
1324
+ avg_difficulty = (
1325
+ sum(difficulty_scores) / len(difficulty_scores)
1326
+ if difficulty_scores
1327
+ else 0.3
1328
+ )
1329
+
1330
+ word_difficulty = (
1331
+ "hard"
1332
+ if avg_difficulty > 0.6
1333
+ else "medium" if avg_difficulty > 0.4 else "easy"
1334
+ )
1335
+
1336
+ # Get detailed phoneme analysis
1337
+ phoneme_analysis = []
1338
+ for i, phoneme in enumerate(phoneme_data["phonemes"]):
1339
+ difficulty_score = g2p.get_difficulty_score(phoneme)
1340
+
1341
+ analysis = {
1342
+ "phoneme": phoneme,
1343
+ "position": i,
1344
+ "difficulty_score": difficulty_score,
1345
+ "difficulty_level": (
1346
+ "hard"
1347
+ if difficulty_score > 0.6
1348
+ else "medium" if difficulty_score > 0.4 else "easy"
1349
+ ),
1350
+ "category": IPA_SYMBOLS_DATA.get(phoneme, {}).get(
1351
+ "category", "unknown"
1352
+ ),
1353
+ "vietnamese_tip": IPA_SYMBOLS_DATA.get(phoneme, {}).get(
1354
+ "tip", f"Luyện âm {phoneme}"
1355
+ ),
1356
+ "practice_tips": _get_practice_tips(phoneme),
1357
+ }
1358
+ phoneme_analysis.append(analysis)
1359
+
1360
+ # Find similar words for practice
1361
+ similar_words = []
1362
+ for difficulty_level, words in SAMPLE_WORDS.items():
1363
+ for word_data in words:
1364
+ if word_data["word"] != word:
1365
+ # Check if shares difficult phonemes
1366
+ word_phonemes = g2p.text_to_phonemes(word_data["word"])[0][
1367
+ "phonemes"
1368
+ ]
1369
+ shared_difficult = [
1370
+ p
1371
+ for p in phoneme_data["phonemes"]
1372
+ if p in word_phonemes and g2p.get_difficulty_score(p) > 0.5
1373
+ ]
1374
+ if shared_difficult:
1375
+ similar_words.append(
1376
+ {
1377
+ "word": word_data["word"],
1378
+ "ipa": word_data["ipa"],
1379
+ "meaning": word_data["meaning"],
1380
+ "shared_sounds": shared_difficult,
1381
+ "difficulty": difficulty_level,
1382
+ }
1383
+ )
1384
+ if len(similar_words) >= 5:
1385
+ break
1386
+ if len(similar_words) >= 5:
1387
+ break
1388
+
1389
+ return {
1390
+ "word": word,
1391
+ "ipa": phoneme_data["ipa"],
1392
+ "phonemes": phoneme_data["phonemes"],
1393
+ "phoneme_string": phoneme_data["phoneme_string"],
1394
+ "difficulty": word_difficulty,
1395
+ "difficulty_score": avg_difficulty,
1396
+ "phoneme_analysis": phoneme_analysis,
1397
+ "similar_words": similar_words,
1398
+ "practice_sequence": _generate_practice_sequence(phoneme_analysis),
1399
+ "common_mistakes": _get_common_mistakes(phoneme_data["phonemes"]),
1400
+ }
1401
+
1402
+ except Exception as e:
1403
+ logger.error(f"Error analyzing word '{word}': {e}")
1404
+ raise HTTPException(status_code=500, detail=str(e))
1405
+
1406
+
1407
+ def _generate_practice_sequence(phoneme_analysis: List[Dict]) -> List[Dict]:
1408
+ """Generate a practice sequence starting with easier sounds"""
1409
+ # Sort by difficulty
1410
+ sorted_phonemes = sorted(phoneme_analysis, key=lambda x: x["difficulty_score"])
1411
+
1412
+ sequence = []
1413
+ for phoneme_data in sorted_phonemes:
1414
+ step = {
1415
+ "step": len(sequence) + 1,
1416
+ "phoneme": phoneme_data["phoneme"],
1417
+ "focus": "Tập trung vào âm này",
1418
+ "tip": phoneme_data["vietnamese_tip"],
1419
+ "practice_words": _get_practice_words_for_phoneme(phoneme_data["phoneme"]),
1420
+ }
1421
+ sequence.append(step)
1422
+
1423
+ return sequence
1424
+
1425
+
1426
+ def _get_practice_words_for_phoneme(phoneme: str) -> List[str]:
1427
+ """Get simple words containing the phoneme"""
1428
+ practice_words = {
1429
+ "θ": ["think", "three", "month", "tooth"],
1430
+ "ð": ["this", "that", "mother", "brother"],
1431
+ "v": ["very", "voice", "love", "give"],
1432
+ "r": ["red", "run", "car", "tree"],
1433
+ "w": ["we", "water", "window", "want"],
1434
+ "z": ["zoo", "zero", "buzz", "pizza"],
1435
+ "ʒ": ["measure", "pleasure", "treasure", "vision"],
1436
+ "æ": ["cat", "hat", "man", "bad"],
1437
+ "ɪ": ["sit", "big", "win", "ship"],
1438
+ "ʊ": ["put", "look", "book", "good"],
1439
+ }
1440
+
1441
+ return practice_words.get(phoneme, [])
1442
+
1443
+
1444
+ def _get_common_mistakes(phonemes: List[str]) -> List[Dict]:
1445
+ """Get common pronunciation mistakes for Vietnamese speakers"""
1446
+ mistakes = []
1447
+
1448
+ common_mistakes_map = {
1449
+ "θ": {
1450
+ "mistake": "Phát âm thành 'f' hoặc 's'",
1451
+ "correction": "Đặt lưỡi giữa răng, thổi nhẹ",
1452
+ "examples": ["think → fink/sink (sai), think (đúng)"],
1453
+ },
1454
+ "ð": {
1455
+ "mistake": "Phát âm thành 'd' hoặc 'z'",
1456
+ "correction": "Lưỡi giữa răng + rung dây thanh",
1457
+ "examples": ["this → dis/zis (sai), this (đúng)"],
1458
+ },
1459
+ "v": {
1460
+ "mistake": "Phát âm thành 'w' hoặc 'b'",
1461
+ "correction": "Môi dưới chạm răng trên",
1462
+ "examples": ["very → wery/bery (sai), very (đúng)"],
1463
+ },
1464
+ "r": {
1465
+ "mistake": "Lăn lưỡi như tiếng Việt",
1466
+ "correction": "Cuộn lưỡi nhẹ, không chạm vòm",
1467
+ "examples": ["red → rrred (sai), red (đúng)"],
1468
+ },
1469
+ "w": {
1470
+ "mistake": "Phát âm thành 'v'",
1471
+ "correction": "Tròn môi, không dùng răng",
1472
+ "examples": ["we → ve (sai), we (đúng)"],
1473
+ },
1474
+ }
1475
+
1476
+ for phoneme in phonemes:
1477
+ if phoneme in common_mistakes_map:
1478
+ mistake_info = common_mistakes_map[phoneme]
1479
+ mistakes.append(
1480
+ {
1481
+ "phoneme": phoneme,
1482
+ "common_mistake": mistake_info["mistake"],
1483
+ "correction": mistake_info["correction"],
1484
+ "examples": mistake_info["examples"],
1485
+ }
1486
+ )
1487
+
1488
+ return mistakes
1489
+
1490
+
1491
+ @router.post("/assess-pronunciation")
1492
+ async def assess_ipa_pronunciation(
1493
+ audio_file: UploadFile = File(
1494
+ ..., description="Audio file for IPA pronunciation assessment"
1495
+ ),
1496
+ word: str = Form(..., description="Target word to assess"),
1497
+ target_ipa: str = Form(None, description="Target IPA transcription (optional)"),
1498
+ focus_phonemes: str = Form(
1499
+ None, description="Comma-separated list of phonemes to focus on (optional)"
1500
+ ),
1501
+ ):
1502
+ """
1503
+ Specialized IPA pronunciation assessment with detailed phoneme analysis
1504
+ Optimized for IPA learning with Vietnamese speaker feedback
1505
+ """
1506
+
1509
+
1510
+ try:
1511
+ # Get the global assessor instance (singleton)
1512
+ assessor = get_assessor()
1513
+
1514
+ # Save uploaded audio file
1515
+ file_extension = ".wav"
1516
+ if audio_file.filename and "." in audio_file.filename:
1517
+ file_extension = f".{audio_file.filename.split('.')[-1]}"
1518
+
1519
+ with tempfile.NamedTemporaryFile(
1520
+ delete=False, suffix=file_extension
1521
+ ) as tmp_file:
1522
+ content = await audio_file.read()
1523
+ tmp_file.write(content)
1524
+ tmp_file.flush()
1525
+
1526
+ # Run standard pronunciation assessment
1527
+ result = assessor.assess_pronunciation(tmp_file.name, word, "word")
1528
+
1529
+ # Get target IPA and phonemes
1530
+ if not target_ipa:
1531
+ target_phonemes_data = g2p.text_to_phonemes(word)[0]
1532
+ target_ipa = target_phonemes_data["ipa"]
1533
+ target_phonemes = target_phonemes_data["phonemes"]
1534
+ else:
1535
+ # Parse IPA to phonemes (simplified)
1536
+ target_phonemes = target_ipa.replace("/", "").split()
1537
+
1538
+ # Focus phonemes analysis
1539
+ focus_phonemes_list = []
1540
+ if focus_phonemes:
1541
+ focus_phonemes_list = [p.strip() for p in focus_phonemes.split(",")]
1542
+
1543
+ # Enhanced IPA-specific analysis
1544
+ ipa_analysis = {
1545
+ "target_word": word,
1546
+ "target_ipa": target_ipa,
1547
+ "target_phonemes": target_phonemes,
1548
+ "user_transcript": result.get("transcript", ""),
1549
+ "user_ipa": result.get("user_ipa", ""),
1550
+ "user_phonemes": result.get("user_phonemes", ""),
1551
+ "overall_score": result.get("overall_score", 0.0),
1552
+ "phoneme_accuracy": result.get("phoneme_comparison", {}).get(
1553
+ "accuracy_percentage", 0
1554
+ ),
1555
+ "focus_phonemes_analysis": [],
1556
+ "vietnamese_specific_tips": [],
1557
+ "practice_recommendations": [],
1558
+ }
1559
+
1560
+ # Focus phonemes detailed analysis
1561
+ if focus_phonemes_list and result.get("phoneme_differences"):
1562
+ for phoneme_diff in result["phoneme_differences"]:
1563
+ ref_phoneme = phoneme_diff.get("reference_phoneme", "")
1564
+ if ref_phoneme in focus_phonemes_list:
1565
+ analysis = {
1566
+ "phoneme": ref_phoneme,
1567
+ "status": phoneme_diff.get("status", "unknown"),
1568
+ "score": phoneme_diff.get("score", 0.0),
1569
+ "difficulty": g2p.get_difficulty_score(ref_phoneme),
1570
+ "vietnamese_tip": IPA_SYMBOLS_DATA.get(ref_phoneme, {}).get(
1571
+ "tip", ""
1572
+ ),
1573
+ "practice_tip": _get_practice_tips(ref_phoneme),
1574
+ }
1575
+ ipa_analysis["focus_phonemes_analysis"].append(analysis)
1576
+
1577
+ # Vietnamese-specific pronunciation tips
1578
+ all_target_phonemes = target_phonemes + focus_phonemes_list
1579
+ vietnamese_tips = []
1580
+
1581
+ for phoneme in set(all_target_phonemes):
1582
+ if phoneme in [
1583
+ "θ",
1584
+ "ð",
1585
+ "v",
1586
+ "z",
1587
+ "ʒ",
1588
+ "r",
1589
+ "w",
1590
+ "æ",
1591
+ "ɪ",
1592
+ "ʊ",
1593
+ ]: # Difficult for Vietnamese
1594
+ tip_data = IPA_SYMBOLS_DATA.get(phoneme, {})
1595
+ if tip_data:
1596
+ vietnamese_tips.append(
1597
+ {
1598
+ "phoneme": phoneme,
1599
+ "tip": tip_data.get("tip", ""),
1600
+ "difficulty": tip_data.get("difficulty", "medium"),
1601
+ "category": tip_data.get("category", "unknown"),
1602
+ }
1603
+ )
1604
+
1605
+ ipa_analysis["vietnamese_specific_tips"] = vietnamese_tips
1606
+
1607
+ # Practice recommendations based on score
1608
+ if result.get("overall_score", 0) < 0.7:
1609
+ recommendations = [
1610
+ "Nghe từ mẫu nhiều lần trước khi phát âm",
1611
+ "Phát âm chậm và rõ ràng từng âm vị",
1612
+ "Chú ý đến vị trí lưỡi và môi khi phát âm",
1613
+ ]
1614
+
1615
+ # Add specific recommendations for low-scoring phonemes
1616
+ if result.get("wrong_words"):
1617
+ for wrong_word in result["wrong_words"][
1618
+ :2
1619
+ ]: # Top 2 problematic words
1620
+ for wrong_phoneme in wrong_word.get("wrong_phonemes", [])[:2]:
1621
+ phoneme = wrong_phoneme.get("expected", "")
1622
+ if phoneme in IPA_SYMBOLS_DATA:
1623
+ recommendations.append(
1624
+ f"Luyện đặc biệt âm /{phoneme}/: {IPA_SYMBOLS_DATA[phoneme]['tip']}"
1625
+ )
1626
+
1627
+ ipa_analysis["practice_recommendations"] = recommendations
1628
+
1629
+ # Combine with original result
1630
+ enhanced_result = {
1631
+ **result, # Original assessment result
1632
+ "ipa_analysis": ipa_analysis, # IPA-specific analysis
1633
+ "assessment_type": "ipa_focused",
1634
+ "target_ipa": target_ipa,
1635
+ "focus_phonemes": focus_phonemes_list,
1636
+ }
1637
+
1638
+ # Clean up temp file
1639
+ os.unlink(tmp_file.name)
1640
+
1641
+ logger.info(
1642
+ f"IPA assessment completed for word '{word}' with score {result.get('overall_score', 0):.2f}"
1643
+ )
1644
+
1645
+ return enhanced_result
1646
+
1647
+ except Exception as e:
1648
+ logger.error(f"IPA pronunciation assessment error: {e}")
1649
+ raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}")
1650
+
1651
+
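
A matching client call for the endpoint above (a sketch; host, port, and file name are assumptions):

    import httpx

    with open("think.wav", "rb") as f:
        resp = httpx.post(
            "http://localhost:8000/api/ipa/assess-pronunciation",
            files={"audio_file": ("think.wav", f, "audio/wav")},
            data={"word": "think", "focus_phonemes": "θ"},
            timeout=60.0,
        )
    resp.raise_for_status()
    print(resp.json()["ipa_analysis"]["overall_score"])
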
1652
+ @router.get("/practice-session/{lesson_id}")
1653
+ async def create_ipa_practice_session(lesson_id: str):
1654
+ """Create a structured IPA practice session"""
1655
+ try:
1656
+ # This would typically fetch from a database
1657
+ # For now, we'll create a sample session based on lesson_id
1658
+
1659
+ if lesson_id == "vowels_basic":
1660
+ session_words = [
1661
+ {
1662
+ "word": "cat",
1663
+ "ipa": "/kæt/",
1664
+ "focus_phonemes": ["æ"],
1665
+ "mapping": map_word_to_phonemes("cat", "/kæt/"),
1666
+ },
1667
+ {
1668
+ "word": "bed",
1669
+ "ipa": "/bɛd/",
1670
+ "focus_phonemes": ["ɛ"],
1671
+ "mapping": map_word_to_phonemes("bed", "/bɛd/"),
1672
+ },
1673
+ {
1674
+ "word": "see",
1675
+ "ipa": "/si/",
1676
+ "focus_phonemes": ["i"],
1677
+ "mapping": map_word_to_phonemes("see", "/si/"),
1678
+ },
1679
+ {
1680
+ "word": "cup",
1681
+ "ipa": "/kʌp/",
1682
+ "focus_phonemes": ["ʌ"],
1683
+ "mapping": map_word_to_phonemes("cup", "/kʌp/"),
1684
+ },
1685
+ {
1686
+ "word": "book",
1687
+ "ipa": "/bʊk/",
1688
+ "focus_phonemes": ["ʊ"],
1689
+ "mapping": map_word_to_phonemes("book", "/bʊk/"),
1690
+ },
1691
+ ]
1692
+ elif lesson_id == "difficult_sounds":
1693
+ session_words = [
1694
+ {
1695
+ "word": "think",
1696
+ "ipa": "/θɪŋk/",
1697
+ "focus_phonemes": ["θ"],
1698
+ "mapping": map_word_to_phonemes("think", "/θɪŋk/"),
1699
+ },
1700
+ {
1701
+ "word": "this",
1702
+ "ipa": "/ðɪs/",
1703
+ "focus_phonemes": ["ð"],
1704
+ "mapping": map_word_to_phonemes("this", "/ðɪs/"),
1705
+ },
1706
+ {
1707
+ "word": "very",
1708
+ "ipa": "/ˈvɛri/",
1709
+ "focus_phonemes": ["v"],
1710
+ "mapping": map_word_to_phonemes("very", "/ˈvɛri/"),
1711
+ },
1712
+ {
1713
+ "word": "water",
1714
+ "ipa": "/ˈwɔtər/",
1715
+ "focus_phonemes": ["w"],
1716
+ "mapping": map_word_to_phonemes("water", "/ˈwɔtər/"),
1717
+ },
1718
+ {
1719
+ "word": "red",
1720
+ "ipa": "/rɛd/",
1721
+ "focus_phonemes": ["r"],
1722
+ "mapping": map_word_to_phonemes("red", "/rɛd/"),
1723
+ },
1724
+ ]
1725
+ else:
1726
+ # Default session
1727
+ session_words = [
1728
+ {
1729
+ "word": "hello",
1730
+ "ipa": "/həˈloʊ/",
1731
+ "focus_phonemes": ["ə", "oʊ"],
1732
+ "mapping": map_word_to_phonemes("hello", "/həˈloʊ/"),
1733
+ },
1734
+ {
1735
+ "word": "world",
1736
+ "ipa": "/wɜrld/",
1737
+ "focus_phonemes": ["w", "ɜr"],
1738
+ "mapping": map_word_to_phonemes("world", "/wɜrld/"),
1739
+ },
1740
+ {
1741
+ "word": "practice",
1742
+ "ipa": "/ˈpræktɪs/",
1743
+ "focus_phonemes": ["æ", "ɪ"],
1744
+ "mapping": map_word_to_phonemes("practice", "/ˈpræktɪs/"),
1745
+ },
1746
+ ]
1747
+
1748
+ return {
1749
+ "session_id": lesson_id,
1750
+ "title": f"IPA Practice Session: {lesson_id.replace('_', ' ').title()}",
1751
+ "words": session_words,
1752
+ "estimated_time": len(session_words) * 3, # 3 minutes per word
1753
+ "instructions": [
1754
+ "Nghe mẫu từng từ carefully",
1755
+ "Tập trung vào âm vị được highlight",
1756
+ "Ghi âm nhiều lần cho đến khi đạt điểm tốt",
1757
+ "Đọc feedback để cải thiện",
1758
+ ],
1759
+ }
1760
+
1761
+ except Exception as e:
1762
+ logger.error(f"Error creating practice session: {e}")
1763
+ raise HTTPException(status_code=500, detail=str(e))
src/apis/routes/speaking_route.py CHANGED
@@ -9,7 +9,7 @@ from loguru import logger
9
  from src.utils.speaking_utils import convert_numpy_types
10
 
11
  # Import the new evaluation system
12
- from evalution import ProductionPronunciationAssessor, EnhancedG2P
12
+ from src.apis.controllers.speaking_controller import ProductionPronunciationAssessor, EnhancedG2P
13
  warnings.filterwarnings("ignore")
14
 
15
  router = APIRouter(prefix="/speaking", tags=["Speaking"])
@@ -36,7 +36,16 @@ class PronunciationAssessmentResult(BaseModel):
36
  assessment_mode: Optional[str] = None
37
  character_level_analysis: Optional[bool] = None
38
 
39
- assessor = ProductionPronunciationAssessor()
39
+ # Global assessor instance - singleton pattern for performance
40
+ global_assessor = None
41
+
42
+ def get_assessor():
43
+ """Get or create the global assessor instance"""
44
+ global global_assessor
45
+ if global_assessor is None:
46
+ logger.info("Creating global ProductionPronunciationAssessor instance...")
47
+ global_assessor = ProductionPronunciationAssessor()
48
+ return global_assessor
49
 
50
 
51
  @router.post("/assess", response_model=PronunciationAssessmentResult)
@@ -103,7 +112,8 @@ async def assess_pronunciation(
112
 
113
  logger.info(f"Processing audio file: {tmp_file.name} with mode: {mode}")
114
 
115
- # Run assessment using enhanced assessor
115
+ # Run assessment using enhanced assessor (singleton)
116
+ assessor = get_assessor()
117
  result = assessor.assess_pronunciation(tmp_file.name, reference_text, mode)
118
 
119
  # Get reference phonemes and IPA