ABAO77 committed
Commit 85fa45c · 1 Parent(s): 2877f54

Refactor code structure for improved readability and maintainability

src/AI_Models/wave2vec_inference.py CHANGED
@@ -8,51 +8,164 @@ from transformers import (
 import onnxruntime as rt
 import numpy as np
 import librosa
+import warnings
+import os
+warnings.filterwarnings("ignore")


 class Wave2Vec2Inference:
-    def __init__(self, model_name, hotwords=[], use_lm_if_possible=True, use_gpu=True):
-        self.device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"
+    def __init__(self, model_name, hotwords=[], use_lm_if_possible=True, use_gpu=True, enable_optimizations=True):
+        # Auto-detect best available device
+        if use_gpu:
+            if torch.backends.mps.is_available():
+                self.device = "mps"
+            elif torch.cuda.is_available():
+                self.device = "cuda"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = "cpu"
+
+        print(f"Using device: {self.device}")
+
+        # Set optimal torch settings for inference
+        torch.set_grad_enabled(False)  # Disable gradients globally for inference
+
+        if self.device == "cpu":
+            # CPU optimizations
+            torch.set_num_threads(torch.get_num_threads())  # Use all available CPU cores
+            torch.set_float32_matmul_precision('high')
+        elif self.device == "cuda":
+            # CUDA optimizations
+            torch.backends.cudnn.benchmark = True  # Enable cuDNN benchmark mode
+            torch.backends.cudnn.deterministic = False
+        elif self.device == "mps":
+            # MPS optimizations
+            torch.backends.mps.enable_fallback = True
+
         if use_lm_if_possible:
             self.processor = AutoProcessor.from_pretrained(model_name)
         else:
             self.processor = Wav2Vec2Processor.from_pretrained(model_name)
+
         self.model = AutoModelForCTC.from_pretrained(model_name)
         self.model.to(self.device)
+
+        # Set model to evaluation mode for inference optimization
+        self.model.eval()
+
+        # Try to optimize model for inference (safe version) - only if enabled
+        if enable_optimizations:
+            try:
+                # First try torch.compile (PyTorch 2.0+) - more robust
+                if hasattr(torch, 'compile') and self.device != "mps":  # MPS doesn't support torch.compile yet
+                    self.model = torch.compile(self.model, mode="reduce-overhead")
+                    print("Model compiled with torch.compile for faster inference")
+                else:
+                    # Alternative: try JIT scripting for older PyTorch versions
+                    try:
+                        scripted_model = torch.jit.script(self.model)
+                        if hasattr(torch.jit, 'optimize_for_inference'):
+                            scripted_model = torch.jit.optimize_for_inference(scripted_model)
+                        self.model = scripted_model
+                        print("Model optimized with JIT scripting")
+                    except Exception as jit_e:
+                        print(f"JIT optimization failed, using regular model: {jit_e}")
+            except Exception as e:
+                print(f"Model optimization failed, using regular model: {e}")
+        else:
+            print("Model optimizations disabled")
+
         self.hotwords = hotwords
         self.use_lm_if_possible = use_lm_if_possible
+
+        # Pre-allocate tensors for common audio lengths to avoid repeated allocation
+        self.tensor_cache = {}
+
+        # Warm up the model with a dummy input (only if optimizations enabled)
+        if enable_optimizations:
+            self._warmup_model()
+
+    def _warmup_model(self):
+        """Warm up the model with dummy input to optimize first inference"""
+        try:
+            dummy_audio = torch.zeros(16000, device=self.device)  # 1 second of silence
+            dummy_inputs = self.processor(
+                dummy_audio,
+                sampling_rate=16_000,
+                return_tensors="pt",
+                padding=True,
+            )
+
+            # Move inputs to device
+            dummy_inputs = {k: v.to(self.device) for k, v in dummy_inputs.items()}
+
+            # Run dummy inference
+            with torch.no_grad():
+                _ = self.model(
+                    dummy_inputs["input_values"],
+                    attention_mask=dummy_inputs.get("attention_mask")
+                )
+            print("Model warmed up successfully")
+        except Exception as e:
+            print(f"Warmup failed: {e}")

     def buffer_to_text(self, audio_buffer):
         if len(audio_buffer) == 0:
             return ""

+        # Convert to tensor with optimal dtype and device placement
+        if isinstance(audio_buffer, np.ndarray):
+            audio_tensor = torch.from_numpy(audio_buffer).float()
+        else:
+            audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
+
+        # Use optimized processing
         inputs = self.processor(
-            torch.tensor(audio_buffer),
+            audio_tensor,
             sampling_rate=16_000,
             return_tensors="pt",
             padding=True,
         )

-        with torch.no_grad():
-            logits = self.model(
-                inputs.input_values.to(self.device),
-                attention_mask=inputs.attention_mask.to(self.device),
-            ).logits
-
+        # Move to device in one operation
+        input_values = inputs.input_values.to(self.device, non_blocking=True)
+        attention_mask = inputs.attention_mask.to(self.device, non_blocking=True) if "attention_mask" in inputs else None
+
+        # Optimized inference with mixed precision for GPU
+        if self.device in ["cuda", "mps"]:
+            with torch.no_grad(), torch.autocast(device_type=self.device.replace("mps", "cpu"), enabled=self.device=="cuda"):
+                if attention_mask is not None:
+                    logits = self.model(input_values, attention_mask=attention_mask).logits
+                else:
+                    logits = self.model(input_values).logits
+        else:
+            # CPU inference optimization
+            with torch.no_grad():
+                if attention_mask is not None:
+                    logits = self.model(input_values, attention_mask=attention_mask).logits
+                else:
+                    logits = self.model(input_values).logits
+
+        # Optimized decoding
         if hasattr(self.processor, "decoder") and self.use_lm_if_possible:
+            # Move to CPU for decoder processing (decoder only works on CPU)
+            logits_cpu = logits[0].cpu().numpy()
             transcription = self.processor.decode(
-                logits[0].cpu().numpy(),
+                logits_cpu,
                 hotwords=self.hotwords,
-                # hotword_weight=self.hotword_weight,
                 output_word_offsets=True,
             )
-            confidence = transcription.lm_score / len(transcription.text.split(" "))
+            confidence = transcription.lm_score / max(len(transcription.text.split(" ")), 1)
             transcription: str = transcription.text
         else:
+            # Fast argmax on GPU/MPS, then move to CPU for batch_decode
             predicted_ids = torch.argmax(logits, dim=-1)
+            if self.device != "cpu":
+                predicted_ids = predicted_ids.cpu()
             transcription: str = self.processor.batch_decode(predicted_ids)[0]
-            # confidence = self.confidence_score(logits, predicted_ids)
-        return transcription.lower()
+
+        return transcription.lower().strip()

     def confidence_score(self, logits, predicted_ids):
         scores = torch.nn.functional.softmax(logits, dim=-1)
@@ -67,48 +180,118 @@ class Wave2Vec2Inference:
         return total_average

     def file_to_text(self, filename):
-        import librosa
-
-        audio_input, samplerate = librosa.load(filename, sr=16000)
-        return self.buffer_to_text(audio_input)
+        # Optimized audio loading
+        try:
+            audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
+            return self.buffer_to_text(audio_input)
+        except Exception as e:
+            print(f"Error loading audio file {filename}: {e}")
+            return ""


 class Wave2Vec2ONNXInference:
     def __init__(self, model_name, onnx_path):
         self.processor = Wav2Vec2Processor.from_pretrained(model_name)
-        # self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
+
+        # Optimized ONNX Runtime session
         options = rt.SessionOptions()
         options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
-        self.model = rt.InferenceSession(onnx_path, options)
+        options.execution_mode = rt.ExecutionMode.ORT_PARALLEL
+        options.inter_op_num_threads = 0  # Use all available cores
+        options.intra_op_num_threads = 0  # Use all available cores
+
+        # Enable CPU optimizations
+        providers = []
+        if rt.get_device() == 'GPU':
+            providers.append('CUDAExecutionProvider')
+        providers.extend(['CPUExecutionProvider'])
+
+        self.model = rt.InferenceSession(
+            onnx_path,
+            options,
+            providers=providers
+        )
+
+        # Pre-compile input name for faster access
+        self.input_name = self.model.get_inputs()[0].name
+        print(f"ONNX model loaded with providers: {self.model.get_providers()}")

     def buffer_to_text(self, audio_buffer):
         if len(audio_buffer) == 0:
             return ""

+        # Optimized preprocessing
+        if isinstance(audio_buffer, np.ndarray):
+            audio_tensor = torch.from_numpy(audio_buffer).float()
+        else:
+            audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
+
         inputs = self.processor(
-            torch.tensor(audio_buffer),
+            audio_tensor,
             sampling_rate=16_000,
             return_tensors="np",
             padding=True,
         )

-        input_values = inputs.input_values
+        # Optimized ONNX inference
+        input_values = inputs.input_values.astype(np.float32)
         onnx_outputs = self.model.run(
-            None, {self.model.get_inputs()[0].name: input_values}
+            None,
+            {self.input_name: input_values}
         )[0]
+
+        # Fast argmax and decoding
         prediction = np.argmax(onnx_outputs, axis=-1)
-
         transcription = self.processor.decode(prediction.squeeze().tolist())
-        return transcription.lower()
+        return transcription.lower().strip()

     def file_to_text(self, filename):
-        audio_input, samplerate = librosa.load(filename, sr=16000)
-        return self.buffer_to_text(audio_input)
+        try:
+            audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
+            return self.buffer_to_text(audio_input)
+        except Exception as e:
+            print(f"Error loading audio file {filename}: {e}")
+            return ""


 # took that script from: https://github.com/ccoreilly/wav2vec2-service/blob/master/convert_torch_to_onnx.py


+class OptimizedWave2Vec2Factory:
+    """Factory class to create the most optimized Wave2Vec2 inference instance"""
+
+    @staticmethod
+    def create_optimized_inference(model_name, onnx_path=None, safe_mode=False, **kwargs):
+        """
+        Create the most optimized inference instance based on available resources
+
+        Args:
+            model_name: HuggingFace model name
+            onnx_path: Path to ONNX model (optional, for maximum speed)
+            safe_mode: If True, disable aggressive optimizations that might cause issues
+            **kwargs: Additional arguments for Wave2Vec2Inference
+
+        Returns:
+            Optimized inference instance
+        """
+        if onnx_path and os.path.exists(onnx_path):
+            print("Using ONNX model for maximum speed")
+            return Wave2Vec2ONNXInference(model_name, onnx_path)
+        else:
+            print("Using PyTorch model with optimizations")
+            # In safe mode, disable optimizations that might cause issues
+            if safe_mode:
+                kwargs['enable_optimizations'] = False
+                print("Running in safe mode - optimizations disabled")
+            return Wave2Vec2Inference(model_name, **kwargs)
+
+    @staticmethod
+    def create_safe_inference(model_name, **kwargs):
+        """Create a safe inference instance without aggressive optimizations"""
+        kwargs['enable_optimizations'] = False
+        return Wave2Vec2Inference(model_name, **kwargs)
+
+
 def convert_to_onnx(model_id_or_path, onnx_model_name):
     print(f"Converting {model_id_or_path} to onnx")
     model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
@@ -157,27 +340,48 @@ if __name__ == "__main__":
157
  from loguru import logger
158
  import time
159
 
160
- asr = Wave2Vec2Inference("facebook/wav2vec2-large-960h-lv60-self")
 
 
 
 
 
 
 
 
 
161
 
162
- # Warm up runs
163
- print("Warming up...")
164
  for i in range(2):
165
- asr.file_to_text("test.wav")
166
  print(f"Warm up {i+1} completed")
167
 
168
  # Test runs
169
- print("Running tests...")
170
  times = []
171
  for i in range(10):
172
  start_time = time.time()
173
- text = asr.file_to_text("test.wav")
174
  end_time = time.time()
175
  execution_time = end_time - start_time
176
  times.append(execution_time)
177
  print(f"Test {i+1}: {execution_time:.3f}s - {text}")
178
 
179
- # Calculate average time
180
  average_time = sum(times) / len(times)
181
- print(f"\nAverage execution time: {average_time:.3f}s")
182
- print(f"Min time: {min(times):.3f}s")
183
- print(f"Max time: {max(times):.3f}s")
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import onnxruntime as rt
9
  import numpy as np
10
  import librosa
11
+ import warnings
12
+ import os
13
+ warnings.filterwarnings("ignore")
14
 
15
 
16
  class Wave2Vec2Inference:
17
+ def __init__(self, model_name, hotwords=[], use_lm_if_possible=True, use_gpu=True, enable_optimizations=True):
18
+ # Auto-detect best available device
19
+ if use_gpu:
20
+ if torch.backends.mps.is_available():
21
+ self.device = "mps"
22
+ elif torch.cuda.is_available():
23
+ self.device = "cuda"
24
+ else:
25
+ self.device = "cpu"
26
+ else:
27
+ self.device = "cpu"
28
+
29
+ print(f"Using device: {self.device}")
30
+
31
+ # Set optimal torch settings for inference
32
+ torch.set_grad_enabled(False) # Disable gradients globally for inference
33
+
34
+ if self.device == "cpu":
35
+ # CPU optimizations
36
+ torch.set_num_threads(torch.get_num_threads()) # Use all available CPU cores
37
+ torch.set_float32_matmul_precision('high')
38
+ elif self.device == "cuda":
39
+ # CUDA optimizations
40
+ torch.backends.cudnn.benchmark = True # Enable cuDNN benchmark mode
41
+ torch.backends.cudnn.deterministic = False
42
+ elif self.device == "mps":
43
+ # MPS optimizations
44
+ torch.backends.mps.enable_fallback = True
45
+
46
  if use_lm_if_possible:
47
  self.processor = AutoProcessor.from_pretrained(model_name)
48
  else:
49
  self.processor = Wav2Vec2Processor.from_pretrained(model_name)
50
+
51
  self.model = AutoModelForCTC.from_pretrained(model_name)
52
  self.model.to(self.device)
53
+
54
+ # Set model to evaluation mode for inference optimization
55
+ self.model.eval()
56
+
57
+ # Try to optimize model for inference (safe version) - only if enabled
58
+ if enable_optimizations:
59
+ try:
60
+ # First try torch.compile (PyTorch 2.0+) - more robust
61
+ if hasattr(torch, 'compile') and self.device != "mps": # MPS doesn't support torch.compile yet
62
+ self.model = torch.compile(self.model, mode="reduce-overhead")
63
+ print("Model compiled with torch.compile for faster inference")
64
+ else:
65
+ # Alternative: try JIT scripting for older PyTorch versions
66
+ try:
67
+ scripted_model = torch.jit.script(self.model)
68
+ if hasattr(torch.jit, 'optimize_for_inference'):
69
+ scripted_model = torch.jit.optimize_for_inference(scripted_model)
70
+ self.model = scripted_model
71
+ print("Model optimized with JIT scripting")
72
+ except Exception as jit_e:
73
+ print(f"JIT optimization failed, using regular model: {jit_e}")
74
+ except Exception as e:
75
+ print(f"Model optimization failed, using regular model: {e}")
76
+ else:
77
+ print("Model optimizations disabled")
78
+
79
  self.hotwords = hotwords
80
  self.use_lm_if_possible = use_lm_if_possible
81
+
82
+ # Pre-allocate tensors for common audio lengths to avoid repeated allocation
83
+ self.tensor_cache = {}
84
+
85
+ # Warm up the model with a dummy input (only if optimizations enabled)
86
+ if enable_optimizations:
87
+ self._warmup_model()
88
+
89
+ def _warmup_model(self):
90
+ """Warm up the model with dummy input to optimize first inference"""
91
+ try:
92
+ dummy_audio = torch.zeros(16000, device=self.device) # 1 second of silence
93
+ dummy_inputs = self.processor(
94
+ dummy_audio,
95
+ sampling_rate=16_000,
96
+ return_tensors="pt",
97
+ padding=True,
98
+ )
99
+
100
+ # Move inputs to device
101
+ dummy_inputs = {k: v.to(self.device) for k, v in dummy_inputs.items()}
102
+
103
+ # Run dummy inference
104
+ with torch.no_grad():
105
+ _ = self.model(
106
+ dummy_inputs["input_values"],
107
+ attention_mask=dummy_inputs.get("attention_mask")
108
+ )
109
+ print("Model warmed up successfully")
110
+ except Exception as e:
111
+ print(f"Warmup failed: {e}")
112
 
113
  def buffer_to_text(self, audio_buffer):
114
  if len(audio_buffer) == 0:
115
  return ""
116
 
117
+ # Convert to tensor with optimal dtype and device placement
118
+ if isinstance(audio_buffer, np.ndarray):
119
+ audio_tensor = torch.from_numpy(audio_buffer).float()
120
+ else:
121
+ audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
122
+
123
+ # Use optimized processing
124
  inputs = self.processor(
125
+ audio_tensor,
126
  sampling_rate=16_000,
127
  return_tensors="pt",
128
  padding=True,
129
  )
130
 
131
+ # Move to device in one operation
132
+ input_values = inputs.input_values.to(self.device, non_blocking=True)
133
+ attention_mask = inputs.attention_mask.to(self.device, non_blocking=True) if "attention_mask" in inputs else None
134
+
135
+ # Optimized inference with mixed precision for GPU
136
+ if self.device in ["cuda", "mps"]:
137
+ with torch.no_grad(), torch.autocast(device_type=self.device.replace("mps", "cpu"), enabled=self.device=="cuda"):
138
+ if attention_mask is not None:
139
+ logits = self.model(input_values, attention_mask=attention_mask).logits
140
+ else:
141
+ logits = self.model(input_values).logits
142
+ else:
143
+ # CPU inference optimization
144
+ with torch.no_grad():
145
+ if attention_mask is not None:
146
+ logits = self.model(input_values, attention_mask=attention_mask).logits
147
+ else:
148
+ logits = self.model(input_values).logits
149
+
150
+ # Optimized decoding
151
  if hasattr(self.processor, "decoder") and self.use_lm_if_possible:
152
+ # Move to CPU for decoder processing (decoder only works on CPU)
153
+ logits_cpu = logits[0].cpu().numpy()
154
  transcription = self.processor.decode(
155
+ logits_cpu,
156
  hotwords=self.hotwords,
 
157
  output_word_offsets=True,
158
  )
159
+ confidence = transcription.lm_score / max(len(transcription.text.split(" ")), 1)
160
  transcription: str = transcription.text
161
  else:
162
+ # Fast argmax on GPU/MPS, then move to CPU for batch_decode
163
  predicted_ids = torch.argmax(logits, dim=-1)
164
+ if self.device != "cpu":
165
+ predicted_ids = predicted_ids.cpu()
166
  transcription: str = self.processor.batch_decode(predicted_ids)[0]
167
+
168
+ return transcription.lower().strip()
169
 
170
  def confidence_score(self, logits, predicted_ids):
171
  scores = torch.nn.functional.softmax(logits, dim=-1)
 
180
  return total_average
181
 
182
  def file_to_text(self, filename):
183
+ # Optimized audio loading
184
+ try:
185
+ audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
186
+ return self.buffer_to_text(audio_input)
187
+ except Exception as e:
188
+ print(f"Error loading audio file {filename}: {e}")
189
+ return ""
190
 
191
 
192
  class Wave2Vec2ONNXInference:
193
  def __init__(self, model_name, onnx_path):
194
  self.processor = Wav2Vec2Processor.from_pretrained(model_name)
195
+
196
+ # Optimized ONNX Runtime session
197
  options = rt.SessionOptions()
198
  options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
199
+ options.execution_mode = rt.ExecutionMode.ORT_PARALLEL
200
+ options.inter_op_num_threads = 0 # Use all available cores
201
+ options.intra_op_num_threads = 0 # Use all available cores
202
+
203
+ # Enable CPU optimizations
204
+ providers = []
205
+ if rt.get_device() == 'GPU':
206
+ providers.append('CUDAExecutionProvider')
207
+ providers.extend(['CPUExecutionProvider'])
208
+
209
+ self.model = rt.InferenceSession(
210
+ onnx_path,
211
+ options,
212
+ providers=providers
213
+ )
214
+
215
+ # Pre-compile input name for faster access
216
+ self.input_name = self.model.get_inputs()[0].name
217
+ print(f"ONNX model loaded with providers: {self.model.get_providers()}")
218
 
219
  def buffer_to_text(self, audio_buffer):
220
  if len(audio_buffer) == 0:
221
  return ""
222
 
223
+ # Optimized preprocessing
224
+ if isinstance(audio_buffer, np.ndarray):
225
+ audio_tensor = torch.from_numpy(audio_buffer).float()
226
+ else:
227
+ audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
228
+
229
  inputs = self.processor(
230
+ audio_tensor,
231
  sampling_rate=16_000,
232
  return_tensors="np",
233
  padding=True,
234
  )
235
 
236
+ # Optimized ONNX inference
237
+ input_values = inputs.input_values.astype(np.float32)
238
  onnx_outputs = self.model.run(
239
+ None,
240
+ {self.input_name: input_values}
241
  )[0]
242
+
243
+ # Fast argmax and decoding
244
  prediction = np.argmax(onnx_outputs, axis=-1)
 
245
  transcription = self.processor.decode(prediction.squeeze().tolist())
246
+ return transcription.lower().strip()
247
 
248
  def file_to_text(self, filename):
249
+ try:
250
+ audio_input, samplerate = librosa.load(filename, sr=16000, dtype=np.float32)
251
+ return self.buffer_to_text(audio_input)
252
+ except Exception as e:
253
+ print(f"Error loading audio file {filename}: {e}")
254
+ return ""
255
 
256
 
257
  # took that script from: https://github.com/ccoreilly/wav2vec2-service/blob/master/convert_torch_to_onnx.py
258
 
259
 
260
+ class OptimizedWave2Vec2Factory:
261
+ """Factory class to create the most optimized Wave2Vec2 inference instance"""
262
+
263
+ @staticmethod
264
+ def create_optimized_inference(model_name, onnx_path=None, safe_mode=False, **kwargs):
265
+ """
266
+ Create the most optimized inference instance based on available resources
267
+
268
+ Args:
269
+ model_name: HuggingFace model name
270
+ onnx_path: Path to ONNX model (optional, for maximum speed)
271
+ safe_mode: If True, disable aggressive optimizations that might cause issues
272
+ **kwargs: Additional arguments for Wave2Vec2Inference
273
+
274
+ Returns:
275
+ Optimized inference instance
276
+ """
277
+ if onnx_path and os.path.exists(onnx_path):
278
+ print("Using ONNX model for maximum speed")
279
+ return Wave2Vec2ONNXInference(model_name, onnx_path)
280
+ else:
281
+ print("Using PyTorch model with optimizations")
282
+ # In safe mode, disable optimizations that might cause issues
283
+ if safe_mode:
284
+ kwargs['enable_optimizations'] = False
285
+ print("Running in safe mode - optimizations disabled")
286
+ return Wave2Vec2Inference(model_name, **kwargs)
287
+
288
+ @staticmethod
289
+ def create_safe_inference(model_name, **kwargs):
290
+ """Create a safe inference instance without aggressive optimizations"""
291
+ kwargs['enable_optimizations'] = False
292
+ return Wave2Vec2Inference(model_name, **kwargs)
293
+
294
+
295
  def convert_to_onnx(model_id_or_path, onnx_model_name):
296
  print(f"Converting {model_id_or_path} to onnx")
297
  model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
 
340
  from loguru import logger
341
  import time
342
 
343
+ # Use optimized factory to create the best inference instance
344
+ asr = OptimizedWave2Vec2Factory.create_optimized_inference(
345
+ "facebook/wav2vec2-large-960h-lv60-self"
346
+ )
347
+
348
+ # Test if file exists
349
+ test_file = "test.wav"
350
+ if not os.path.exists(test_file):
351
+ print(f"Test file {test_file} not found. Please provide a valid audio file.")
352
+ exit(1)
353
 
354
+ # Warm up runs (model already warmed up during initialization)
355
+ print("Running additional warm-up...")
356
  for i in range(2):
357
+ asr.file_to_text(test_file)
358
  print(f"Warm up {i+1} completed")
359
 
360
  # Test runs
361
+ print("Running optimized performance tests...")
362
  times = []
363
  for i in range(10):
364
  start_time = time.time()
365
+ text = asr.file_to_text(test_file)
366
  end_time = time.time()
367
  execution_time = end_time - start_time
368
  times.append(execution_time)
369
  print(f"Test {i+1}: {execution_time:.3f}s - {text}")
370
 
371
+ # Calculate statistics
372
  average_time = sum(times) / len(times)
373
+ min_time = min(times)
374
+ max_time = max(times)
375
+ std_time = np.std(times)
376
+
377
+ print(f"\n=== Performance Statistics ===")
378
+ print(f"Average execution time: {average_time:.3f}s")
379
+ print(f"Min time: {min_time:.3f}s")
380
+ print(f"Max time: {max_time:.3f}s")
381
+ print(f"Standard deviation: {std_time:.3f}s")
382
+ print(f"Speed improvement: ~{((max_time - min_time) / max_time * 100):.1f}% faster (min vs max)")
383
+
384
+ # Calculate throughput
385
+ if times:
386
+ throughput = 1.0 / average_time
387
+ print(f"Average throughput: {throughput:.2f} inferences/second")
src/apis/__pycache__/create_app.cpython-311.pyc CHANGED
Binary files a/src/apis/__pycache__/create_app.cpython-311.pyc and b/src/apis/__pycache__/create_app.cpython-311.pyc differ
 
src/apis/controllers/speaking_controller.py CHANGED
@@ -1,18 +1,19 @@
-from typing import List, Dict
+from typing import List, Dict, Tuple, Optional
 import numpy as np
 import librosa
 import nltk
 import eng_to_ipa as ipa
-import torch
 import re
 from collections import defaultdict
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
-from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
 from loguru import logger
 import time
 from src.AI_Models.wave2vec_inference import (
     Wave2Vec2Inference,
     Wave2Vec2ONNXInference,
     export_to_onnx,
 )
@@ -24,8 +25,34 @@ except:
     print("Warning: NLTK data not available")


-class Wav2Vec2CharacterASR:
-    """Wav2Vec2 character-level ASR with support for both ONNX and Transformers inference"""

     def __init__(
         self,
@@ -33,605 +60,484 @@ class Wav2Vec2CharacterASR:
         onnx: bool = False,
         quantized: bool = False,
     ):
-        """
-        Initialize Wav2Vec2 character-level model
-
-        Args:
-            model_name: HuggingFace model name
-            onnx: If True, use ONNX runtime for inference. If False, use Transformers
-            onnx_model_path: Path to the ONNX model file (only used if onnx=True)
-        """
         self.use_onnx = onnx
         self.sample_rate = 16000
         self.model_name = model_name
-        # Check whether the ONNX model path exists
         if onnx:
             import os
-
-            if not os.path.exists(
-                "wav2vec2-large-960h-lv60-self"
-                + (".quant" if quantized else "")
-                + ".onnx"
-            ):
-
                 export_to_onnx(model_name, quantize=quantized)
-        self.model = (
-            Wave2Vec2Inference(model_name)
-            if not onnx
-            else Wave2Vec2ONNXInference(
-                model_name,
-                "wav2vec2-large-960h-lv60-self"
-                + (".quant" if quantized else "")
-                + ".onnx",
-            )
         )

-    def transcribe_to_characters(self, audio_path: str) -> Dict:
         try:
             start_time = time.time()
             character_transcript = self.model.file_to_text(audio_path)
-            character_transcript = self._clean_character_transcript(
-                character_transcript
-            )
-
-            phoneme_like_transcript = self._characters_to_phoneme_representation(
-                character_transcript
-            )
-
-            logger.info(f"Transcription time: {time.time() - start_time:.2f}s")
-
             return {
                 "character_transcript": character_transcript,
-                "phoneme_representation": phoneme_like_transcript,
             }
-
         except Exception as e:
-            print(f"Transformers transcription error: {e}")
             return self._empty_result()

-    def _calculate_confidence_scores(self, logits: np.ndarray) -> List[float]:
-        """Calculate confidence scores from logits using numpy"""
-        # Apply softmax
-        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
-        softmax_probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
-
-        # Get max probabilities
-        max_probs = np.max(softmax_probs, axis=-1)[0]
-        return max_probs.tolist()

     def _clean_character_transcript(self, transcript: str) -> str:
         """Clean and standardize character transcript"""
-        # Remove extra spaces and special tokens
         logger.info(f"Raw transcript before cleaning: {transcript}")
-        cleaned = re.sub(r"\s+", " ", transcript)
-        cleaned = cleaned.strip().lower()
-        return cleaned

     def _characters_to_phoneme_representation(self, text: str) -> str:
-        """Convert character-based transcript to phoneme-like representation for comparison"""
         if not text:
             return ""
-
         words = text.split()
         phoneme_words = []
-        g2p = SimpleG2P()
         for word in words:
             try:
                 if g2p:
-                    word_data = g2p.text_to_phonemes(word)[0]
-                    phoneme_words.extend(word_data["phonemes"])
                 else:
                     phoneme_words.extend(self._simple_letter_to_phoneme(word))
             except:
-                # Fallback: simple letter-to-sound mapping
                 phoneme_words.extend(self._simple_letter_to_phoneme(word))
-
         return " ".join(phoneme_words)

     def _simple_letter_to_phoneme(self, word: str) -> List[str]:
-        """Simple fallback letter-to-phoneme conversion"""
         letter_to_phoneme = {
-            "a": "æ",
-            "b": "b",
-            "c": "k",
-            "d": "d",
-            "e": "ɛ",
-            "f": "f",
-            "g": "ɡ",
-            "h": "h",
-            "i": "ɪ",
-            "j": "dʒ",
-            "k": "k",
-            "l": "l",
-            "m": "m",
-            "n": "n",
-            "o": "ʌ",
-            "p": "p",
-            "q": "k",
-            "r": "r",
-            "s": "s",
-            "t": "t",
-            "u": "ʌ",
-            "v": "v",
-            "w": "w",
-            "x": "ks",
-            "y": "j",
-            "z": "z",
         }

-        phonemes = []
-        for letter in word.lower():
-            if letter in letter_to_phoneme:
-                phonemes.append(letter_to_phoneme[letter])
-
-        return phonemes

     def _empty_result(self) -> Dict:
-        """Return empty result structure"""
         return {
             "character_transcript": "",
             "phoneme_representation": "",
-            "raw_predicted_ids": [],
-            "confidence_scores": [],
         }

-    def get_model_info(self) -> Dict:
-        """Get information about the loaded model"""
-        info = {
-            "model_name": self.model_name,
-            "sample_rate": self.sample_rate,
-            "inference_method": "ONNX" if self.use_onnx else "Transformers",
-        }
-
-        if self.use_onnx:
-            info.update(
-                {
-                    "onnx_model_path": self.onnx_model_path,
-                    "input_name": self.input_name,
-                    "output_name": self.output_name,
-                    "session_providers": self.session.get_providers(),
-                }
-            )
-
-        return info
-

-class SimpleG2P:
-    """Simple Grapheme-to-Phoneme converter for reference text"""

     def __init__(self):
         try:
             self.cmu_dict = cmudict.dict()
         except:
             self.cmu_dict = {}
-            print("Warning: CMU dictionary not available")

     def text_to_phonemes(self, text: str) -> List[Dict]:
-        """Convert text to phoneme sequence"""
         words = self._clean_text(text).split()
         phoneme_sequence = []

         for word in words:
-            word_phonemes = self._get_word_phonemes(word)
-            phoneme_sequence.append(
-                {
-                    "word": word,
-                    "phonemes": word_phonemes,
-                    "ipa": self._get_ipa(word),
-                    "phoneme_string": " ".join(word_phonemes),
-                }
-            )

         return phoneme_sequence

-    def get_reference_phoneme_string(self, text: str) -> str:
-        """Get reference phoneme string for comparison"""
-        phoneme_sequence = self.text_to_phonemes(text)
-        all_phonemes = []
-
-        for word_data in phoneme_sequence:
-            all_phonemes.extend(word_data["phonemes"])
-
-        return " ".join(all_phonemes)
-
-    def _clean_text(self, text: str) -> str:
-        """Clean text for processing"""
-        text = re.sub(r"[^\w\s\']", " ", text)
-        text = re.sub(r"\s+", " ", text)
-        return text.lower().strip()
-
-    def _get_word_phonemes(self, word: str) -> List[str]:
-        """Get phonemes for a word"""
-        word_lower = word.lower()
-
-        if word_lower in self.cmu_dict:
-            # Remove stress markers and convert to Wav2Vec2 phoneme format
-            phonemes = self.cmu_dict[word_lower][0]
-            clean_phonemes = [re.sub(r"[0-9]", "", p) for p in phonemes]
-            return self._convert_to_wav2vec_format(clean_phonemes)
-        else:
-            return self._estimate_phonemes(word)
-
-    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
-        """Convert CMU phonemes to Wav2Vec2 format"""
-        # Mapping from CMU to Wav2Vec2/eSpeak phonemes
-        cmu_to_espeak = {
-            "AA": "ɑ",
-            "AE": "æ",
-            "AH": "ʌ",
-            "AO": "ɔ",
-            "AW": "aʊ",
-            "AY": "aɪ",
-            "EH": "ɛ",
-            "ER": "ɝ",
-            "EY": "eɪ",
-            "IH": "ɪ",
-            "IY": "i",
-            "OW": "oʊ",
-            "OY": "ɔɪ",
-            "UH": "ʊ",
-            "UW": "u",
-            "B": "b",
-            "CH": "tʃ",
-            "D": "d",
-            "DH": "ð",
-            "F": "f",
-            "G": "ɡ",
-            "HH": "h",
-            "JH": "dʒ",
-            "K": "k",
-            "L": "l",
-            "M": "m",
-            "N": "n",
-            "NG": "ŋ",
-            "P": "p",
-            "R": "r",
-            "S": "s",
-            "SH": "ʃ",
-            "T": "t",
-            "TH": "θ",
-            "V": "v",
-            "W": "w",
-            "Y": "j",
-            "Z": "z",
-            "ZH": "ʒ",
         }
-
-        converted = []
         for phoneme in cmu_phonemes:
-            converted_phoneme = cmu_to_espeak.get(phoneme, phoneme.lower())
-            converted.append(converted_phoneme)
-
-        return converted
-
-    def _get_ipa(self, word: str) -> str:
-        """Get IPA transcription"""
-        try:
-            return ipa.convert(word)
-        except:
-            return f"/{word}/"

     def _estimate_phonemes(self, word: str) -> List[str]:
         """Estimate phonemes for unknown words"""
-        # Basic phoneme estimation with eSpeak-style output
         phoneme_map = {
-            "ch": ["tʃ"],
-            "sh": ["ʃ"],
-            "th": ["θ"],
-            "ph": ["f"],
-            "ck": ["k"],
-            "ng": ["ŋ"],
-            "qu": ["k", "w"],
-            "a": ["æ"],
-            "e": ["ɛ"],
-            "i": ["ɪ"],
-            "o": ["ʌ"],
-            "u": ["ʌ"],
-            "b": ["b"],
-            "c": ["k"],
-            "d": ["d"],
-            "f": ["f"],
-            "g": ["ɡ"],
-            "h": ["h"],
-            "j": ["dʒ"],
-            "k": ["k"],
-            "l": ["l"],
-            "m": ["m"],
-            "n": ["n"],
-            "p": ["p"],
-            "r": ["r"],
-            "s": ["s"],
-            "t": ["t"],
-            "v": ["v"],
-            "w": ["w"],
-            "x": ["k", "s"],
-            "y": ["j"],
-            "z": ["z"],
         }
-
-        word = word.lower()
         phonemes = []
         i = 0
-
         while i < len(word):
-            # Check 2-letter combinations first
             if i <= len(word) - 2:
-                two_char = word[i : i + 2]
                 if two_char in phoneme_map:
-                    phonemes.extend(phoneme_map[two_char])
                     i += 2
                     continue
-
-            # Single character
             char = word[i]
             if char in phoneme_map:
-                phonemes.extend(phoneme_map[char])
-
             i += 1
-
         return phonemes

-    def get_visualization_data(self, text: str) -> List[Dict]:
-        """Get visualization data for IPA representation"""
-        words = self._clean_text(text).split()
-        visualization_data = []
-
-        for word in words:
-            word_phonemes = self._get_word_phonemes(word)
-            ipa_transcription = self._get_ipa(word)
-
-            visualization_data.append({
-                "word": word,
-                "phonemes": word_phonemes,
-                "ipa": ipa_transcription,
-                "phoneme_string": " ".join(word_phonemes),
-                "visualization": self._create_phoneme_visualization(word_phonemes)
-            })

-        return visualization_data

     def _create_phoneme_visualization(self, phonemes: List[str]) -> List[Dict]:
         """Create visualization data for phonemes"""
         visualization = []
         for phoneme in phonemes:
-            # Map phonemes to color categories for visualization
             color_category = self._get_phoneme_color_category(phoneme)
             visualization.append({
                 "phoneme": phoneme,
                 "color_category": color_category,
-                "description": self._get_phoneme_description(phoneme)
             })
         return visualization

     def _get_phoneme_color_category(self, phoneme: str) -> str:
         """Categorize phonemes by color for visualization"""
         vowel_phonemes = {"ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u"}
-        consonant_phonemes = {
-            # Plosives
-            "p", "b", "t", "d", "k", "ɡ",
-            # Nasals
-            "m", "n", "ŋ",
-            # Fricatives
-            "f", "v", "θ", "ð", "s", "z", "ʃ", "ʒ", "h",
-            # Affricates
-            "tʃ", "dʒ",
-            # Liquids
-            "l", "r",
-            # Glides
-            "w", "j"
-        }

         if phoneme in vowel_phonemes:
             return "vowel"
-        elif phoneme in consonant_phonemes:
-            return "consonant"
         else:
-            return "other"

     def _get_phoneme_description(self, phoneme: str) -> str:
         """Get description for a phoneme"""
         descriptions = {
-            # Vowels
-            "ɑ": "Open back unrounded vowel (like 'a' in 'father')",
-            "æ": "Near-open front unrounded vowel (like 'a' in 'cat')",
-            "ʌ": "Open-mid back unrounded vowel (like 'u' in 'cup')",
-            "ɔ": "Open-mid back rounded vowel (like 'o' in 'thought')",
-            "aʊ": "Diphthong (like 'ow' in 'cow')",
-            "aɪ": "Diphthong (like 'i' in 'bike')",
-            "ɛ": "Open-mid front unrounded vowel (like 'e' in 'bed')",
-            "ɝ": "R-colored vowel (like 'er' in 'her')",
-            "eɪ": "Diphthong (like 'a' in 'cake')",
-            "ɪ": "Near-close near-front unrounded vowel (like 'i' in 'sit')",
-            "i": "Close front unrounded vowel (like 'ee' in 'see')",
-            "oʊ": "Diphthong (like 'o' in 'go')",
-            "ɔɪ": "Diphthong (like 'oy' in 'boy')",
-            "ʊ": "Near-close near-back rounded vowel (like 'u' in 'put')",
-            "u": "Close back rounded vowel (like 'oo' in 'food')",
-            # Consonants
-            "p": "Voiceless bilabial plosive (like 'p' in 'pen')",
-            "b": "Voiced bilabial plosive (like 'b' in 'bat')",
-            "t": "Voiceless alveolar plosive (like 't' in 'top')",
-            "d": "Voiced alveolar plosive (like 'd' in 'dog')",
-            "k": "Voiceless velar plosive (like 'c' in 'cat')",
-            "ɡ": "Voiced velar plosive (like 'g' in 'go')",
-            "m": "Bilabial nasal (like 'm' in 'man')",
-            "n": "Alveolar nasal (like 'n' in 'net')",
-            "ŋ": "Velar nasal (like 'ng' in 'sing')",
-            "f": "Voiceless labiodental fricative (like 'f' in 'fan')",
-            "v": "Voiced labiodental fricative (like 'v' in 'van')",
             "θ": "Voiceless dental fricative (like 'th' in 'think')",
             "ð": "Voiced dental fricative (like 'th' in 'this')",
-            "s": "Voiceless alveolar fricative (like 's' in 'sit')",
             "z": "Voiced alveolar fricative (like 'z' in 'zip')",
-            "ʃ": "Voiceless postalveolar fricative (like 'sh' in 'ship')",
             "ʒ": "Voiced postalveolar fricative (like 's' in 'measure')",
-            "h": "Voiceless glottal fricative (like 'h' in 'hat')",
-            "tʃ": "Voiceless postalveolar affricate (like 'ch' in 'chat')",
-            "dʒ": "Voiced postalveolar affricate (like 'j' in 'jet')",
-            "l": "Alveolar lateral approximant (like 'l' in 'let')",
             "r": "Alveolar approximant (like 'r' in 'red')",
             "w": "Labial-velar approximant (like 'w' in 'wet')",
-            "j": "Palatal approximant (like 'y' in 'yes')",
         }
         return descriptions.get(phoneme, f"Phoneme: {phoneme}")

-class PhonemeComparator:
-    """Compare reference and learner phoneme sequences"""
-
-    def __init__(self):
-        # Vietnamese speakers' common phoneme substitutions
-        self.substitution_patterns = {
-            "θ": ["f", "s", "t"],  # TH → F, S, T
-            "ð": ["d", "z", "v"],  # DH → D, Z, V
-            "v": ["w", "f"],  # V → W, F
-            "r": ["l"],  # R → L
-            "l": ["r"],  # L → R
-            "z": ["s"],  # Z → S
-            "ʒ": ["ʃ", "z"],  # ZH → SH, Z
-            "ŋ": ["n"],  # NG → N
-        }
-
-        # Difficulty levels for Vietnamese speakers
-        self.difficulty_map = {
-            "θ": 0.9,  # th (think)
-            "ð": 0.9,  # th (this)
-            "v": 0.8,  # v
-            "z": 0.8,  # z
-            "ʒ": 0.9,  # zh (measure)
-            "r": 0.7,  # r
-            "l": 0.6,  # l
-            "w": 0.5,  # w
-            "f": 0.4,  # f
-            "s": 0.3,  # s
-            "ʃ": 0.5,  # sh
-            "tʃ": 0.4,  # ch
-            "dʒ": 0.5,  # j
-            "ŋ": 0.3,  # ng
-        }

-        # Additional Vietnamese substitution patterns
-        self.extended_substitution_patterns = {
-            # Common Vietnamese speaker errors
-            "θ": ["f", "s", "t", "d"],  # TH sound
-            "ð": ["d", "z", "v", "t"],  # DH sound
-            "v": ["w", "f", "b"],  # V sound
-            "w": ["v", "b"],  # W sound
-            "r": ["l", "n"],  # R sound
-            "l": ["r", "n"],  # L sound
-            "z": ["s", "j"],  # Z sound
-            "ʒ": ["ʃ", "z", "s"],  # ZH sound
-            "ʃ": ["s", "ʒ"],  # SH sound
-            "ŋ": ["n", "m"],  # NG sound
-            "tʃ": ["ʃ", "s", "k"],  # CH sound
-            "dʒ": ["ʒ", "j", "g"],  # J sound
-        }

-    def compare_phoneme_sequences(
-        self, reference_phonemes: str, learner_phonemes: str
-    ) -> List[Dict]:
-        """Compare reference and learner phoneme sequences"""

-        # Split phoneme strings
-        ref_phones = reference_phonemes.split()
-        learner_phones = learner_phonemes.split()

-        print(f"Reference phonemes: {ref_phones}")
-        print(f"Learner phonemes: {learner_phones}")

-        # Simple alignment comparison
         comparisons = []
-        max_len = max(len(ref_phones), len(learner_phones))
-
-        for i in range(max_len):
-            ref_phoneme = ref_phones[i] if i < len(ref_phones) else ""
-            learner_phoneme = learner_phones[i] if i < len(learner_phones) else ""
-
-            if ref_phoneme and learner_phoneme:
-                # Both present - check accuracy
-                if ref_phoneme == learner_phoneme:
-                    status = "correct"
-                    score = 1.0
-                elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
-                    status = "acceptable"
                     score = 0.7
                 else:
-                    status = "wrong"
                     score = 0.2
-
-            elif ref_phoneme and not learner_phoneme:
-                # Missing phoneme
-                status = "missing"
-                score = 0.0
-
-            elif learner_phoneme and not ref_phoneme:
-                # Extra phoneme
-                status = "extra"
-                score = 0.0
-            else:
-                continue
-
-            comparison = {
-                "position": i,
-                "reference_phoneme": ref_phoneme,
-                "learner_phoneme": learner_phoneme,
-                "status": status,
-                "score": score,
-                "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
-            }
-
             comparisons.append(comparison)
-
         return comparisons

-    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
-        """Check if learner phoneme is acceptable substitution for Vietnamese speakers"""
-        acceptable = self.extended_substitution_patterns.get(reference, [])
-        return learner in acceptable
-
-
-# =============================================================================
-# WORD ANALYZER
-# =============================================================================


-class WordAnalyzer:
-    """Analyze word-level pronunciation accuracy using character-based ASR"""

     def __init__(self):
-        self.g2p = SimpleG2P()
-        self.comparator = PhonemeComparator()
-
-    def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
-        """Analyze word-level pronunciation using phoneme representation from character ASR"""
         # Get reference phonemes by word
         reference_words = self.g2p.text_to_phonemes(reference_text)
-
-        # Get overall phoneme comparison
-        reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
-        phoneme_comparisons = self.comparator.compare_phoneme_sequences(
             reference_phoneme_string, learner_phonemes
         )
-
-        # Map phonemes back to words
-        word_highlights = self._create_word_highlights(
-            reference_words, phoneme_comparisons
         )
-
-        # Identify wrong words
-        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
-
         return {
             "word_highlights": word_highlights,
             "phoneme_differences": phoneme_comparisons,
             "wrong_words": wrong_words,
         }

-    def _create_word_highlights(
-        self, reference_words: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[Dict]:
-        """Create word highlighting data with enhanced visualization"""
-
         word_highlights = []
         phoneme_index = 0
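The CMU-dictionary lookup that the deleted `_get_word_phonemes` performs can be sketched standalone; a hedged illustration, assuming the NLTK `cmudict` corpus has been downloaded, with "hello" as an arbitrary example word:

    import re
    from nltk.corpus import cmudict  # requires nltk.download("cmudict")

    cmu = cmudict.dict()
    pronunciations = cmu.get("hello", [])     # e.g. [['HH', 'AH0', 'L', 'OW1'], ...]
    if pronunciations:
        # Strip the 0/1/2 stress digits, as the deleted code does with re.sub
        phones = [re.sub(r"[0-9]", "", p) for p in pronunciations[0]]
        print(phones)                         # ['HH', 'AH', 'L', 'OW']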
@@ -642,15 +548,23 @@ class WordAnalyzer:

             # Get phoneme scores for this word
             word_phoneme_scores = []
             for j in range(num_phonemes):
                 if phoneme_index + j < len(phoneme_comparisons):
                     comparison = phoneme_comparisons[phoneme_index + j]
                     word_phoneme_scores.append(comparison["score"])

             # Calculate word score
             word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0

-            # Create word highlight with enhanced visualization data
             highlight = {
                 "word": word,
                 "score": float(word_score),
@@ -661,8 +575,9 @@ class WordAnalyzer:
                 "phoneme_scores": word_phoneme_scores,
                 "phoneme_start_index": phoneme_index,
                 "phoneme_end_index": phoneme_index + num_phonemes - 1,
-                # Enhanced visualization data
-                "phoneme_visualization": self.g2p._create_phoneme_visualization(word_phonemes)
             }

             word_highlights.append(highlight)
@@ -670,17 +585,56 @@ class WordAnalyzer:

         return word_highlights

-    def _identify_wrong_words(
-        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[Dict]:
-        """Identify words that were pronounced incorrectly"""

         wrong_words = []

         for word_highlight in word_highlights:
-            if word_highlight["score"] < 0.6:  # Threshold for wrong pronunciation
-
-                # Find specific phoneme errors for this word
                 start_idx = word_highlight["phoneme_start_index"]
                 end_idx = word_highlight["phoneme_end_index"]
@@ -690,23 +644,19 @@
                 for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
                     comparison = phoneme_comparisons[i]

-                    if comparison["status"] == "wrong":
-                        wrong_phonemes.append(
-                            {
-                                "expected": comparison["reference_phoneme"],
-                                "actual": comparison["learner_phoneme"],
-                                "difficulty": comparison["difficulty"],
-                                "visualization": self.g2p._create_phoneme_visualization([comparison["reference_phoneme"]])[0]
-                            }
-                        )
-                    elif comparison["status"] == "missing":
-                        missing_phonemes.append(
-                            {
-                                "phoneme": comparison["reference_phoneme"],
-                                "difficulty": comparison["difficulty"],
-                                "visualization": self.g2p._create_phoneme_visualization([comparison["reference_phoneme"]])[0]
-                            }
-                        )

                 wrong_word = {
                     "word": word_highlight["word"],
@@ -715,15 +665,64 @@
                     "ipa": word_highlight["ipa"],
                     "wrong_phonemes": wrong_phonemes,
                     "missing_phonemes": missing_phonemes,
-                    "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes),
-                    # Enhanced visualization data
-                    "phoneme_visualization": word_highlight["phoneme_visualization"]
                 }

                 wrong_words.append(wrong_word)

         return wrong_words

     def _get_word_status(self, score: float) -> str:
         """Get word status from score"""
         if score >= 0.8:
@@ -746,14 +745,11 @@
         else:
             return "#ef4444"  # Red

-    def _get_vietnamese_tips(
-        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
-    ) -> List[str]:
-        """Get Vietnamese-specific pronunciation tips"""
-
         tips = []

-        # Tips for specific Vietnamese pronunciation challenges
         vietnamese_tips = {
             "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
             "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
@@ -763,433 +759,501 @@
             "z": "Giống âm 's' nhưng có rung dây thanh âm",
             "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
             "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
         }

-        # Add tips for wrong phonemes
         for wrong in wrong_phonemes:
             expected = wrong["expected"]
-            actual = wrong["actual"]
-
             if expected in vietnamese_tips:
-                tips.append(f"Âm '{expected}': {vietnamese_tips[expected]}")
-            else:
-                tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")

-        # Add tips for missing phonemes
         for missing in missing_phonemes:
             phoneme = missing["phoneme"]
             if phoneme in vietnamese_tips:
-                tips.append(f"Thiếu âm '{phoneme}': {vietnamese_tips[phoneme]}")

         return tips


-class SimpleFeedbackGenerator:
-    """Generate simple, actionable feedback in Vietnamese"""

-    def generate_feedback(
-        self,
-        overall_score: float,
-        wrong_words: List[Dict],
-        phoneme_comparisons: List[Dict],
-    ) -> List[str]:
-        """Generate Vietnamese feedback"""
-
-        feedback = []

-        # Overall feedback in Vietnamese
-        if overall_score >= 0.8:
-            feedback.append("Phát âm rất tốt! Bạn đã làm xuất sắc.")
-        elif overall_score >= 0.6:
-            feedback.append("Phát âm khá tốt, còn một vài điểm cần cải thiện.")
-        elif overall_score >= 0.4:
-            feedback.append(
-                "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
-            )
         else:
-            feedback.append("Hãy luyện tập chậm và rõ ràng hơn.")

-        # Wrong words feedback
-        if wrong_words:
-            if len(wrong_words) <= 3:
-                word_names = [w["word"] for w in wrong_words]
-                feedback.append(f"Các từ cần luyện tập: {', '.join(word_names)}")
         else:
-            feedback.append(
-                f"Có {len(wrong_words)} từ cần luyện tập. Tập trung vào từng từ một."
-            )

-        # Most problematic phonemes
-        problem_phonemes = defaultdict(int)
-        for comparison in phoneme_comparisons:
-            if comparison["status"] in ["wrong", "missing"]:
-                phoneme = comparison["reference_phoneme"]
-                problem_phonemes[phoneme] += 1

-        if problem_phonemes:
-            most_difficult = sorted(
-                problem_phonemes.items(), key=lambda x: x[1], reverse=True
-            )
-            top_problem = most_difficult[0][0]
-
-            phoneme_tips = {
-                "θ": "Lưỡi giữa răng, thổi nhẹ",
-                "ð": "Lưỡi giữa răng, rung dây thanh",
-                "v": "Môi dưới chạm răng trên",
-                "r": "Cuộn lưỡi, không chạm vòm miệng",
-                "l": "Lưỡi chạm vòm miệng",
-                "z": "Như 's' nhưng rung dây thanh",
-            }

-            if top_problem in phoneme_tips:
-                feedback.append(
-                    f"Âm khó nhất '{top_problem}': {phoneme_tips[top_problem]}"
-                )

-        return feedback


-class SimplePronunciationAssessor:
-    """Main pronunciation assessor supporting both normal (Whisper) and advanced (Wav2Vec2) modes
-    Backward compatible wrapper for EnhancedPronunciationAssessor"""

-    def __init__(self):
-        print("Initializing Simple Pronunciation Assessor...")
-        self.enhanced_assessor = EnhancedPronunciationAssessor()
-        print("Simple Pronunciation Assessor initialization completed")

-    def assess_pronunciation(
-        self, audio_path: str, reference_text: str, mode: str = "normal"
-    ) -> Dict:
-        """
-        Backward compatible assessment function with mode selection

-        Args:
-            audio_path: Path to audio file
-            reference_text: Reference text to compare
-            mode: 'normal' (Whisper), 'advanced' (Wav2Vec2), or 'auto' (determined by text length)

-        Output: Word highlights + Phoneme differences + Wrong words
-        """
-        print(f"Starting pronunciation assessment in {mode} mode...")

-        # Map old modes to new modes for backward compatibility
-        mode_mapping = {
-            "normal": "auto",
-            "advanced": "auto"
-        }

-        # Validate and map mode parameter
-        if mode in mode_mapping:
-            new_mode = mode_mapping[mode]
-            print(f"Mapping old mode '{mode}' to new mode '{new_mode}' for backward compatibility")
-        elif mode in ["word", "sentence", "auto"]:
-            new_mode = mode
-        else:
-            # Default to auto for any invalid mode
-            new_mode = "auto"
-            print(f"Invalid mode '{mode}' provided, defaulting to 'auto'")

-        # Use the enhanced assessor
-        result = self.enhanced_assessor.assess_pronunciation(
-            audio_path, reference_text, new_mode
-        )

-        # Filter result to maintain backward compatibility
-        compatible_result = {
-            "transcript": result["transcript"],
-            "transcript_phonemes": result["transcript_phonemes"],
-            "user_phonemes": result["user_phonemes"],
-            "character_transcript": result["character_transcript"],
-            "overall_score": result["overall_score"],
-            "word_highlights": result["word_highlights"],
-            "phoneme_differences": result["phoneme_differences"],
-            "wrong_words": result["wrong_words"],
-            "feedback": result["feedback"],
-            "processing_info": result["processing_info"],
-        }

-        # Add new fields if they exist (for newer clients)
-        if "reference_phonemes" in result:
-            compatible_result["reference_phonemes"] = result["reference_phonemes"]
-        if "phoneme_pairs" in result:
-            compatible_result["phoneme_pairs"] = result["phoneme_pairs"]
-        if "phoneme_comparison" in result:
-            compatible_result["phoneme_comparison"] = result["phoneme_comparison"]
-        if "prosody_analysis" in result:
-            compatible_result["prosody_analysis"] = result["prosody_analysis"]

-        print("Assessment completed successfully")
-        return compatible_result

-    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
-        """Calculate overall pronunciation score"""
-        if not phoneme_comparisons:
-            return 0.0

-        total_score = sum(comparison["score"] for comparison in phoneme_comparisons)
-        return total_score / len(phoneme_comparisons)


-class EnhancedPronunciationAssessor:
-    """Enhanced pronunciation assessor with word mode and sentence mode support"""

-    def __init__(self):
-        print("Initializing Enhanced Pronunciation Assessor...")
-        self.wav2vec2_asr = Wav2Vec2CharacterASR()  # Advanced mode
-        self.whisper_asr = None  # Normal mode
-        self.word_analyzer = WordAnalyzer()
-        self.feedback_generator = SimpleFeedbackGenerator()
-        self.g2p = SimpleG2P()
-        self.comparator = PhonemeComparator()
-        print("Enhanced Pronunciation Assessor initialization completed")
-
-    def assess_pronunciation(
-        self, audio_path: str, reference_text: str, mode: str = "auto"
-    ) -> Dict:
         """
-        Enhanced assessment function with mode selection
-
         Args:
             audio_path: Path to audio file
-            reference_text: Reference text to compare
-            mode: 'word', 'sentence', or 'auto' (automatically determined based on text length)
-
         Returns:
-            Enhanced assessment results with prosody analysis for sentence mode
         """
-        print(f"Starting enhanced pronunciation assessment in {mode} mode...")

-        # Validate and normalize mode parameter
-        valid_modes = ["word", "sentence", "auto"]
-        if mode not in valid_modes:
-            print(f"Invalid mode '{mode}' provided, defaulting to 'auto'")
-            mode = "auto"

-        # Determine mode based on text length if auto
-        if mode == "auto":
-            word_count = len(reference_text.strip().split())
-            mode = "word" if word_count <= 3 else "sentence"
-            print(f"Auto-selected mode: {mode} (word count: {word_count})")
-
-        # Step 1: Transcription using Wav2Vec2 character model
-        print("Step 1: Using Wav2Vec2 character transcription...")
-        asr_result = self.wav2vec2_asr.transcribe_to_characters(audio_path)
-        model_info = f"Wav2Vec2-Character ({self.wav2vec2_asr.model})"
-
-        character_transcript = asr_result["character_transcript"]
-        phoneme_representation = asr_result["phoneme_representation"]
-
-        print(f"Character transcript: {character_transcript}")
-        print(f"Phoneme representation: {phoneme_representation}")
-
-        # Step 2: Word analysis using phoneme representation
-        print("Step 2: Analyzing words...")
-        analysis_result = self.word_analyzer.analyze_words(
-            reference_text, phoneme_representation
-        )
-
-        # Step 3: Calculate overall score
-        phoneme_comparisons = analysis_result["phoneme_differences"]
-        overall_score = self._calculate_overall_score(phoneme_comparisons)
-
-        # Step 4: Generate feedback
-        print("Step 3: Generating feedback...")
-        feedback = self.feedback_generator.generate_feedback(
-            overall_score, analysis_result["wrong_words"], phoneme_comparisons
-        )
-
-        # Step 5: Enhanced phoneme comparison using Levenshtein distance
-        print("Step 4: Performing advanced phoneme comparison...")
-        reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
-        enhanced_comparisons = self._enhanced_phoneme_comparison(
-            reference_phoneme_string, phoneme_representation
-        )
-
-        # Step 6: Prosody analysis for sentence mode
-        prosody_analysis = {}
-        if mode == "sentence":
-            print("Step 5: Performing prosody analysis...")
-            prosody_analysis = self._analyze_prosody(audio_path, reference_text)

-        # Step 7: Create phoneme pairs for visualization
-        phoneme_pairs = self._create_phoneme_pairs(
-            reference_phoneme_string, phoneme_representation
-        )

-        # Step 8: Create phoneme comparison summary
-        phoneme_comparison_summary = self._create_phoneme_comparison_summary(
-            phoneme_pairs
-        )
-
-        result = {
-            "transcript": character_transcript,  # What user actually said
-            "transcript_phonemes": phoneme_representation,
-            "user_phonemes": phoneme_representation,  # Alias for UI clarity
-            "character_transcript": character_transcript,
-            "overall_score": overall_score,
-            "word_highlights": analysis_result["word_highlights"],
-            "phoneme_differences": enhanced_comparisons,
-            "wrong_words": analysis_result["wrong_words"],
-            "feedback": feedback,
-            "processing_info": {
-                "model_used": model_info,
-                "mode": mode,
-                "character_based": True,
-                "language_model_correction": False,
-                "raw_output": True,
-            },
-            # Enhanced features
-            "reference_phonemes": reference_phoneme_string,
-            "phoneme_pairs": phoneme_pairs,
-            "phoneme_comparison": phoneme_comparison_summary,
-            "prosody_analysis": prosody_analysis,
         }
-
-        print("Enhanced assessment completed successfully")
-        return result
-
-    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
-        """Calculate overall pronunciation score"""
-        if not phoneme_comparisons:
-            return 0.0
-
-        total_score = sum(comparison["score"] for comparison in phoneme_comparisons)
-        return total_score / len(phoneme_comparisons)
-
-    def _enhanced_phoneme_comparison(self, reference: str, learner: str) -> List[Dict]:
-        """Enhanced phoneme comparison using Levenshtein distance"""
-        import difflib

-        # Split phoneme strings
-        ref_phones = reference.split()
-        learner_phones = learner.split()

-        # Use SequenceMatcher for alignment
-        matcher = difflib.SequenceMatcher(None, ref_phones, learner_phones)
-        comparisons = []

-        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-            if tag == 'equal':
-                # Correct phonemes
-                for k in range(i2 - i1):
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": ref_phones[i1 + k],
-                        "learner_phoneme": learner_phones[j1 + k],
-                        "status": "correct",
-                        "score": 1.0,
-                        "difficulty": self.comparator.difficulty_map.get(ref_phones[i1 + k], 0.3),
-                    })
-            elif tag == 'delete':
-                # Missing phonemes
-                for k in range(i1, i2):
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": ref_phones[k],
-                        "learner_phoneme": "",
-                        "status": "missing",
-                        "score": 0.0,
-                        "difficulty": self.comparator.difficulty_map.get(ref_phones[k], 0.3),
-                    })
-            elif tag == 'insert':
-                # Extra phonemes
-                for k in range(j1, j2):
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": "",
-                        "learner_phoneme": learner_phones[k],
-                        "status": "extra",
-                        "score": 0.0,
-                        "difficulty": 0.3,
-                    })
-            elif tag == 'replace':
-                # Substituted phonemes
-                max_len = max(i2 - i1, j2 - j1)
-                for k in range(max_len):
-                    ref_phoneme = ref_phones[i1 + k] if i1 + k < i2 else ""
-                    learner_phoneme = learner_phones[j1 + k] if j1 + k < j2 else ""
-
-                    if ref_phoneme and learner_phoneme:
-                        # Both present - check if substitution is acceptable
-                        if self.comparator._is_acceptable_substitution(ref_phoneme, learner_phoneme):
-                            status = "acceptable"
-                            score = 0.7
-                        else:
-                            status = "wrong"
-                            score = 0.2
-                    elif ref_phoneme and not learner_phoneme:
-                        status = "missing"
-                        score = 0.0
-                    elif learner_phoneme and not ref_phoneme:
-                        status = "extra"
-                        score = 0.0
-                    else:
-                        continue
-
-                    comparisons.append({
-                        "position": len(comparisons),
-                        "reference_phoneme": ref_phoneme,
-                        "learner_phoneme": learner_phoneme,
-                        "status": status,
-                        "score": score,
-                        "difficulty": self.comparator.difficulty_map.get(ref_phoneme, 0.3),
-                    })

-        return comparisons

-    def _create_phoneme_pairs(self, reference: str, learner: str) -> List[Dict]:
-        """Create phoneme pairs for visualization"""
-        ref_phones = reference.split()
-        learner_phones = learner.split()

-        # Use SequenceMatcher for alignment
-        import difflib
-        matcher = difflib.SequenceMatcher(None, ref_phones, learner_phones)

-        pairs = []
-        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-            if tag == 'equal':
-                for k in range(i2 - i1):
-                    pairs.append({
-                        "reference": ref_phones[i1 + k],
-                        "learner": learner_phones[j1 + k],
-                        "match": True,
-                        "type": "correct"
-                    })
-            elif tag == 'replace':
-                max_len = max(i2 - i1, j2 - j1)
-                for k in range(max_len):
-                    ref_phoneme = ref_phones[i1 + k] if i1 + k < i2 else ""
-                    learner_phoneme = learner_phones[j1 + k] if j1 + k < j2 else ""
-                    pairs.append({
-                        "reference": ref_phoneme,
-                        "learner": learner_phoneme,
-                        "match": False,
-                        "type": "substitution"
-                    })
-            elif tag == 'delete':
-                for k in range(i1, i2):
-                    pairs.append({
-                        "reference": ref_phones[k],
-                        "learner": "",
-                        "match": False,
-                        "type": "deletion"
-                    })
-            elif tag == 'insert':
-                for k in range(j1, j2):
-                    pairs.append({
-                        "reference": "",
-                        "learner": learner_phones[k],
-                        "match": False,
-                        "type": "insertion"
-                    })

-        return pairs

     def _create_phoneme_comparison_summary(self, phoneme_pairs: List[Dict]) -> Dict:
-        """Create a summary of phoneme comparison statistics"""
         total = len(phoneme_pairs)
         correct = sum(1 for pair in phoneme_pairs if pair["match"])
         substitutions = sum(1 for pair in phoneme_pairs if pair["type"] == "substitution")
         deletions = sum(1 for pair in phoneme_pairs if pair["type"] == "deletion")
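Both deleted helpers above drive their classification off `difflib.SequenceMatcher` opcodes; a minimal standalone illustration (the phoneme strings are invented for the example):

    import difflib

    ref = "ð ɪ s".split()   # reference phonemes for "this"
    hyp = "d ɪ s".split()   # ð -> d, a substitution the comparator treats as acceptable
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, ref, hyp).get_opcodes():
        print(tag, ref[i1:i2], hyp[j1:j2])
    # replace ['ð'] ['d']
    # equal ['ɪ', 's'] ['ɪ', 's']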
@@ -1201,81 +1265,190 @@ class EnhancedPronunciationAssessor:
1201
  "substitutions": substitutions,
1202
  "deletions": deletions,
1203
  "insertions": insertions,
1204
- "accuracy_percentage": (correct / total * 100) if total > 0 else 0,
1205
- "error_rate": ((substitutions + deletions + insertions) / total * 100) if total > 0 else 0
1206
  }
1207
 
-    def _analyze_prosody(self, audio_path: str, reference_text: str) -> Dict:
-        """Analyze prosody features (pitch, rhythm, intensity)"""
-        try:
-            # Load audio file
-            import librosa
-            y, sr = librosa.load(audio_path, sr=16000)
-
-            # Extract prosodic features
-            # Pitch analysis
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-            pitch_values = []
-            for i in range(pitches.shape[1]):
-                index = magnitudes[:, i].argmax()
-                pitch = pitches[index, i]
-                if pitch > 0:  # Only consider non-zero pitch values
-                    pitch_values.append(pitch)
-
-            avg_pitch = float(np.mean(pitch_values)) if pitch_values else 0.0
-            pitch_variability = float(np.std(pitch_values)) if pitch_values else 0.0
-
-            # Rhythm analysis (using zero-crossing rate as a proxy)
-            zcr = librosa.feature.zero_crossing_rate(y)
-            avg_zcr = float(np.mean(zcr))
-
-            # Intensity analysis (RMS energy)
-            rms = librosa.feature.rms(y=y)
-            avg_rms = float(np.mean(rms))
-
-            # Calculate speaking rate (words per minute)
-            duration = len(y) / sr  # in seconds
-            word_count = len(reference_text.split())
-            speaking_rate = (word_count / duration) * 60 if duration > 0 else 0  # words per minute
-
-            # Provide feedback based on prosodic features
-            prosody_feedback = []
-            if speaking_rate < 100:
-                prosody_feedback.append("Speaking rate is quite slow. Try to speak at a more natural pace.")
-            elif speaking_rate > 200:
-                prosody_feedback.append("Speaking rate is quite fast. Try to slow down for better clarity.")
-            else:
-                prosody_feedback.append("Speaking rate is good.")
-
-            if pitch_variability < 50:
-                prosody_feedback.append("Pitch variability is low. Try to use more intonation to make speech more expressive.")
-            else:
-                prosody_feedback.append("Good pitch variability, which makes speech more engaging.")
-
-            return {
-                "pitch": {
-                    "average": avg_pitch,
-                    "variability": pitch_variability
-                },
-                "rhythm": {
-                    "zero_crossing_rate": avg_zcr
-                },
-                "intensity": {
-                    "rms_energy": avg_rms
-                },
-                "speaking_rate": {
-                    "words_per_minute": speaking_rate,
-                    "duration_seconds": duration
-                },
-                "feedback": prosody_feedback
            }
-        except Exception as e:
-            print(f"Prosody analysis error: {e}")
-            return {
-                "error": f"Prosody analysis failed: {str(e)}",
-                "pitch": {"average": 0, "variability": 0},
-                "rhythm": {"zero_crossing_rate": 0},
-                "intensity": {"rms_energy": 0},
-                "speaking_rate": {"words_per_minute": 0, "duration_seconds": 0},
-                "feedback": ["Prosody analysis unavailable"]
            }
+ from typing import List, Dict, Tuple, Optional
  import numpy as np
  import librosa
  import nltk
  import eng_to_ipa as ipa
  import re
  from collections import defaultdict
  from loguru import logger
  import time
+ import Levenshtein
+ from dataclasses import dataclass
+ from enum import Enum
  from src.AI_Models.wave2vec_inference import (
      Wave2Vec2Inference,
      Wave2Vec2ONNXInference,
+     OptimizedWave2Vec2Factory,
      export_to_onnx,
  )
 
      print("Warning: NLTK data not available")
 
 
+ class AssessmentMode(Enum):
+     WORD = "word"
+     SENTENCE = "sentence"
+     AUTO = "auto"
+
+
+ class ErrorType(Enum):
+     CORRECT = "correct"
+     SUBSTITUTION = "substitution"
+     DELETION = "deletion"
+     INSERTION = "insertion"
+     ACCEPTABLE = "acceptable"
+
+
+ @dataclass
+ class CharacterError:
+     """Character-level error information for UI mapping"""
+     character: str
+     position: int
+     error_type: str
+     expected_sound: str
+     actual_sound: str
+     severity: float
+     color: str
+
+
+ class EnhancedWav2Vec2CharacterASR:
+     """Enhanced Wav2Vec2 ASR with prosody analysis support"""
 
      def __init__(
          self,
          onnx: bool = False,
          quantized: bool = False,
      ):
          self.use_onnx = onnx
          self.sample_rate = 16000
          self.model_name = model_name
+
          if onnx:
              import os
+             model_path = f"wav2vec2-large-960h-lv60-self{'.quant' if quantized else ''}.onnx"
+             if not os.path.exists(model_path):
                  export_to_onnx(model_name, quantize=quantized)
+
+         # Use factory to create safe inference instance
+         self.model = OptimizedWave2Vec2Factory.create_optimized_inference(
+             model_name,
+             onnx_path=model_path if onnx else None,
+             safe_mode=True  # Use safe mode to avoid optimization issues
          )
 
+     def transcribe_with_features(self, audio_path: str) -> Dict:
+         """Enhanced transcription with audio features for prosody analysis"""
          try:
              start_time = time.time()
+
+             # Basic transcription
              character_transcript = self.model.file_to_text(audio_path)
+             character_transcript = self._clean_character_transcript(character_transcript)
+
+             # Convert to phonemes
+             phoneme_representation = self._characters_to_phoneme_representation(character_transcript)
+
+             # Extract audio features for prosody
+             audio_features = self._extract_enhanced_audio_features(audio_path)
+
+             logger.info(f"Enhanced transcription time: {time.time() - start_time:.2f}s")
+
              return {
                  "character_transcript": character_transcript,
+                 "phoneme_representation": phoneme_representation,
+                 "audio_features": audio_features,
+                 "confidence": self._estimate_confidence(character_transcript)
              }
+
          except Exception as e:
+             logger.error(f"Enhanced ASR error: {e}")
              return self._empty_result()
 
+     def _extract_enhanced_audio_features(self, audio_path: str) -> Dict:
+         """Extract comprehensive audio features for prosody analysis"""
+         try:
+             y, sr = librosa.load(audio_path, sr=self.sample_rate)
+             duration = len(y) / sr
+
+             # Pitch analysis
+             pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
+             pitch_values = []
+             for t in range(pitches.shape[1]):
+                 index = magnitudes[:, t].argmax()
+                 pitch = pitches[index, t]
+                 if pitch > 0:
+                     pitch_values.append(pitch)
+
+             # Rhythm and timing features
+             tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
+
+             # Intensity features
+             rms = librosa.feature.rms(y=y)[0]
+             zcr = librosa.feature.zero_crossing_rate(y)[0]
+
+             # Spectral features
+             spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
+
+             return {
+                 "duration": duration,
+                 "pitch": {
+                     "values": pitch_values,
+                     "mean": np.mean(pitch_values) if pitch_values else 0,
+                     "std": np.std(pitch_values) if pitch_values else 0,
+                     "range": np.max(pitch_values) - np.min(pitch_values) if pitch_values else 0,
+                     "cv": np.std(pitch_values) / np.mean(pitch_values) if pitch_values and np.mean(pitch_values) > 0 else 0
+                 },
+                 "rhythm": {
+                     "tempo": tempo,
+                     "beats_per_second": len(beats) / duration if duration > 0 else 0
+                 },
+                 "intensity": {
+                     "rms_mean": np.mean(rms),
+                     "rms_std": np.std(rms),
+                     "zcr_mean": np.mean(zcr)
+                 },
+                 "spectral": {
+                     "centroid_mean": np.mean(spectral_centroids),
+                     "centroid_std": np.std(spectral_centroids)
+                 }
+             }
+
+         except Exception as e:
+             logger.error(f"Audio feature extraction error: {e}")
+             return {"duration": 0, "error": str(e)}
 
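For reference, a minimal standalone sketch (not part of the commit) of the piptrack-based pitch tracking used above; the file name "sample.wav" is a placeholder:

import librosa
import numpy as np

y, sr = librosa.load("sample.wav", sr=16000)  # placeholder path

# piptrack returns (freqs, magnitudes), each shaped (n_bins, n_frames);
# taking the strongest bin per frame yields a single pitch track.
pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
track = [pitches[magnitudes[:, t].argmax(), t] for t in range(pitches.shape[1])]
voiced = [p for p in track if p > 0]  # zero marks unvoiced/undetected frames
print(f"mean pitch: {np.mean(voiced):.1f} Hz" if voiced else "no voiced frames")
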
      def _clean_character_transcript(self, transcript: str) -> str:
          """Clean and standardize character transcript"""
          logger.info(f"Raw transcript before cleaning: {transcript}")
+         cleaned = re.sub(r'\s+', ' ', transcript)
+         return cleaned.strip().lower()
 
      def _characters_to_phoneme_representation(self, text: str) -> str:
+         """Convert character-based transcript to phoneme representation"""
          if not text:
              return ""
+
          words = text.split()
          phoneme_words = []
+         g2p = EnhancedG2P()
+
          for word in words:
              try:
                  if g2p:
+                     word_phonemes = g2p.word_to_phonemes(word)
+                     phoneme_words.extend(word_phonemes)
                  else:
                      phoneme_words.extend(self._simple_letter_to_phoneme(word))
              except:
                  phoneme_words.extend(self._simple_letter_to_phoneme(word))
+
          return " ".join(phoneme_words)
 
      def _simple_letter_to_phoneme(self, word: str) -> List[str]:
+         """Fallback letter-to-phoneme conversion"""
          letter_to_phoneme = {
+             "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f",
+             "g": "ɡ", "h": "h", "i": "ɪ", "j": "dʒ", "k": "k", "l": "l",
+             "m": "m", "n": "n", "o": "ʌ", "p": "p", "q": "k", "r": "r",
+             "s": "s", "t": "t", "u": "ʌ", "v": "v", "w": "w", "x": "ks",
+             "y": "j", "z": "z"
          }
+
+         return [letter_to_phoneme.get(letter, letter) for letter in word.lower() if letter in letter_to_phoneme]
 
+     def _estimate_confidence(self, transcript: str) -> float:
+         """Estimate transcription confidence"""
+         if not transcript or len(transcript.strip()) < 2:
+             return 0.0
+
+         repeated_chars = len(re.findall(r'(.)\1{2,}', transcript))
+         return max(0.0, 1.0 - (repeated_chars * 0.2))
 
      def _empty_result(self) -> Dict:
+         """Empty result for error cases"""
          return {
              "character_transcript": "",
              "phoneme_representation": "",
+             "audio_features": {"duration": 0},
+             "confidence": 0.0
          }
 
+ class EnhancedG2P:
+     """Enhanced Grapheme-to-Phoneme converter with visualization support"""
 
      def __init__(self):
          try:
              self.cmu_dict = cmudict.dict()
          except:
              self.cmu_dict = {}
+             logger.warning("CMU dictionary not available")
+
+         # Vietnamese speaker substitution patterns (enhanced)
+         self.vn_substitutions = {
+             "θ": ["f", "s", "t", "d"],
+             "ð": ["d", "z", "v", "t"],
+             "v": ["w", "f", "b"],
+             "w": ["v", "b"],
+             "r": ["l", "n"],
+             "l": ["r", "n"],
+             "z": ["s", "j"],
+             "ʒ": ["ʃ", "z", "s"],
+             "ʃ": ["s", "ʒ"],
+             "ŋ": ["n", "m"],
+             "tʃ": ["ʃ", "s", "k"],
+             "dʒ": ["ʒ", "j", "g"],
+             "æ": ["ɛ", "a"],
+             "ɪ": ["i"],
+             "ʊ": ["u"]
+         }
+
+         # Difficulty scores for Vietnamese speakers
+         self.difficulty_scores = {
+             "θ": 0.9, "ð": 0.9, "v": 0.8, "z": 0.8, "ʒ": 0.9,
+             "r": 0.7, "l": 0.6, "w": 0.5, "æ": 0.7, "ɪ": 0.6,
+             "ʊ": 0.6, "ŋ": 0.3, "f": 0.2, "s": 0.2, "ʃ": 0.5,
+             "tʃ": 0.4, "dʒ": 0.5
+         }
+
+     def word_to_phonemes(self, word: str) -> List[str]:
+         """Convert word to phoneme list"""
+         word_lower = word.lower().strip()
+
+         if word_lower in self.cmu_dict:
+             cmu_phonemes = self.cmu_dict[word_lower][0]
+             return self._convert_cmu_to_ipa(cmu_phonemes)
+         else:
+             return self._estimate_phonemes(word_lower)
+
+     def get_phoneme_string(self, text: str) -> str:
+         """Get space-separated phoneme string"""
+         words = self._clean_text(text).split()
+         all_phonemes = []
+
+         for word in words:
+             if word:
+                 phonemes = self.word_to_phonemes(word)
+                 all_phonemes.extend(phonemes)
+
+         return " ".join(all_phonemes)
 
      def text_to_phonemes(self, text: str) -> List[Dict]:
+         """Convert text to phoneme sequence with visualization data"""
          words = self._clean_text(text).split()
          phoneme_sequence = []
 
          for word in words:
+             word_phonemes = self.word_to_phonemes(word)
+             phoneme_sequence.append({
+                 "word": word,
+                 "phonemes": word_phonemes,
+                 "ipa": self._get_ipa(word),
+                 "phoneme_string": " ".join(word_phonemes),
+                 "visualization": self._create_phoneme_visualization(word_phonemes)
+             })
 
          return phoneme_sequence
 
+     def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
+         """Convert CMU phonemes to IPA"""
+         cmu_to_ipa = {
+             "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ",
+             "AY": "aɪ", "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ",
+             "IY": "i", "OW": "oʊ", "OY": "ɔɪ", "UH": "ʊ", "UW": "u",
+             "B": "b", "CH": "tʃ", "D": "d", "DH": "ð", "F": "f",
+             "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k", "L": "l",
+             "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
+             "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v",
+             "W": "w", "Y": "j", "Z": "z", "ZH": "ʒ"
+         }
+
+         ipa_phonemes = []
          for phoneme in cmu_phonemes:
+             clean_phoneme = re.sub(r'[0-9]', '', phoneme)
+             ipa_phoneme = cmu_to_ipa.get(clean_phoneme, clean_phoneme.lower())
+             ipa_phonemes.append(ipa_phoneme)
+
+         return ipa_phonemes
 
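A concrete example of the table above: CMU entries attach stress digits (0/1/2) to vowels, which the regex strips before lookup. cmudict lists "think" as TH IH1 NG K, so the conversion proceeds like this (standalone sketch, excerpting the full mapping):

import re

cmu_to_ipa = {"TH": "θ", "IH": "ɪ", "NG": "ŋ", "K": "k"}  # excerpt of the table above

def to_ipa(cmu_phonemes):
    # Drop the stress digit, then map; unknown symbols fall back to lowercase.
    return [cmu_to_ipa.get(re.sub(r'[0-9]', '', p), p.lower()) for p in cmu_phonemes]

print(to_ipa(["TH", "IH1", "NG", "K"]))  # ['θ', 'ɪ', 'ŋ', 'k']
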
      def _estimate_phonemes(self, word: str) -> List[str]:
          """Estimate phonemes for unknown words"""
          phoneme_map = {
+             "ch": "tʃ", "sh": "ʃ", "th": "θ", "ph": "f", "ck": "k",
+             "ng": "ŋ", "qu": "kw", "a": "æ", "e": "ɛ", "i": "ɪ",
+             "o": "ʌ", "u": "ʌ", "b": "b", "c": "k", "d": "d",
+             "f": "f", "g": "ɡ", "h": "h", "j": "dʒ", "k": "k",
+             "l": "l", "m": "m", "n": "n", "p": "p", "r": "r",
+             "s": "s", "t": "t", "v": "v", "w": "w", "x": "ks",
+             "y": "j", "z": "z"
          }
+
          phonemes = []
          i = 0
          while i < len(word):
              if i <= len(word) - 2:
+                 two_char = word[i:i+2]
                  if two_char in phoneme_map:
+                     phonemes.append(phoneme_map[two_char])
                      i += 2
                      continue
+
              char = word[i]
              if char in phoneme_map:
+                 phonemes.append(phoneme_map[char])
              i += 1
+
          return phonemes
 
+     def _clean_text(self, text: str) -> str:
+         """Clean text for processing"""
+         text = re.sub(r"[^\w\s']", " ", text)
+         text = re.sub(r'\s+', ' ', text)
+         return text.lower().strip()
 
+     def _get_ipa(self, word: str) -> str:
+         """Get IPA transcription"""
+         try:
+             return ipa.convert(word)
+         except:
+             return f"/{word}/"
 
      def _create_phoneme_visualization(self, phonemes: List[str]) -> List[Dict]:
          """Create visualization data for phonemes"""
          visualization = []
          for phoneme in phonemes:
              color_category = self._get_phoneme_color_category(phoneme)
              visualization.append({
                  "phoneme": phoneme,
                  "color_category": color_category,
+                 "description": self._get_phoneme_description(phoneme),
+                 "difficulty": self.difficulty_scores.get(phoneme, 0.3)
              })
          return visualization
 
      def _get_phoneme_color_category(self, phoneme: str) -> str:
          """Categorize phonemes by color for visualization"""
          vowel_phonemes = {"ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u"}
+         difficult_consonants = {"θ", "ð", "v", "z", "ʒ", "r", "w"}
 
          if phoneme in vowel_phonemes:
              return "vowel"
+         elif phoneme in difficult_consonants:
+             return "difficult"
          else:
+             return "consonant"
 
      def _get_phoneme_description(self, phoneme: str) -> str:
          """Get description for a phoneme"""
          descriptions = {
              "θ": "Voiceless dental fricative (like 'th' in 'think')",
              "ð": "Voiced dental fricative (like 'th' in 'this')",
+             "v": "Voiced labiodental fricative (like 'v' in 'van')",
              "z": "Voiced alveolar fricative (like 'z' in 'zip')",
              "ʒ": "Voiced postalveolar fricative (like 's' in 'measure')",
              "r": "Alveolar approximant (like 'r' in 'red')",
              "w": "Labial-velar approximant (like 'w' in 'wet')",
+             "æ": "Near-open front unrounded vowel (like 'a' in 'cat')",
+             "ɪ": "Near-close near-front unrounded vowel (like 'i' in 'sit')",
+             "ʊ": "Near-close near-back rounded vowel (like 'u' in 'put')"
          }
          return descriptions.get(phoneme, f"Phoneme: {phoneme}")
 
+     def is_acceptable_substitution(self, reference: str, predicted: str) -> bool:
+         """Check if substitution is acceptable for Vietnamese speakers"""
+         acceptable = self.vn_substitutions.get(reference, [])
+         return predicted in acceptable
 
+     def get_difficulty_score(self, phoneme: str) -> float:
+         """Get difficulty score for phoneme"""
+         return self.difficulty_scores.get(phoneme, 0.3)
 
+ class AdvancedPhonemeComparator:
+     """Enhanced phoneme comparator using Levenshtein distance"""
 
+     def __init__(self):
+         self.g2p = EnhancedG2P()
 
+     def compare_with_levenshtein(self, reference: str, predicted: str) -> List[Dict]:
+         """Compare phonemes using Levenshtein distance for accurate alignment"""
+         ref_phones = reference.split() if reference else []
+         pred_phones = predicted.split() if predicted else []
+
+         if not ref_phones:
+             return []
+
+         # Use Levenshtein editops for precise alignment
+         ops = Levenshtein.editops(ref_phones, pred_phones)
+
          comparisons = []
+         ref_idx = 0
+         pred_idx = 0
+
+         # Process equal parts first
+         for op_type, ref_pos, pred_pos in ops:
+             # Add equal characters before this operation
+             while ref_idx < ref_pos and pred_idx < pred_pos:
+                 comparison = self._create_comparison(
+                     ref_phones[ref_idx], pred_phones[pred_idx],
+                     ErrorType.CORRECT, 1.0, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 ref_idx += 1
+                 pred_idx += 1
+
+             # Process the operation
+             if op_type == 'replace':
+                 ref_phoneme = ref_phones[ref_pos]
+                 pred_phoneme = pred_phones[pred_pos]
+
+                 if self.g2p.is_acceptable_substitution(ref_phoneme, pred_phoneme):
+                     error_type = ErrorType.ACCEPTABLE
                      score = 0.7
                  else:
+                     error_type = ErrorType.SUBSTITUTION
                      score = 0.2
+
+                 comparison = self._create_comparison(
+                     ref_phoneme, pred_phoneme, error_type, score, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 ref_idx = ref_pos + 1
+                 pred_idx = pred_pos + 1
+
+             elif op_type == 'delete':
+                 comparison = self._create_comparison(
+                     ref_phones[ref_pos], "", ErrorType.DELETION, 0.0, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 ref_idx = ref_pos + 1
+
+             elif op_type == 'insert':
+                 comparison = self._create_comparison(
+                     "", pred_phones[pred_pos], ErrorType.INSERTION, 0.0, len(comparisons)
+                 )
+                 comparisons.append(comparison)
+                 pred_idx = pred_pos + 1
+
+         # Add remaining equal characters
+         while ref_idx < len(ref_phones) and pred_idx < len(pred_phones):
+             comparison = self._create_comparison(
+                 ref_phones[ref_idx], pred_phones[pred_idx],
+                 ErrorType.CORRECT, 1.0, len(comparisons)
+             )
              comparisons.append(comparison)
+             ref_idx += 1
+             pred_idx += 1
+
          return comparisons
 
+     def _create_comparison(self, ref_phoneme: str, pred_phoneme: str,
+                            error_type: ErrorType, score: float, position: int) -> Dict:
+         """Create comparison dictionary"""
+         return {
+             "position": position,
+             "reference_phoneme": ref_phoneme,
+             "learner_phoneme": pred_phoneme,
+             "status": error_type.value,
+             "score": score,
+             "difficulty": self.g2p.get_difficulty_score(ref_phoneme),
+             "error_type": error_type.value
+         }
 
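To see how editops drives the alignment above, here is a standalone sketch (not part of the commit). It assumes the rapidfuzz-backed Levenshtein package, whose editops accepts sequences such as lists of phoneme strings - the same assumption the comparator itself makes:

import Levenshtein

ref = "θ ɪ ŋ k".split()   # reference phonemes for "think"
pred = "t ɪ n k".split()  # a typical Vietnamese-accented attempt

# Each op is (op_type, ref_pos, pred_pos); equal spans are implicit between ops.
for op_type, ref_pos, pred_pos in Levenshtein.editops(ref, pred):
    ref_p = ref[ref_pos] if op_type != 'insert' else '-'
    pred_p = pred[pred_pos] if op_type != 'delete' else '-'
    print(op_type, ref_p, '->', pred_p)
# Two 'replace' ops (θ -> t, ŋ -> n); ɪ and k align as equal spans.

Both substitutions here are listed in vn_substitutions, so the comparator would mark them ACCEPTABLE at score 0.7 rather than SUBSTITUTION at 0.2.
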
+ class EnhancedWordAnalyzer:
+     """Enhanced word analyzer with character-level error mapping"""
 
      def __init__(self):
+         self.g2p = EnhancedG2P()
+         self.comparator = AdvancedPhonemeComparator()
 
+     def analyze_words_enhanced(self, reference_text: str, learner_phonemes: str,
+                                mode: AssessmentMode) -> Dict:
+         """Enhanced word analysis with character-level mapping"""
+
          # Get reference phonemes by word
          reference_words = self.g2p.text_to_phonemes(reference_text)
+
+         # Get overall phoneme comparison using Levenshtein
+         reference_phoneme_string = self.g2p.get_phoneme_string(reference_text)
+         phoneme_comparisons = self.comparator.compare_with_levenshtein(
              reference_phoneme_string, learner_phonemes
          )
+
+         # Create enhanced word highlights
+         word_highlights = self._create_enhanced_word_highlights(
+             reference_words, phoneme_comparisons, mode
          )
+
+         # Identify wrong words with character-level errors
+         wrong_words = self._identify_wrong_words_enhanced(word_highlights, phoneme_comparisons)
+
          return {
              "word_highlights": word_highlights,
              "phoneme_differences": phoneme_comparisons,
              "wrong_words": wrong_words,
+             "reference_phonemes": reference_phoneme_string,
+             "phoneme_pairs": self._create_phoneme_pairs(reference_phoneme_string, learner_phonemes)
          }
 
+     def _create_enhanced_word_highlights(self, reference_words: List[Dict],
+                                          phoneme_comparisons: List[Dict],
+                                          mode: AssessmentMode) -> List[Dict]:
+         """Create enhanced word highlights with character-level error mapping"""
+
          word_highlights = []
          phoneme_index = 0
 
          for word_data in reference_words:
              word = word_data["word"]
              num_phonemes = len(word_data["phonemes"])
 
              # Get phoneme scores for this word
              word_phoneme_scores = []
+             word_comparisons = []
+
              for j in range(num_phonemes):
                  if phoneme_index + j < len(phoneme_comparisons):
                      comparison = phoneme_comparisons[phoneme_index + j]
                      word_phoneme_scores.append(comparison["score"])
+                     word_comparisons.append(comparison)
 
              # Calculate word score
              word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0
 
+             # Map phoneme errors to character positions (enhanced for word mode)
+             character_errors = []
+             if mode == AssessmentMode.WORD:
+                 character_errors = self._map_phonemes_to_characters(word, word_comparisons)
+
+             # Create enhanced word highlight
              highlight = {
                  "word": word,
                  "score": float(word_score),
                  "phoneme_scores": word_phoneme_scores,
                  "phoneme_start_index": phoneme_index,
                  "phoneme_end_index": phoneme_index + num_phonemes - 1,
+                 "phoneme_visualization": word_data["visualization"],
+                 "character_errors": character_errors,  # New feature
+                 "detailed_analysis": mode == AssessmentMode.WORD  # Flag for UI
              }
 
              word_highlights.append(highlight)
              phoneme_index += num_phonemes
 
          return word_highlights
 
+     def _map_phonemes_to_characters(self, word: str, phoneme_comparisons: List[Dict]) -> List[CharacterError]:
+         """Map phoneme errors to character positions in word"""
+         character_errors = []
+
+         # Simple mapping strategy: distribute phonemes across characters
+         if not phoneme_comparisons or not word:
+             return character_errors
+
+         chars_per_phoneme = len(word) / len(phoneme_comparisons)
+
+         for i, comparison in enumerate(phoneme_comparisons):
+             if comparison["status"] in ["substitution", "deletion", "wrong"]:
+                 # Calculate character position
+                 char_pos = min(int(i * chars_per_phoneme), len(word) - 1)
+
+                 severity = 1.0 - comparison["score"]
+                 color = self._get_error_color(severity)
+
+                 error = CharacterError(
+                     character=word[char_pos],
+                     position=char_pos,
+                     error_type=comparison["status"],
+                     expected_sound=comparison["reference_phoneme"],
+                     actual_sound=comparison["learner_phoneme"],
+                     severity=severity,
+                     color=color
+                 )
+                 character_errors.append(error)
+
+         return character_errors
+
+     def _get_error_color(self, severity: float) -> str:
+         """Get color code for character errors"""
+         if severity >= 0.8:
+             return "#ef4444"  # Red - severe error
+         elif severity >= 0.6:
+             return "#f97316"  # Orange - moderate error
+         elif severity >= 0.4:
+             return "#eab308"  # Yellow - mild error
+         else:
+             return "#84cc16"  # Light green - minor error
 
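The proportional mapping above can be sanity-checked in isolation; this sketch (not in the commit) shows where phoneme indices land for "think" (5 letters, 4 phonemes):

word = "think"
num_phonemes = 4  # θ ɪ ŋ k
chars_per_phoneme = len(word) / num_phonemes  # 1.25

for i in range(num_phonemes):
    char_pos = min(int(i * chars_per_phoneme), len(word) - 1)
    print(i, "->", char_pos, repr(word[char_pos]))
# 0 -> 0 't', 1 -> 1 'h', 2 -> 2 'i', 3 -> 3 'n'

Note the approximation: an error on /ŋ/ highlights 'i' rather than 'nk'. The strategy trades phonetic precision for a simple, always-valid character index.
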
+     def _identify_wrong_words_enhanced(self, word_highlights: List[Dict],
+                                        phoneme_comparisons: List[Dict]) -> List[Dict]:
+         """Enhanced wrong word identification with detailed error analysis"""
+
          wrong_words = []
 
          for word_highlight in word_highlights:
+             if word_highlight["score"] < 0.6:
                  start_idx = word_highlight["phoneme_start_index"]
                  end_idx = word_highlight["phoneme_end_index"]
 
                  wrong_phonemes = []
                  missing_phonemes = []
 
                  for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
                      comparison = phoneme_comparisons[i]
 
+                     if comparison["status"] in ["wrong", "substitution"]:
+                         wrong_phonemes.append({
+                             "expected": comparison["reference_phoneme"],
+                             "actual": comparison["learner_phoneme"],
+                             "difficulty": comparison["difficulty"],
+                             "description": self.g2p._get_phoneme_description(comparison["reference_phoneme"])
+                         })
+                     elif comparison["status"] in ["missing", "deletion"]:
+                         missing_phonemes.append({
+                             "phoneme": comparison["reference_phoneme"],
+                             "difficulty": comparison["difficulty"],
+                             "description": self.g2p._get_phoneme_description(comparison["reference_phoneme"])
+                         })
 
                  wrong_word = {
                      "word": word_highlight["word"],
                      "ipa": word_highlight["ipa"],
                      "wrong_phonemes": wrong_phonemes,
                      "missing_phonemes": missing_phonemes,
+                     "tips": self._get_enhanced_vietnamese_tips(wrong_phonemes, missing_phonemes),
+                     "phoneme_visualization": word_highlight["phoneme_visualization"],
+                     "character_errors": word_highlight.get("character_errors", [])
                  }
 
                  wrong_words.append(wrong_word)
 
          return wrong_words
 
+     def _create_phoneme_pairs(self, reference: str, learner: str) -> List[Dict]:
+         """Create phoneme pairs for visualization"""
+         ref_phones = reference.split() if reference else []
+         learner_phones = learner.split() if learner else []
+
+         # Use difflib for alignment visualization
+         import difflib
+         matcher = difflib.SequenceMatcher(None, ref_phones, learner_phones)
+
+         pairs = []
+         for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+             if tag == 'equal':
+                 for k in range(i2 - i1):
+                     pairs.append({
+                         "reference": ref_phones[i1 + k],
+                         "learner": learner_phones[j1 + k],
+                         "match": True,
+                         "type": "correct"
+                     })
+             elif tag == 'replace':
+                 max_len = max(i2 - i1, j2 - j1)
+                 for k in range(max_len):
+                     ref_phoneme = ref_phones[i1 + k] if i1 + k < i2 else ""
+                     learner_phoneme = learner_phones[j1 + k] if j1 + k < j2 else ""
+                     pairs.append({
+                         "reference": ref_phoneme,
+                         "learner": learner_phoneme,
+                         "match": False,
+                         "type": "substitution"
+                     })
+             elif tag == 'delete':
+                 for k in range(i1, i2):
+                     pairs.append({
+                         "reference": ref_phones[k],
+                         "learner": "",
+                         "match": False,
+                         "type": "deletion"
+                     })
+             elif tag == 'insert':
+                 for k in range(j1, j2):
+                     pairs.append({
+                         "reference": "",
+                         "learner": learner_phones[k],
+                         "match": False,
+                         "type": "insertion"
+                     })
+
+         return pairs
+
      def _get_word_status(self, score: float) -> str:
          """Get word status from score"""
          if score >= 0.8:
 
          else:
              return "#ef4444"  # Red
 
+     def _get_enhanced_vietnamese_tips(self, wrong_phonemes: List[Dict],
+                                       missing_phonemes: List[Dict]) -> List[str]:
+         """Enhanced Vietnamese-specific pronunciation tips"""
          tips = []
 
          vietnamese_tips = {
              "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
              "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
              "z": "Giống âm 's' nhưng có rung dây thanh âm",
              "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
              "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
+             "æ": "Mở miệng rộng hơn khi phát âm 'a'",
+             "ɪ": "Âm 'i' ngắn, không kéo dài như tiếng Việt"
          }
 
          for wrong in wrong_phonemes:
              expected = wrong["expected"]
              if expected in vietnamese_tips:
+                 tips.append(f"Âm /{expected}/: {vietnamese_tips[expected]}")
 
          for missing in missing_phonemes:
              phoneme = missing["phoneme"]
              if phoneme in vietnamese_tips:
+                 tips.append(f"Thiếu âm /{phoneme}/: {vietnamese_tips[phoneme]}")
 
          return tips
 
+ class EnhancedProsodyAnalyzer:
+     """Enhanced prosody analyzer for sentence-level assessment"""
 
+     def __init__(self):
+         # Expected values for English prosody
+         self.expected_speech_rate = 4.0  # syllables per second
+         self.expected_pitch_range = 100  # Hz
+         self.expected_pitch_cv = 0.3  # coefficient of variation
 
+     def analyze_prosody_enhanced(self, audio_features: Dict, reference_text: str) -> Dict:
+         """Enhanced prosody analysis with detailed scoring"""
+
+         if "error" in audio_features:
+             return self._empty_prosody_result()
+
+         duration = audio_features.get("duration", 1)
+         pitch_data = audio_features.get("pitch", {})
+         rhythm_data = audio_features.get("rhythm", {})
+         intensity_data = audio_features.get("intensity", {})
+
+         # Calculate syllables
+         num_syllables = self._estimate_syllables(reference_text)
+         actual_speech_rate = num_syllables / duration if duration > 0 else 0
+
+         # Calculate individual prosody scores
+         pace_score = self._calculate_pace_score(actual_speech_rate)
+         intonation_score = self._calculate_intonation_score(pitch_data)
+         rhythm_score = self._calculate_rhythm_score(rhythm_data, intensity_data)
+         stress_score = self._calculate_stress_score(pitch_data, intensity_data)
+
+         # Overall prosody score
+         overall_prosody = (pace_score + intonation_score + rhythm_score + stress_score) / 4
+
+         # Generate prosody feedback
+         feedback = self._generate_prosody_feedback(
+             pace_score, intonation_score, rhythm_score, stress_score,
+             actual_speech_rate, pitch_data
+         )
+
+         return {
+             "pace_score": pace_score,
+             "intonation_score": intonation_score,
+             "rhythm_score": rhythm_score,
+             "stress_score": stress_score,
+             "overall_prosody": overall_prosody,
+             "details": {
+                 "speech_rate": actual_speech_rate,
+                 "expected_speech_rate": self.expected_speech_rate,
+                 "syllable_count": num_syllables,
+                 "duration": duration,
+                 "pitch_analysis": pitch_data,
+                 "rhythm_analysis": rhythm_data,
+                 "intensity_analysis": intensity_data
+             },
+             "feedback": feedback
+         }
 
836
+ def _calculate_pace_score(self, actual_rate: float) -> float:
837
+ """Calculate pace score based on speech rate"""
838
+ if self.expected_speech_rate == 0:
839
+ return 0.5
840
+
841
+ ratio = actual_rate / self.expected_speech_rate
842
+
843
+ if 0.8 <= ratio <= 1.2:
844
+ return 1.0
845
+ elif 0.6 <= ratio < 0.8 or 1.2 < ratio <= 1.5:
846
+ return 0.7
847
+ elif 0.4 <= ratio < 0.6 or 1.5 < ratio <= 2.0:
848
+ return 0.4
849
  else:
850
+ return 0.1
851
 
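As a worked example of the banding, with the 4.0 syllables/second target above: a 5-syllable utterance spoken in 1.5 s gives a rate of about 3.3, a ratio of 0.83, which lands in the 0.8-1.2 band and scores 1.0; the same utterance stretched to 3 s drops the ratio to about 0.42 and the score to 0.4.
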
+     def _calculate_intonation_score(self, pitch_data: Dict) -> float:
+         """Calculate intonation score based on pitch variation"""
+         pitch_range = pitch_data.get("range", 0)
+
+         if self.expected_pitch_range == 0:
+             return 0.5
+
+         ratio = pitch_range / self.expected_pitch_range
+
+         if 0.7 <= ratio <= 1.3:
+             return 1.0
+         elif 0.5 <= ratio < 0.7 or 1.3 < ratio <= 1.8:
+             return 0.7
+         elif 0.3 <= ratio < 0.5 or 1.8 < ratio <= 2.5:
+             return 0.4
+         else:
+             return 0.2
+
+     def _calculate_rhythm_score(self, rhythm_data: Dict, intensity_data: Dict) -> float:
+         """Calculate rhythm score based on tempo and intensity patterns"""
+         tempo = rhythm_data.get("tempo", 120)
+         intensity_std = intensity_data.get("rms_std", 0)
+         intensity_mean = intensity_data.get("rms_mean", 0)
+
+         # Tempo score (60-180 BPM is good for speech)
+         if 60 <= tempo <= 180:
+             tempo_score = 1.0
+         elif 40 <= tempo < 60 or 180 < tempo <= 220:
+             tempo_score = 0.6
+         else:
+             tempo_score = 0.3
+
+         # Intensity consistency score
+         if intensity_mean > 0:
+             intensity_consistency = max(0, 1.0 - (intensity_std / intensity_mean))
+         else:
+             intensity_consistency = 0.5
+
+         return (tempo_score + intensity_consistency) / 2
+
+     def _calculate_stress_score(self, pitch_data: Dict, intensity_data: Dict) -> float:
+         """Calculate stress score based on pitch and intensity variation"""
+         pitch_cv = pitch_data.get("cv", 0)
+         intensity_std = intensity_data.get("rms_std", 0)
+         intensity_mean = intensity_data.get("rms_mean", 0)
+
+         # Pitch coefficient of variation score
+         if 0.2 <= pitch_cv <= 0.4:
+             pitch_score = 1.0
+         elif 0.1 <= pitch_cv < 0.2 or 0.4 < pitch_cv <= 0.6:
+             pitch_score = 0.7
+         else:
+             pitch_score = 0.4
+
+         # Intensity variation score
+         if intensity_mean > 0:
+             intensity_cv = intensity_std / intensity_mean
+             if 0.1 <= intensity_cv <= 0.3:
+                 intensity_score = 1.0
+             elif 0.05 <= intensity_cv < 0.1 or 0.3 < intensity_cv <= 0.5:
+                 intensity_score = 0.7
+             else:
+                 intensity_score = 0.4
+         else:
+             intensity_score = 0.5
+
+         return (pitch_score + intensity_score) / 2
 
+ def _generate_prosody_feedback(self, pace_score: float, intonation_score: float,
921
+ rhythm_score: float, stress_score: float,
922
+ speech_rate: float, pitch_data: Dict) -> List[str]:
923
+ """Generate detailed prosody feedback"""
924
+ feedback = []
925
+
926
+ if pace_score < 0.5:
927
+ if speech_rate < self.expected_speech_rate * 0.8:
928
+ feedback.append("Tốc độ nói hơi chậm, thử nói nhanh hơn một chút")
929
+ else:
930
+ feedback.append("Tốc độ nói hơi nhanh, thử nói chậm lại để rõ ràng hơn")
931
+ elif pace_score >= 0.8:
932
+ feedback.append("Tốc độ nói rất tự nhiên")
933
+
934
+ if intonation_score < 0.5:
935
+ feedback.append("Cần cải thiện ngữ điệu - thay đổi cao độ giọng nhiều hơn")
936
+ elif intonation_score >= 0.8:
937
+ feedback.append("Ngữ điệu rất tự nhiên và sinh động")
938
+
939
+ if rhythm_score < 0.5:
940
+ feedback.append("Nhịp điệu cần đều hơn - chú ý đến trọng âm của từ")
941
+ elif rhythm_score >= 0.8:
942
+ feedback.append("Nhịp điệu rất tốt")
943
+
944
+ if stress_score < 0.5:
945
+ feedback.append("Cần nhấn mạnh trọng âm rõ ràng hơn")
946
+ elif stress_score >= 0.8:
947
+ feedback.append("Trọng âm được nhấn rất tốt")
948
+
949
+ return feedback
950
 
951
+ def _estimate_syllables(self, text: str) -> int:
952
+ """Estimate number of syllables in text"""
953
+ vowels = "aeiouy"
954
+ text = text.lower()
955
+ syllable_count = 0
956
+ prev_was_vowel = False
957
+
958
+ for char in text:
959
+ if char in vowels:
960
+ if not prev_was_vowel:
961
+ syllable_count += 1
962
+ prev_was_vowel = True
963
+ else:
964
+ prev_was_vowel = False
965
+
966
+ if text.endswith('e'):
967
+ syllable_count -= 1
968
+
969
+ return max(1, syllable_count)
970
 
971
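The vowel-group heuristic is easy to probe standalone (sketch, not in the commit); it is deliberately rough, e.g. adjacent vowels merge into one group:

def estimate_syllables(text: str) -> int:
    vowels = "aeiouy"
    text = text.lower()
    count, prev_was_vowel = 0, False
    for ch in text:
        if ch in vowels:
            if not prev_was_vowel:
                count += 1
            prev_was_vowel = True
        else:
            prev_was_vowel = False
    if text.endswith('e'):
        count -= 1  # crude silent-e correction
    return max(1, count)

print(estimate_syllables("hello world"))    # 3 (e, o, o) - matches the real count
print(estimate_syllables("pronunciation"))  # 4 (o, u, ia, io) - real count is 5
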
+ def _empty_prosody_result(self) -> Dict:
972
+ """Return empty prosody result for error cases"""
973
+ return {
974
+ "pace_score": 0.5,
975
+ "intonation_score": 0.5,
976
+ "rhythm_score": 0.5,
977
+ "stress_score": 0.5,
978
+ "overall_prosody": 0.5,
979
+ "details": {},
980
+ "feedback": ["Không thể phân tích ngữ điệu"]
981
+ }
982
 
 
983
 
984
+ class EnhancedFeedbackGenerator:
985
+ """Enhanced feedback generator with detailed analysis"""
986
 
987
+ def generate_enhanced_feedback(self, overall_score: float, wrong_words: List[Dict],
988
+ phoneme_comparisons: List[Dict], mode: AssessmentMode,
989
+ prosody_analysis: Dict = None) -> List[str]:
990
+ """Generate comprehensive feedback based on assessment mode"""
991
+
992
+ feedback = []
993
+
994
+ # Overall score feedback
995
+ if overall_score >= 0.9:
996
+ feedback.append("Phát âm xuất sắc! Bạn đã làm rất tốt.")
997
+ elif overall_score >= 0.8:
998
+ feedback.append("Phát âm rất tốt! Chỉ còn một vài điểm nhỏ cần cải thiện.")
999
+ elif overall_score >= 0.6:
1000
+ feedback.append("Phát âm khá tốt, còn một số điểm cần luyện tập thêm.")
1001
+ elif overall_score >= 0.4:
1002
+ feedback.append("Cần luyện tập thêm. Tập trung vào những từ được đánh dấu.")
1003
+ else:
1004
+ feedback.append("Hãy luyện tập chậm rãi và rõ ràng hơn.")
1005
 
1006
+ # Mode-specific feedback
1007
+ if mode == AssessmentMode.WORD:
1008
+ feedback.extend(self._generate_word_mode_feedback(wrong_words, phoneme_comparisons))
1009
+ elif mode == AssessmentMode.SENTENCE:
1010
+ feedback.extend(self._generate_sentence_mode_feedback(wrong_words, prosody_analysis))
1011
 
1012
+ # Common error patterns
1013
+ error_patterns = self._analyze_error_patterns(phoneme_comparisons)
1014
+ if error_patterns:
1015
+ feedback.extend(error_patterns)
 
1016
 
1017
+ return feedback
 
 
 
1018
 
1019
+     def _generate_word_mode_feedback(self, wrong_words: List[Dict],
+                                      phoneme_comparisons: List[Dict]) -> List[str]:
+         """Generate feedback specific to word mode"""
+         feedback = []
+
+         if wrong_words:
+             if len(wrong_words) == 1:
+                 word = wrong_words[0]["word"]
+                 feedback.append(f"Từ '{word}' cần luyện tập thêm")
+
+                 # Character-level feedback
+                 char_errors = wrong_words[0].get("character_errors", [])
+                 if char_errors:
+                     error_chars = [err.character for err in char_errors[:3]]
+                     feedback.append(f"Chú ý các âm: {', '.join(error_chars)}")
+             else:
+                 word_list = [w["word"] for w in wrong_words[:3]]
+                 feedback.append(f"Các từ cần luyện: {', '.join(word_list)}")
 
+         return feedback
 
+     def _generate_sentence_mode_feedback(self, wrong_words: List[Dict],
+                                          prosody_analysis: Dict) -> List[str]:
+         """Generate feedback specific to sentence mode"""
+         feedback = []
+
+         # Word-level feedback
+         if wrong_words:
+             if len(wrong_words) <= 2:
+                 word_list = [w["word"] for w in wrong_words]
+                 feedback.append(f"Cần cải thiện: {', '.join(word_list)}")
+             else:
+                 feedback.append(f"{len(wrong_words)} từ cần luyện tập")
+
+         # Prosody feedback
+         if prosody_analysis and "feedback" in prosody_analysis:
+             feedback.extend(prosody_analysis["feedback"][:2])  # Limit prosody feedback
 
+         return feedback
 
+     def _analyze_error_patterns(self, phoneme_comparisons: List[Dict]) -> List[str]:
+         """Analyze common error patterns across phonemes"""
+         feedback = []
+
+         # Count error types
+         error_counts = defaultdict(int)
+         difficult_phonemes = defaultdict(int)
+
+         for comparison in phoneme_comparisons:
+             if comparison["status"] in ["wrong", "substitution"]:
+                 phoneme = comparison["reference_phoneme"]
+                 difficult_phonemes[phoneme] += 1
+                 error_counts[comparison["status"]] += 1
+
+         # Most problematic phoneme
+         if difficult_phonemes:
+             most_difficult = max(difficult_phonemes.items(), key=lambda x: x[1])
+             if most_difficult[1] >= 2:
+                 phoneme = most_difficult[0]
+                 phoneme_tips = {
+                     "θ": "Lưỡi giữa răng, thổi nhẹ",
+                     "ð": "Lưỡi giữa răng, rung dây thanh",
+                     "v": "Môi dưới chạm răng trên",
+                     "r": "Cuộn lưỡi nhẹ",
+                     "z": "Như 's' nhưng rung dây thanh"
+                 }
+
+                 if phoneme in phoneme_tips:
+                     feedback.append(f"Âm khó nhất /{phoneme}/: {phoneme_tips[phoneme]}")
+
+         return feedback
 
+ class ProductionPronunciationAssessor:
+     """Production-ready pronunciation assessor - Enhanced version with singleton pattern"""
+
+     _instance = None
+     _initialized = False
+
+     def __new__(cls, onnx: bool = False, quantized: bool = False):
+         if cls._instance is None:
+             cls._instance = super(ProductionPronunciationAssessor, cls).__new__(cls)
+         return cls._instance
+
+     def __init__(self, onnx: bool = False, quantized: bool = False):
+         """Initialize the production-ready pronunciation assessment system (only once)"""
+         if self._initialized:
+             return
+
+         logger.info("Initializing Production Pronunciation Assessment System...")
+
+         self.asr = EnhancedWav2Vec2CharacterASR(onnx=onnx, quantized=quantized)
+         self.word_analyzer = EnhancedWordAnalyzer()
+         self.prosody_analyzer = EnhancedProsodyAnalyzer()
+         self.feedback_generator = EnhancedFeedbackGenerator()
+         self.g2p = EnhancedG2P()
+
+         ProductionPronunciationAssessor._initialized = True
+         logger.info("Production system initialization completed")
 
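A small sketch of what the __new__/_initialized pair guarantees (hypothetical usage, not in the commit): every construction returns the same object, and the heavy __init__ body runs only once:

a = ProductionPronunciationAssessor()
b = ProductionPronunciationAssessor(onnx=True)
assert a is b  # same instance

One trade-off of this style: because __init__ returns early once _initialized is set, the onnx=True request on the second call is silently ignored; the first caller's configuration wins for the process lifetime.
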
+     def assess_pronunciation(self, audio_path: str, reference_text: str,
+                              mode: str = "auto") -> Dict:
          """
+         Main assessment function with enhanced features
+
          Args:
              audio_path: Path to audio file
+             reference_text: Reference text to compare against
+             mode: Assessment mode ("word", "sentence", "auto", or legacy modes)
+
          Returns:
+             Enhanced assessment results with backward compatibility
          """
+         logger.info(f"Starting production assessment in {mode} mode...")
+         start_time = time.time()
 
+         try:
+             # Normalize and validate mode
+             assessment_mode = self._normalize_mode(mode, reference_text)
+             logger.info(f"Using assessment mode: {assessment_mode.value}")
+
+             # Step 1: Enhanced ASR transcription with features
+             asr_result = self.asr.transcribe_with_features(audio_path)
+
+             if not asr_result["character_transcript"]:
+                 return self._create_error_result("No speech detected in audio")
+
+             # Step 2: Enhanced word analysis
+             analysis_result = self.word_analyzer.analyze_words_enhanced(
+                 reference_text,
+                 asr_result["phoneme_representation"],
+                 assessment_mode
+             )
+
+             # Step 3: Calculate overall score
+             overall_score = self._calculate_overall_score(analysis_result["phoneme_differences"])
+
+             # Step 4: Prosody analysis for sentence mode
+             prosody_analysis = {}
+             if assessment_mode == AssessmentMode.SENTENCE:
+                 prosody_analysis = self.prosody_analyzer.analyze_prosody_enhanced(
+                     asr_result["audio_features"],
+                     reference_text
+                 )
+
+             # Step 5: Generate enhanced feedback
+             feedback = self.feedback_generator.generate_enhanced_feedback(
+                 overall_score,
+                 analysis_result["wrong_words"],
+                 analysis_result["phoneme_differences"],
+                 assessment_mode,
+                 prosody_analysis
+             )
+
+             # Step 6: Create phoneme comparison summary
+             phoneme_comparison_summary = self._create_phoneme_comparison_summary(
+                 analysis_result["phoneme_pairs"]
+             )
+
+             # Step 7: Assemble result with backward compatibility
+             result = self._create_enhanced_result(
+                 asr_result, analysis_result, overall_score, feedback,
+                 prosody_analysis, phoneme_comparison_summary, assessment_mode
+             )
+
+             # Add processing metadata
+             processing_time = time.time() - start_time
+             result["processing_info"] = {
+                 "processing_time": round(processing_time, 2),
+                 "mode": assessment_mode.value,
+                 "model_used": "Wav2Vec2-Enhanced",
+                 "onnx_enabled": self.asr.use_onnx,
+                 "confidence": asr_result["confidence"],
+                 "enhanced_features": True,
+                 "character_level_analysis": assessment_mode == AssessmentMode.WORD,
+                 "prosody_analysis": assessment_mode == AssessmentMode.SENTENCE
+             }
+
+             logger.info(f"Production assessment completed in {processing_time:.2f}s")
+             return result
+
+         except Exception as e:
+             logger.error(f"Production assessment error: {e}")
+             return self._create_error_result(f"Assessment failed: {str(e)}")
 
+     def _normalize_mode(self, mode: str, reference_text: str) -> AssessmentMode:
+         """Normalize mode parameter with backward compatibility"""
+
+         # Legacy mode mapping
+         legacy_mapping = {
+             "normal": AssessmentMode.AUTO,
+             "advanced": AssessmentMode.AUTO
+         }
+
+         if mode in legacy_mapping:
+             normalized_mode = legacy_mapping[mode]
+             logger.info(f"Mapped legacy mode '{mode}' to '{normalized_mode.value}'")
+             mode = normalized_mode.value
+
+         # Validate mode
+         try:
+             assessment_mode = AssessmentMode(mode)
+         except ValueError:
+             logger.warning(f"Invalid mode '{mode}', defaulting to AUTO")
+             assessment_mode = AssessmentMode.AUTO
+
+         # Auto-detect mode based on text length
+         if assessment_mode == AssessmentMode.AUTO:
+             word_count = len(reference_text.strip().split())
+             assessment_mode = AssessmentMode.WORD if word_count <= 3 else AssessmentMode.SENTENCE
+             logger.info(f"Auto-detected mode: {assessment_mode.value} (word count: {word_count})")
+
+         return assessment_mode
 
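To illustrate the rules above: mode="normal" is first mapped to AUTO, so with the reference "how are you today" (4 words) it resolves to SENTENCE, while with "hello" it resolves to WORD; an unrecognized string such as "fast" logs a warning and takes the same AUTO path.
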
+     def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
+         """Calculate weighted overall score"""
+         if not phoneme_comparisons:
+             return 0.0
+
+         total_weighted_score = 0.0
+         total_weight = 0.0
+
+         for comparison in phoneme_comparisons:
+             weight = comparison.get("difficulty", 0.5)  # Use difficulty as weight
+             score = comparison["score"]
+
+             total_weighted_score += score * weight
+             total_weight += weight
+
+         return total_weighted_score / total_weight if total_weight > 0 else 0.0
 
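A worked example of the weighting: a /θ/ scored 0.2 at difficulty 0.9 plus a /k/ scored 1.0 at difficulty 0.2 gives (0.2 * 0.9 + 1.0 * 0.2) / (0.9 + 0.2) ≈ 0.35, noticeably lower than the unweighted mean of 0.6 - errors on phonemes that are hard for Vietnamese speakers pull the overall score down more.
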
      def _create_phoneme_comparison_summary(self, phoneme_pairs: List[Dict]) -> Dict:
+         """Create phoneme comparison summary statistics"""
          total = len(phoneme_pairs)
+         if total == 0:
+             return {"total_phonemes": 0, "accuracy_percentage": 0}
+
          correct = sum(1 for pair in phoneme_pairs if pair["match"])
          substitutions = sum(1 for pair in phoneme_pairs if pair["type"] == "substitution")
          deletions = sum(1 for pair in phoneme_pairs if pair["type"] == "deletion")
 
              "substitutions": substitutions,
              "deletions": deletions,
              "insertions": insertions,
+             "accuracy_percentage": round((correct / total) * 100, 1),
+             "error_rate": round(((substitutions + deletions + insertions) / total) * 100, 1)
          }
 
+     def _create_enhanced_result(self, asr_result: Dict, analysis_result: Dict,
+                                 overall_score: float, feedback: List[str],
+                                 prosody_analysis: Dict, phoneme_summary: Dict,
+                                 assessment_mode: AssessmentMode) -> Dict:
+         """Create enhanced result with backward compatibility"""
+
+         # Base result structure (backward compatible)
+         result = {
+             "transcript": asr_result["character_transcript"],
+             "transcript_phonemes": asr_result["phoneme_representation"],
+             "user_phonemes": asr_result["phoneme_representation"],
+             "character_transcript": asr_result["character_transcript"],
+             "overall_score": overall_score,
+             "word_highlights": analysis_result["word_highlights"],
+             "phoneme_differences": analysis_result["phoneme_differences"],
+             "wrong_words": analysis_result["wrong_words"],
+             "feedback": feedback,
+         }
+
+         # Enhanced features
+         result.update({
+             "reference_phonemes": analysis_result["reference_phonemes"],
+             "phoneme_pairs": analysis_result["phoneme_pairs"],
+             "phoneme_comparison": phoneme_summary,
+             "assessment_mode": assessment_mode.value,
+         })
+
+         # Add prosody analysis for sentence mode
+         if prosody_analysis:
+             result["prosody_analysis"] = prosody_analysis
+
+         # Add character-level analysis for word mode
+         if assessment_mode == AssessmentMode.WORD:
+             result["character_level_analysis"] = True
 
+             # Add character errors to word highlights if available
+             for word_highlight in result["word_highlights"]:
+                 if "character_errors" in word_highlight:
+                     # Convert CharacterError objects to dicts for JSON serialization
+                     char_errors = []
+                     for error in word_highlight["character_errors"]:
+                         if isinstance(error, CharacterError):
+                             char_errors.append({
+                                 "character": error.character,
+                                 "position": error.position,
+                                 "error_type": error.error_type,
+                                 "expected_sound": error.expected_sound,
+                                 "actual_sound": error.actual_sound,
+                                 "severity": error.severity,
+                                 "color": error.color
+                             })
+                         else:
+                             char_errors.append(error)
+                     word_highlight["character_errors"] = char_errors
+
+         return result
 
+     def _create_error_result(self, error_message: str) -> Dict:
+         """Create error result structure"""
+         return {
+             "transcript": "",
+             "transcript_phonemes": "",
+             "user_phonemes": "",
+             "character_transcript": "",
+             "overall_score": 0.0,
+             "word_highlights": [],
+             "phoneme_differences": [],
+             "wrong_words": [],
+             "feedback": [f"Lỗi: {error_message}"],
+             "error": error_message,
+             "assessment_mode": "error",
+             "processing_info": {
+                 "processing_time": 0,
+                 "mode": "error",
+                 "model_used": "Wav2Vec2-Enhanced",
+                 "confidence": 0.0,
+                 "enhanced_features": False
+             }
+         }
+
+     def get_system_info(self) -> Dict:
+         """Get comprehensive system information"""
+         return {
+             "version": "2.1.0-production",
+             "name": "Production Pronunciation Assessment System",
+             "modes": [mode.value for mode in AssessmentMode],
+             "features": [
+                 "Enhanced Levenshtein distance phoneme alignment",
+                 "Character-level error detection (word mode)",
+                 "Advanced prosody analysis (sentence mode)",
+                 "Vietnamese speaker-specific error patterns",
+                 "Real-time confidence scoring",
+                 "IPA phonetic representation with visualization",
+                 "Backward compatibility with legacy APIs",
+                 "Production-ready error handling"
+             ],
+             "model_info": {
+                 "asr_model": self.asr.model_name,
+                 "onnx_enabled": self.asr.use_onnx,
+                 "sample_rate": self.asr.sample_rate
+             },
+             "assessment_modes": {
+                 "word": "Detailed character and phoneme level analysis for single words or short phrases",
+                 "sentence": "Word-level analysis with prosody evaluation for complete sentences",
+                 "auto": "Automatically selects mode based on text length (≤3 words = word mode)"
+             }
+         }
+
+
+ # Backward compatibility wrapper
+ class SimplePronunciationAssessor:
+     """Backward compatible wrapper for the enhanced system"""
+
+     def __init__(self):
+         print("Initializing Simple Pronunciation Assessor (Enhanced)...")
+         self.enhanced_assessor = ProductionPronunciationAssessor()
+         print("Enhanced Simple Pronunciation Assessor initialization completed")
+
+     def assess_pronunciation(self, audio_path: str, reference_text: str,
+                              mode: str = "normal") -> Dict:
+         """
+         Backward compatible assessment function
+
+         Args:
+             audio_path: Path to audio file
+             reference_text: Reference text to compare
+             mode: Assessment mode (supports legacy modes)
+         """
+         return self.enhanced_assessor.assess_pronunciation(audio_path, reference_text, mode)
+
+
+ # Example usage
+ if __name__ == "__main__":
+     # Initialize production system
+     system = ProductionPronunciationAssessor(onnx=False, quantized=False)
+
+     # Example word mode assessment
+     print("=== WORD MODE EXAMPLE ===")
+     word_result = system.assess_pronunciation(
+         audio_path="./hello_world.wav",
+         reference_text="hello",
+         mode="word"
+     )
+     # print(f"Word mode result keys: {list(word_result.keys())}")
+     print("Word result", word_result)
+
+     # Example sentence mode assessment
+     print("\n=== SENTENCE MODE EXAMPLE ===")
+     sentence_result = system.assess_pronunciation(
+         audio_path="./hello_how_are_you_today.wav",
+         reference_text="Hello, how are you today?",
+         mode="sentence"
+     )
+     print(f"Sentence mode result keys: {list(sentence_result.keys())}")
+     print("Sentence result", sentence_result)
+
+     # Example auto mode assessment
+     print("\n=== AUTO MODE EXAMPLE ===")
+     auto_result = system.assess_pronunciation(
+         audio_path="./hello_how_are_you_today.wav",
+         reference_text="world",  # Single word - should auto-select word mode
+         mode="auto"
+     )
+     print(f"Auto mode result: {auto_result['assessment_mode']}")
+     print("Auto result", auto_result)
+
+     # Backward compatibility test
+     print("\n=== BACKWARD COMPATIBILITY TEST ===")
+     legacy_assessor = SimplePronunciationAssessor()
+     legacy_result = legacy_assessor.assess_pronunciation(
+         audio_path="./hello_world.wav",
+         reference_text="pronunciation",
+         mode="normal"  # Legacy mode
+     )
+     print(f"Legacy mode result: {legacy_result}")
+     print(f"Legacy mode mapped to: {legacy_result.get('assessment_mode', 'N/A')}")
+
+     # System info
+     print(f"\n=== SYSTEM INFO ===")
+     system_info = system.get_system_info()
+     print(f"System version: {system_info['version']}")
+     print(f"Available modes: {system_info['modes']}")
+     print(f"Key features: {len(system_info['features'])} enhanced features")
src/apis/create_app.py CHANGED
@@ -6,6 +6,8 @@ from src.apis.routes.lesson_route import router as router_lesson
6
  from src.apis.routes.evaluation_route import router as router_evaluation
7
  from src.apis.routes.pronunciation_route import router as router_pronunciation
8
  from src.apis.routes.speaking_route import router as router_speaking
9
+ from src.apis.routes.ipa_route import router as router_ipa
10
+ from loguru import logger
11
 
12
  api_router = APIRouter(prefix="/api")
13
  api_router.include_router(router_user)
@@ -14,6 +16,7 @@ api_router.include_router(router_lesson)
16
  api_router.include_router(router_evaluation)
17
  api_router.include_router(router_pronunciation)
18
  api_router.include_router(router_speaking)
19
+ api_router.include_router(router_ipa)
20
 
21
 
22
  def create_app():
@@ -27,4 +30,19 @@ def create_app():
30
  allow_headers=["*"],
31
  )
32
 
33
+ @app.on_event("startup")
34
+ async def startup_event():
35
+ """Pre-initialize assessor on server startup for better performance"""
36
+ try:
37
+ logger.info("Pre-initializing ProductionPronunciationAssessor...")
38
+ from src.apis.routes.speaking_route import get_assessor
39
+ from src.apis.routes.ipa_route import get_assessor as get_ipa_assessor
40
+
41
+ # Pre-initialize both assessors (they share the same singleton)
42
+ get_assessor()
43
+ get_ipa_assessor()
44
+ logger.info("ProductionPronunciationAssessor pre-initialization completed!")
45
+ except Exception as e:
46
+ logger.error(f"Failed to pre-initialize assessor: {e}")
47
+
48
  return app
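
Note: @app.on_event("startup") is deprecated in recent FastAPI releases in favor of a lifespan handler. An equivalent sketch (not part of this commit) that keeps the same pre-initialization behavior:

    from contextlib import asynccontextmanager

    from fastapi import FastAPI
    from loguru import logger

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Pre-initialize the shared assessor before the app starts serving
        try:
            from src.apis.routes.speaking_route import get_assessor
            get_assessor()
            logger.info("Assessor pre-initialization completed")
        except Exception as e:
            logger.error(f"Failed to pre-initialize assessor: {e}")
        yield  # the application runs here; shutdown cleanup would go after

    app = FastAPI(lifespan=lifespan)
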
src/apis/routes/ipa_route.py ADDED
@@ -0,0 +1,1763 @@
1
+ from fastapi import APIRouter, HTTPException, Query, UploadFile, File, Form
2
+ from pydantic import BaseModel
3
+ from typing import List, Dict, Optional
5
+ import random
7
+ import tempfile
8
+ import os
11
+ from loguru import logger
12
+ from src.apis.controllers.speaking_controller import (
13
+ EnhancedG2P,
14
+ ProductionPronunciationAssessor,
15
+ )
16
+
17
+
18
+ class CharacterMapping(BaseModel):
19
+ ipa_symbol: Optional[str] = None
20
+ grapheme: Optional[str] = None
21
+ start_index: Optional[int] = None
22
+ end_index: Optional[int] = None
23
+ characters: Optional[str] = None
24
+ chars: Optional[str] = None
25
+ ipa: Optional[str] = None
26
+ start: Optional[int] = None
27
+ end: Optional[int] = None
28
+
29
+
30
+ router = APIRouter(prefix="/ipa", tags=["IPA Training"])
31
+
32
+ # Initialize G2P converter and assessment system once (singleton pattern)
33
+ g2p = EnhancedG2P()
34
+ # Global assessor instance - will be initialized once due to singleton pattern
35
+ global_assessor = None
36
+
37
+
38
+ def get_assessor():
39
+ """Get or create the global assessor instance"""
40
+ global global_assessor
41
+ if global_assessor is None:
42
+ logger.info("Creating global ProductionPronunciationAssessor instance...")
43
+ global_assessor = ProductionPronunciationAssessor()
44
+ return global_assessor
45
+
46
+
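
Note: this lazy singleton is not thread-safe; two requests arriving before the first initialization completes could construct the heavy model twice. A lock-guarded variant (a sketch, not part of the commit, reusing this module's global_assessor) would be:

    import threading

    _assessor_lock = threading.Lock()

    def get_assessor_threadsafe():
        """Double-checked locking so the model loads exactly once."""
        global global_assessor
        if global_assessor is None:
            with _assessor_lock:
                if global_assessor is None:
                    global_assessor = ProductionPronunciationAssessor()
        return global_assessor

In practice the startup hook in create_app.py sidesteps the race, since the instance already exists before traffic arrives.
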
47
+ def map_ipa_to_characters(word: str, ipa_symbol: str) -> List[CharacterMapping]:
48
+ """
49
+ Map IPA symbols to their corresponding characters in the word
50
+ Returns a list of character mappings for highlighting
51
+ """
52
+ # Common IPA to grapheme mappings
53
+ ipa_mappings = {
54
+ # Vowels
55
+ "i": [
56
+ "ee",
57
+ "ea",
58
+ "e",
59
+ "ie",
60
+ "ei",
61
+ "i",
62
+ ], # see, eat, me, piece, receive, machine
63
+ "ɪ": ["i", "y", "ui", "e"], # sit, gym, build, women
64
+ "u": ["oo", "u", "ou", "ue", "ui", "o"], # food, flu, soup, true, fruit, do
65
+ "ʊ": ["oo", "u", "ou"], # book, put, could
66
+ "ɛ": ["e", "ea", "ai", "a"], # bed, head, said, many
67
+ "ə": [
68
+ "a",
69
+ "e",
70
+ "i",
71
+ "o",
72
+ "u",
73
+ "ou",
74
+ "ar",
75
+ "er",
76
+ "or",
77
+ ], # about, taken, pencil, lemon, circus, famous, dollar, butter, doctor
78
+ "ʌ": ["u", "o", "ou", "oo"], # cup, love, country, blood
79
+ "ɑ": ["a", "o", "au"], # father, hot, aunt
80
+ "æ": ["a"], # cat, apple
81
+ "ɔ": ["o", "aw", "au", "a", "ou"], # saw, law, caught, all, thought
82
+ # Diphthongs
83
+ "eɪ": ["a", "ai", "ay", "ei", "ey", "ea"], # say, wait, day, eight, grey, break
84
+ "aɪ": ["i", "y", "ie", "uy", "ai", "igh"], # my, fly, pie, buy, aisle, night
85
+ "ɔɪ": ["oy", "oi"], # boy, coin
86
+ "aʊ": ["ou", "ow"], # how, house
87
+ "oʊ": ["o", "oa", "ow", "oe", "ou"], # go, boat, show, toe, soul
88
+ # Consonants
89
+ "p": ["p", "pp"], # pen, apple
90
+ "b": ["b", "bb"], # boy, rabbit
91
+ "t": ["t", "tt", "ed"], # top, butter, walked
92
+ "d": ["d", "dd", "ed"], # dog, ladder, played
93
+ "k": ["c", "k", "ck", "ch", "qu"], # cat, key, back, school, queen
94
+ "g": ["g", "gg", "gh", "gu"], # go, egg, ghost, guard
95
+ "f": ["f", "ff", "ph", "gh"], # fish, off, phone, laugh
96
+ "v": ["v", "ve"], # very, have
97
+ "θ": ["th"], # think
98
+ "ð": ["th"], # this
99
+ "s": ["s", "ss", "c", "sc", "ps"], # see, miss, city, scene, psychology
100
+ "z": ["z", "zz", "s", "se", "ze"], # zoo, buzz, is, rose, froze
101
+ "ʃ": [
102
+ "sh",
103
+ "s",
104
+ "ss",
105
+ "ch",
106
+ "ci",
107
+ "ti",
108
+ ], # ship, sure, mission, machine, special, nation
109
+ "ʒ": ["s", "si", "ge"], # measure, vision, garage
110
+ "tʃ": ["ch", "tch", "t"], # chair, watch, nature
111
+ "dʒ": ["j", "ge", "dge", "g"], # job, age, bridge, gym
112
+ "m": ["m", "mm", "mb"], # man, hammer, lamb
113
+ "n": ["n", "nn", "kn", "gn"], # no, dinner, knee, sign
114
+ "ŋ": ["ng", "n"], # sing, think
115
+ "l": ["l", "ll"], # love, hello
116
+ "r": ["r", "rr", "wr"], # red, sorry, write
117
+ "j": ["y", "i", "j"], # yes, onion, hallelujah
118
+ "w": ["w", "wh", "qu", "u"], # we, what, queen, language
119
+ "h": ["h", "wh"], # house, who
120
+ }
121
+
122
+ # Get possible grapheme representations for the IPA symbol
123
+ possible_graphemes = ipa_mappings.get(ipa_symbol, [])
124
+
125
+ # Find the best match in the word
126
+ word_lower = word.lower()
127
+ mappings = []
128
+
129
+ for grapheme in possible_graphemes:
130
+ start_pos = word_lower.find(grapheme)
131
+ if start_pos != -1:
132
+ mappings.append(
133
+ CharacterMapping(
134
+ ipa_symbol=ipa_symbol,
135
+ grapheme=grapheme,
136
+ start_index=start_pos,
137
+ end_index=start_pos + len(grapheme),
138
+ characters=word[start_pos : start_pos + len(grapheme)],
139
+ )
140
+ )
141
+ break # Use the first match found
142
+
143
+ # If no direct match found, try to match individual characters
144
+ if not mappings and ipa_symbol in word_lower:
145
+ start_pos = word_lower.find(ipa_symbol)
146
+ if start_pos != -1:
147
+ mappings.append(
148
+ CharacterMapping(
149
+ ipa_symbol=ipa_symbol,
150
+ grapheme=ipa_symbol,
151
+ start_index=start_pos,
152
+ end_index=start_pos + len(ipa_symbol),
153
+ characters=word[start_pos : start_pos + len(ipa_symbol)],
154
+ )
155
+ )
156
+
157
+ return mappings
158
+
159
+
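
For reference, given the table above, map_ipa_to_characters("see", "i") takes the first grapheme candidate that occurs in the word ("ee" at index 1):

    m = map_ipa_to_characters("see", "i")[0]
    assert (m.grapheme, m.start_index, m.end_index, m.characters) == ("ee", 1, 3, "ee")
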
160
+ def map_word_to_phonemes(word: str, ipa_transcription: str) -> List[CharacterMapping]:
161
+ """
162
+ Map an entire word to its phoneme sequence
163
+ Returns detailed character to IPA mappings for the whole word
164
+ """
165
+ # Clean the IPA transcription
166
+ clean_ipa = ipa_transcription.strip("/").replace("ˈ", "").replace("ˌ", "")
167
+
168
+ # Common word-to-IPA mappings for better accuracy
169
+ word_mappings = {
170
+ # Easy words
171
+ "cat": [
172
+ CharacterMapping(
173
+ characters="c", ipa_symbol="k", start_index=0, end_index=1
174
+ ),
175
+ CharacterMapping(
176
+ characters="a", ipa_symbol="æ", start_index=1, end_index=2
177
+ ),
178
+ CharacterMapping(
179
+ characters="t", ipa_symbol="t", start_index=2, end_index=3
180
+ ),
181
+ ],
182
+ "dog": [
183
+ CharacterMapping(
184
+ characters="d", ipa_symbol="d", start_index=0, end_index=1
185
+ ),
186
+ CharacterMapping(
187
+ characters="o", ipa_symbol="ɔ", start_index=1, end_index=2
188
+ ),
189
+ CharacterMapping(
190
+ characters="g", ipa_symbol="g", start_index=2, end_index=3
191
+ ),
192
+ ],
193
+ "pen": [
194
+ CharacterMapping(
195
+ characters="p", ipa_symbol="p", start_index=0, end_index=1
196
+ ),
197
+ CharacterMapping(
198
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
199
+ ),
200
+ CharacterMapping(
201
+ characters="n", ipa_symbol="n", start_index=2, end_index=3
202
+ ),
203
+ ],
204
+ "see": [
205
+ CharacterMapping(
206
+ characters="s", ipa_symbol="s", start_index=0, end_index=1
207
+ ),
208
+ CharacterMapping(
209
+ characters="ee", ipa_symbol="i", start_index=1, end_index=3
210
+ ),
211
+ ],
212
+ "bed": [
213
+ CharacterMapping(
214
+ characters="b", ipa_symbol="b", start_index=0, end_index=1
215
+ ),
216
+ CharacterMapping(
217
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
218
+ ),
219
+ CharacterMapping(
220
+ characters="d", ipa_symbol="d", start_index=2, end_index=3
221
+ ),
222
+ ],
223
+ "fish": [
224
+ CharacterMapping(
225
+ characters="f", ipa_symbol="f", start_index=0, end_index=1
226
+ ),
227
+ CharacterMapping(
228
+ characters="i", ipa_symbol="ɪ", start_index=1, end_index=2
229
+ ),
230
+ CharacterMapping(
231
+ characters="sh", ipa_symbol="ʃ", start_index=2, end_index=4
232
+ ),
233
+ ],
234
+ "book": [
235
+ CharacterMapping(
236
+ characters="b", ipa_symbol="b", start_index=0, end_index=1
237
+ ),
238
+ CharacterMapping(
239
+ characters="oo", ipa_symbol="ʊ", start_index=1, end_index=3
240
+ ),
241
+ CharacterMapping(
242
+ characters="k", ipa_symbol="k", start_index=3, end_index=4
243
+ ),
244
+ ],
245
+ "food": [
246
+ CharacterMapping(
247
+ characters="f", ipa_symbol="f", start_index=0, end_index=1
248
+ ),
249
+ CharacterMapping(
250
+ characters="oo", ipa_symbol="u", start_index=1, end_index=3
251
+ ),
252
+ CharacterMapping(
253
+ characters="d", ipa_symbol="d", start_index=3, end_index=4
254
+ ),
255
+ ],
256
+ "man": [
257
+ CharacterMapping(
258
+ characters="m", ipa_symbol="m", start_index=0, end_index=1
259
+ ),
260
+ CharacterMapping(
261
+ characters="a", ipa_symbol="æ", start_index=1, end_index=2
262
+ ),
263
+ CharacterMapping(
264
+ characters="n", ipa_symbol="n", start_index=2, end_index=3
265
+ ),
266
+ ],
267
+ "sun": [
268
+ CharacterMapping(
269
+ characters="s", ipa_symbol="s", start_index=0, end_index=1
270
+ ),
271
+ CharacterMapping(
272
+ characters="u", ipa_symbol="ʌ", start_index=1, end_index=2
273
+ ),
274
+ CharacterMapping(
275
+ characters="n", ipa_symbol="n", start_index=2, end_index=3
276
+ ),
277
+ ],
278
+ # Medium words
279
+ "chair": [
280
+ CharacterMapping(
281
+ characters="ch", ipa_symbol="tʃ", start_index=0, end_index=2
282
+ ),
283
+ CharacterMapping(
284
+ characters="ai", ipa_symbol="ɛ", start_index=2, end_index=4
285
+ ),
286
+ CharacterMapping(
287
+ characters="r", ipa_symbol="r", start_index=4, end_index=5
288
+ ),
289
+ ],
290
+ "water": [
291
+ CharacterMapping(
292
+ characters="w", ipa_symbol="w", start_index=0, end_index=1
293
+ ),
294
+ CharacterMapping(
295
+ characters="a", ipa_symbol="ɔ", start_index=1, end_index=2
296
+ ),
297
+ CharacterMapping(
298
+ characters="t", ipa_symbol="t", start_index=2, end_index=3
299
+ ),
300
+ CharacterMapping(
301
+ characters="er", ipa_symbol="ər", start_index=3, end_index=5
302
+ ),
303
+ ],
304
+ "house": [
305
+ CharacterMapping(
306
+ characters="h", ipa_symbol="h", start_index=0, end_index=1
307
+ ),
308
+ CharacterMapping(
309
+ characters="ou", ipa_symbol="aʊ", start_index=1, end_index=3
310
+ ),
311
+ CharacterMapping(
312
+ characters="se", ipa_symbol="s", start_index=3, end_index=5
313
+ ),
314
+ ],
315
+ "yellow": [
316
+ CharacterMapping(
317
+ characters="y", ipa_symbol="j", start_index=0, end_index=1
318
+ ),
319
+ CharacterMapping(
320
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
321
+ ),
322
+ CharacterMapping(
323
+ characters="ll", ipa_symbol="l", start_index=2, end_index=4
324
+ ),
325
+ CharacterMapping(
326
+ characters="ow", ipa_symbol="oʊ", start_index=4, end_index=6
327
+ ),
328
+ ],
329
+ "about": [
330
+ CharacterMapping(
331
+ characters="a", ipa_symbol="ə", start_index=0, end_index=1
332
+ ),
333
+ CharacterMapping(
334
+ characters="b", ipa_symbol="b", start_index=1, end_index=2
335
+ ),
336
+ CharacterMapping(
337
+ characters="ou", ipa_symbol="aʊ", start_index=2, end_index=4
338
+ ),
339
+ CharacterMapping(
340
+ characters="t", ipa_symbol="t", start_index=4, end_index=5
341
+ ),
342
+ ],
343
+ # Hard words
344
+ "think": [
345
+ CharacterMapping(
346
+ characters="th", ipa_symbol="θ", start_index=0, end_index=2
347
+ ),
348
+ CharacterMapping(
349
+ characters="i", ipa_symbol="ɪ", start_index=2, end_index=3
350
+ ),
351
+ CharacterMapping(
352
+ characters="nk", ipa_symbol="ŋk", start_index=3, end_index=5
353
+ ),
354
+ ],
355
+ "this": [
356
+ CharacterMapping(
357
+ characters="th", ipa_symbol="ð", start_index=0, end_index=2
358
+ ),
359
+ CharacterMapping(
360
+ characters="i", ipa_symbol="ɪ", start_index=2, end_index=3
361
+ ),
362
+ CharacterMapping(
363
+ characters="s", ipa_symbol="s", start_index=3, end_index=4
364
+ ),
365
+ ],
366
+ "very": [
367
+ CharacterMapping(
368
+ characters="v", ipa_symbol="v", start_index=0, end_index=1
369
+ ),
370
+ CharacterMapping(
371
+ characters="e", ipa_symbol="ɛ", start_index=1, end_index=2
372
+ ),
373
+ CharacterMapping(
374
+ characters="r", ipa_symbol="r", start_index=2, end_index=3
375
+ ),
376
+ CharacterMapping(
377
+ characters="y", ipa_symbol="i", start_index=3, end_index=4
378
+ ),
379
+ ],
380
+ "through": [
381
+ CharacterMapping(
382
+ characters="th", ipa_symbol="θ", start_index=0, end_index=2
383
+ ),
384
+ CharacterMapping(
385
+ characters="r", ipa_symbol="r", start_index=2, end_index=3
386
+ ),
387
+ CharacterMapping(
388
+ characters="ough", ipa_symbol="u", start_index=3, end_index=7
389
+ ),
390
+ ],
391
+ "measure": [
392
+ CharacterMapping(
393
+ characters="m", ipa_symbol="m", start_index=0, end_index=1
394
+ ),
395
+ CharacterMapping(
396
+ characters="ea", ipa_symbol="ɛ", start_index=1, end_index=3
397
+ ),
398
+ CharacterMapping(
399
+ characters="s", ipa_symbol="ʒ", start_index=3, end_index=4
400
+ ),
401
+ CharacterMapping(
402
+ characters="ure", ipa_symbol="ər", start_index=4, end_index=7
403
+ ),
404
+ ],
405
+ }
406
+
407
+ # Check if we have a predefined mapping
408
+ if word.lower() in word_mappings:
409
+ return word_mappings[word.lower()]
410
+
411
+ # If no predefined mapping, try to create a basic mapping
412
+ # This is a simplified approach - in production, you'd use a more sophisticated G2P system
413
+ mappings = []
415
+
416
+ # Basic character-by-character mapping (fallback)
417
+ for i, char in enumerate(word.lower()):
418
+ if char.isalpha():
419
+ mappings.append(
420
+ CharacterMapping(
421
+ characters=word[i],
422
+ ipa_symbol=char, # Simplified - would need actual phoneme mapping
423
+ start_index=i,
424
+ end_index=i + 1,
425
+ )
426
+ )
427
+
428
+ return mappings
429
+
430
+
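
Usage note: curated words return hand-written mappings, while the fallback maps each letter to itself, so digraphs get split; both paths are visible here:

    fish = map_word_to_phonemes("fish", "/fɪʃ/")
    assert [(m.characters, m.ipa_symbol) for m in fish] == [("f", "f"), ("i", "ɪ"), ("sh", "ʃ")]

    # "myth" has no curated entry, so the letter-for-letter fallback fires
    # and /θ/ never appears -- "th" comes back as two pseudo-phonemes.
    myth = map_word_to_phonemes("myth", "/mɪθ/")
    assert [m.ipa_symbol for m in myth] == ["m", "y", "t", "h"]
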
431
+ class IPASymbol(BaseModel):
432
+ symbol: str
433
+ description: str
434
+ example_word: str
435
+ audio_example: Optional[str] = None
436
+ category: str # vowel, consonant, diphthong
437
+ difficulty_level: str # easy, medium, hard
438
+ vietnamese_tip: str
439
+ character_mapping: Optional[List[CharacterMapping]] = None
440
+
441
+
442
+ class IPALesson(BaseModel):
443
+ id: str
444
+ title: str
445
+ description: str
446
+ symbols: List[IPASymbol]
447
+ difficulty: str
448
+ estimated_time: int # minutes
449
+
450
+
451
+ class IPAWord(BaseModel):
452
+ word: str
453
+ ipa: str
454
+ phonemes: List[str]
455
+ difficulty: str
456
+ meaning: str
457
+ example_sentence: str
458
+ character_mapping: Optional[List[CharacterMapping]] = None
459
+
460
+
461
+ class IPAExercise(BaseModel):
462
+ word: str
463
+ ipa: str
464
+ phonemes: List[str]
465
+ hints: List[str]
466
+ difficulty: str
467
+
468
+
469
+ # IPA Symbol data for Vietnamese learners
470
+ IPA_SYMBOLS_DATA = {
471
+ # Vowels - Easy
472
+ "i": {
473
+ "desc": "High front unrounded vowel",
474
+ "word": "see",
475
+ "tip": "Như âm 'i' trong tiếng Việt nhưng dài hơn",
476
+ "category": "vowel",
477
+ "difficulty": "easy",
478
+ },
479
+ "u": {
480
+ "desc": "High back rounded vowel",
481
+ "word": "food",
482
+ "tip": "Như âm 'u' trong tiếng Việt nhưng dài hơn",
483
+ "category": "vowel",
484
+ "difficulty": "easy",
485
+ },
486
+ "ɑ": {
487
+ "desc": "Low back unrounded vowel",
488
+ "word": "father",
489
+ "tip": "Mở miệng rộng, âm 'a' sâu",
490
+ "category": "vowel",
491
+ "difficulty": "easy",
492
+ },
493
+ "ɛ": {
494
+ "desc": "Mid front unrounded vowel",
495
+ "word": "bed",
496
+ "tip": "Giống âm 'e' trong 'đẹp'",
497
+ "category": "vowel",
498
+ "difficulty": "easy",
499
+ },
500
+ "ɔ": {
501
+ "desc": "Mid back rounded vowel",
502
+ "word": "saw",
503
+ "tip": "Âm 'o' tròn môi",
504
+ "category": "vowel",
505
+ "difficulty": "easy",
506
+ },
507
+ # Vowels - Medium
508
+ "ɪ": {
509
+ "desc": "Near-close near-front unrounded vowel",
510
+ "word": "sit",
511
+ "tip": "Âm 'i' ngắn, không kéo dài",
512
+ "category": "vowel",
513
+ "difficulty": "medium",
514
+ },
515
+ "ʊ": {
516
+ "desc": "Near-close near-back rounded vowel",
517
+ "word": "put",
518
+ "tip": "Âm 'u' ngắn, tròn môi nhẹ",
519
+ "category": "vowel",
520
+ "difficulty": "medium",
521
+ },
522
+ "ʌ": {
523
+ "desc": "Mid central unrounded vowel",
524
+ "word": "cup",
525
+ "tip": "Âm 'ơ' nhưng mở miệng hơn",
526
+ "category": "vowel",
527
+ "difficulty": "medium",
528
+ },
529
+ "æ": {
530
+ "desc": "Near-open front unrounded vowel",
531
+ "word": "cat",
532
+ "tip": "Mở miệng rộng, âm 'a' phẳng",
533
+ "category": "vowel",
534
+ "difficulty": "medium",
535
+ },
536
+ "ə": {
537
+ "desc": "Schwa - mid central vowel",
538
+ "word": "about",
539
+ "tip": "Âm yếu 'ơ', thư giãn cơ miệng",
540
+ "category": "vowel",
541
+ "difficulty": "medium",
542
+ },
543
+ # Diphthongs
544
+ "eɪ": {
545
+ "desc": "Diphthong from e to i",
546
+ "word": "say",
547
+ "tip": "Từ 'e' trượt lên 'i'",
548
+ "category": "diphthong",
549
+ "difficulty": "medium",
550
+ },
551
+ "aɪ": {
552
+ "desc": "Diphthong from a to i",
553
+ "word": "my",
554
+ "tip": "Từ 'a' trượt lên 'i'",
555
+ "category": "diphthong",
556
+ "difficulty": "medium",
557
+ },
558
+ "ɔɪ": {
559
+ "desc": "Diphthong from o to i",
560
+ "word": "boy",
561
+ "tip": "Từ 'o' trượt lên 'i'",
562
+ "category": "diphthong",
563
+ "difficulty": "medium",
564
+ },
565
+ "aʊ": {
566
+ "desc": "Diphthong from a to u",
567
+ "word": "how",
568
+ "tip": "Từ 'a' trượt lên 'u'",
569
+ "category": "diphthong",
570
+ "difficulty": "medium",
571
+ },
572
+ "oʊ": {
573
+ "desc": "Diphthong from o to u",
574
+ "word": "go",
575
+ "tip": "Từ 'o' trượt lên 'u'",
576
+ "category": "diphthong",
577
+ "difficulty": "medium",
578
+ },
579
+ # Consonants - Easy
580
+ "p": {
581
+ "desc": "Voiceless bilabial plosive",
582
+ "word": "pen",
583
+ "tip": "Âm 'p' không thở ra",
584
+ "category": "consonant",
585
+ "difficulty": "easy",
586
+ },
587
+ "b": {
588
+ "desc": "Voiced bilabial plosive",
589
+ "word": "boy",
590
+ "tip": "Âm 'b' có rung dây thanh",
591
+ "category": "consonant",
592
+ "difficulty": "easy",
593
+ },
594
+ "t": {
595
+ "desc": "Voiceless alveolar plosive",
596
+ "word": "top",
597
+ "tip": "Âm 't' lưỡi chạm nướu",
598
+ "category": "consonant",
599
+ "difficulty": "easy",
600
+ },
601
+ "d": {
602
+ "desc": "Voiced alveolar plosive",
603
+ "word": "dog",
604
+ "tip": "Âm 'd' có rung d��y thanh",
605
+ "category": "consonant",
606
+ "difficulty": "easy",
607
+ },
608
+ "k": {
609
+ "desc": "Voiceless velar plosive",
610
+ "word": "cat",
611
+ "tip": "Âm 'k' cuống họng",
612
+ "category": "consonant",
613
+ "difficulty": "easy",
614
+ },
615
+ "g": {
616
+ "desc": "Voiced velar plosive",
617
+ "word": "go",
618
+ "tip": "Âm 'g' có rung dây thanh",
619
+ "category": "consonant",
620
+ "difficulty": "easy",
621
+ },
622
+ "m": {
623
+ "desc": "Bilabial nasal",
624
+ "word": "man",
625
+ "tip": "Âm 'm' qua mũi",
626
+ "category": "consonant",
627
+ "difficulty": "easy",
628
+ },
629
+ "n": {
630
+ "desc": "Alveolar nasal",
631
+ "word": "no",
632
+ "tip": "Âm 'n' lưỡi chạm nướu",
633
+ "category": "consonant",
634
+ "difficulty": "easy",
635
+ },
636
+ "s": {
637
+ "desc": "Voiceless alveolar fricative",
638
+ "word": "see",
639
+ "tip": "Âm 's' rít",
640
+ "category": "consonant",
641
+ "difficulty": "easy",
642
+ },
643
+ "f": {
644
+ "desc": "Voiceless labiodental fricative",
645
+ "word": "fish",
646
+ "tip": "Môi dưới chạm răng trên",
647
+ "category": "consonant",
648
+ "difficulty": "easy",
649
+ },
650
+ # Consonants - Medium
651
+ "ʃ": {
652
+ "desc": "Voiceless postalveolar fricative",
653
+ "word": "ship",
654
+ "tip": "Âm 'sh', lưỡi cong",
655
+ "category": "consonant",
656
+ "difficulty": "medium",
657
+ },
658
+ "ʒ": {
659
+ "desc": "Voiced postalveolar fricative",
660
+ "word": "measure",
661
+ "tip": "Như 'ʃ' nhưng có rung dây thanh",
662
+ "category": "consonant",
663
+ "difficulty": "medium",
664
+ },
665
+ "tʃ": {
666
+ "desc": "Voiceless postalveolar affricate",
667
+ "word": "chair",
668
+ "tip": "Âm 'ch', từ 't' + 'ʃ'",
669
+ "category": "consonant",
670
+ "difficulty": "medium",
671
+ },
672
+ "dʒ": {
673
+ "desc": "Voiced postalveolar affricate",
674
+ "word": "job",
675
+ "tip": "Từ 'd' + 'ʒ'",
676
+ "category": "consonant",
677
+ "difficulty": "medium",
678
+ },
679
+ "l": {
680
+ "desc": "Lateral approximant",
681
+ "word": "love",
682
+ "tip": "Lưỡi chạm nướu, âm thoát hai bên",
683
+ "category": "consonant",
684
+ "difficulty": "medium",
685
+ },
686
+ "r": {
687
+ "desc": "Approximant",
688
+ "word": "red",
689
+ "tip": "Cuộn lưỡi nhẹ, không chạm vòm",
690
+ "category": "consonant",
691
+ "difficulty": "medium",
692
+ },
693
+ "j": {
694
+ "desc": "Palatal approximant",
695
+ "word": "yes",
696
+ "tip": "Âm 'y', lưỡi gần vòm miệng",
697
+ "category": "consonant",
698
+ "difficulty": "medium",
699
+ },
700
+ "w": {
701
+ "desc": "Labial-velar approximant",
702
+ "word": "we",
703
+ "tip": "Tròn môi như 'u', không dùng răng",
704
+ "category": "consonant",
705
+ "difficulty": "medium",
706
+ },
707
+ "h": {
708
+ "desc": "Glottal fricative",
709
+ "word": "house",
710
+ "tip": "Thở ra nhẹ từ họng",
711
+ "category": "consonant",
712
+ "difficulty": "medium",
713
+ },
714
+ "z": {
715
+ "desc": "Voiced alveolar fricative",
716
+ "word": "zoo",
717
+ "tip": "Như 's' nhưng có rung dây thanh",
718
+ "category": "consonant",
719
+ "difficulty": "medium",
720
+ },
721
+ # Consonants - Hard (for Vietnamese speakers)
722
+ "θ": {
723
+ "desc": "Voiceless dental fricative",
724
+ "word": "think",
725
+ "tip": "Lưỡi giữa răng, thổi nhẹ",
726
+ "category": "consonant",
727
+ "difficulty": "hard",
728
+ },
729
+ "ð": {
730
+ "desc": "Voiced dental fricative",
731
+ "word": "this",
732
+ "tip": "Lưỡi giữa răng, rung dây thanh",
733
+ "category": "consonant",
734
+ "difficulty": "hard",
735
+ },
736
+ "v": {
737
+ "desc": "Voiced labiodental fricative",
738
+ "word": "very",
739
+ "tip": "Môi dưới chạm răng trên, rung dây thanh",
740
+ "category": "consonant",
741
+ "difficulty": "hard",
742
+ },
743
+ "ŋ": {
744
+ "desc": "Velar nasal",
745
+ "word": "sing",
746
+ "tip": "Âm 'ng' cuối từ",
747
+ "category": "consonant",
748
+ "difficulty": "hard",
749
+ },
750
+ }
751
+
752
+ # Sample word database for each difficulty level
753
+ SAMPLE_WORDS = {
754
+ "easy": [
755
+ {
756
+ "word": "cat",
757
+ "ipa": "/kæt/",
758
+ "meaning": "con mèo",
759
+ "sentence": "The cat is sleeping.",
760
+ },
761
+ {
762
+ "word": "dog",
763
+ "ipa": "/dɔg/",
764
+ "meaning": "con chó",
765
+ "sentence": "I love my dog.",
766
+ },
767
+ {
768
+ "word": "man",
769
+ "ipa": "/mæn/",
770
+ "meaning": "người đàn ông",
771
+ "sentence": "The man is tall.",
772
+ },
773
+ {
774
+ "word": "pen",
775
+ "ipa": "/pɛn/",
776
+ "meaning": "cái bút",
777
+ "sentence": "I need a pen.",
778
+ },
779
+ {
780
+ "word": "sun",
781
+ "ipa": "/sʌn/",
782
+ "meaning": "mặt trời",
783
+ "sentence": "The sun is bright.",
784
+ },
785
+ {
786
+ "word": "fish",
787
+ "ipa": "/fɪʃ/",
788
+ "meaning": "con cá",
789
+ "sentence": "Fish live in water.",
790
+ },
791
+ {
792
+ "word": "book",
793
+ "ipa": "/bʊk/",
794
+ "meaning": "quyển sách",
795
+ "sentence": "I read a book.",
796
+ },
797
+ {
798
+ "word": "food",
799
+ "ipa": "/fud/",
800
+ "meaning": "thức ăn",
801
+ "sentence": "I like good food.",
802
+ },
803
+ {
804
+ "word": "see",
805
+ "ipa": "/si/",
806
+ "meaning": "nhìn thấy",
807
+ "sentence": "I can see you.",
808
+ },
809
+ {
810
+ "word": "bed",
811
+ "ipa": "/bɛd/",
812
+ "meaning": "giường",
813
+ "sentence": "I sleep in my bed.",
814
+ },
815
+ ],
816
+ "medium": [
817
+ {
818
+ "word": "water",
819
+ "ipa": "/ˈwɔtər/",
820
+ "meaning": "nước",
821
+ "sentence": "I drink water every day.",
822
+ },
823
+ {
824
+ "word": "chair",
825
+ "ipa": "/tʃɛr/",
826
+ "meaning": "cái ghế",
827
+ "sentence": "Please sit on the chair.",
828
+ },
829
+ {
830
+ "word": "school",
831
+ "ipa": "/skul/",
832
+ "meaning": "trường học",
833
+ "sentence": "Children go to school.",
834
+ },
835
+ {
836
+ "word": "mother",
837
+ "ipa": "/ˈmʌðər/",
838
+ "meaning": "mẹ",
839
+ "sentence": "My mother is kind.",
840
+ },
841
+ {
842
+ "word": "house",
843
+ "ipa": "/haʊs/",
844
+ "meaning": "ngôi nhà",
845
+ "sentence": "I live in a big house.",
846
+ },
847
+ {
848
+ "word": "yellow",
849
+ "ipa": "/ˈjɛloʊ/",
850
+ "meaning": "màu vàng",
851
+ "sentence": "The sun is yellow.",
852
+ },
853
+ {
854
+ "word": "measure",
855
+ "ipa": "/ˈmɛʒər/",
856
+ "meaning": "đo lường",
857
+ "sentence": "Please measure the length.",
858
+ },
859
+ {
860
+ "word": "pleasure",
861
+ "ipa": "/ˈplɛʒər/",
862
+ "meaning": "niềm vui",
863
+ "sentence": "It's a pleasure to meet you.",
864
+ },
865
+ {
866
+ "word": "about",
867
+ "ipa": "/əˈbaʊt/",
868
+ "meaning": "về",
869
+ "sentence": "Tell me about your day.",
870
+ },
871
+ {
872
+ "word": "family",
873
+ "ipa": "/ˈfæməli/",
874
+ "meaning": "gia đình",
875
+ "sentence": "I love my family.",
876
+ },
877
+ ],
878
+ "hard": [
879
+ {
880
+ "word": "think",
881
+ "ipa": "/θɪŋk/",
882
+ "meaning": "suy nghĩ",
883
+ "sentence": "I think you are right.",
884
+ },
885
+ {
886
+ "word": "this",
887
+ "ipa": "/ðɪs/",
888
+ "meaning": "cái này",
889
+ "sentence": "This is my book.",
890
+ },
891
+ {
892
+ "word": "very",
893
+ "ipa": "/ˈvɛri/",
894
+ "meaning": "rất",
895
+ "sentence": "You are very smart.",
896
+ },
897
+ {
898
+ "word": "through",
899
+ "ipa": "/θru/",
900
+ "meaning": "qua",
901
+ "sentence": "Walk through the door.",
902
+ },
903
+ {
904
+ "word": "weather",
905
+ "ipa": "/ˈwɛðər/",
906
+ "meaning": "thời tiết",
907
+ "sentence": "The weather is nice.",
908
+ },
909
+ {
910
+ "word": "voice",
911
+ "ipa": "/vɔɪs/",
912
+ "meaning": "giọng nói",
913
+ "sentence": "She has a beautiful voice.",
914
+ },
915
+ {
916
+ "word": "clothes",
917
+ "ipa": "/kloʊðz/",
918
+ "meaning": "quần áo",
919
+ "sentence": "I need new clothes.",
920
+ },
921
+ {
922
+ "word": "breathe",
923
+ "ipa": "/brið/",
924
+ "meaning": "thở",
925
+ "sentence": "Breathe slowly and deeply.",
926
+ },
927
+ {
928
+ "word": "although",
929
+ "ipa": "/ɔlˈðoʊ/",
930
+ "meaning": "mặc dù",
931
+ "sentence": "Although it's cold, I'm happy.",
932
+ },
933
+ {
934
+ "word": "rhythm",
935
+ "ipa": "/ˈrɪðəm/",
936
+ "meaning": "nhịp điệu",
937
+ "sentence": "Music has a good rhythm.",
938
+ },
939
+ ],
940
+ }
941
+
942
+
943
+ @router.get("/symbols", response_model=List[IPASymbol])
944
+ async def get_ipa_symbols(
945
+ category: Optional[str] = Query(
946
+ None, description="Filter by category: vowel, consonant, diphthong"
947
+ )
948
+ ):
949
+ """Get all IPA symbols with Vietnamese tips and character mappings"""
950
+ try:
951
+ symbols = []
952
+ for symbol, data in IPA_SYMBOLS_DATA.items():
953
+ if category and data["category"] != category:
954
+ continue
955
+
956
+ # Get character mapping for the example word
957
+ character_mapping = map_ipa_to_characters(data["word"], symbol)
958
+
959
+ symbols.append(
960
+ IPASymbol(
961
+ symbol=symbol,
962
+ description=data["desc"],
963
+ example_word=data["word"],
964
+ category=data["category"],
965
+ difficulty_level=data["difficulty"],
966
+ vietnamese_tip=data["tip"],
967
+ character_mapping=character_mapping,
968
+ )
969
+ )
970
+
971
+ # Sort by difficulty and then by symbol
972
+ difficulty_order = {"easy": 0, "medium": 1, "hard": 2}
973
+ symbols.sort(key=lambda x: (difficulty_order[x.difficulty_level], x.symbol))
974
+
975
+ return symbols
976
+ except Exception as e:
977
+ logger.error(f"Error getting IPA symbols: {e}")
978
+ raise HTTPException(status_code=500, detail=str(e))
979
+
980
+
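
Client-side, with the /api prefix from create_app.py, this endpoint resolves to /api/ipa/symbols. A sketch using httpx (host and port are assumptions):

    import httpx

    resp = httpx.get(
        "http://localhost:8000/api/ipa/symbols",
        params={"category": "vowel"},  # vowel | consonant | diphthong
    )
    resp.raise_for_status()
    for sym in resp.json():
        print(sym["symbol"], sym["example_word"], sym["vietnamese_tip"])
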
981
+ @router.get("/lessons", response_model=List[IPALesson])
982
+ async def get_ipa_lessons():
983
+ """Get structured IPA lessons for progressive learning"""
984
+ try:
985
+ lessons = [
986
+ {
987
+ "id": "vowels_basic",
988
+ "title": "Nguyên âm cơ bản (Basic Vowels)",
989
+ "description": "Học các nguyên âm đơn giản nhất trong tiếng Anh",
990
+ "symbols": [
991
+ s
992
+ for s in IPA_SYMBOLS_DATA.keys()
993
+ if IPA_SYMBOLS_DATA[s]["category"] == "vowel"
994
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "easy"
995
+ ],
996
+ "difficulty": "easy",
997
+ "estimated_time": 15,
998
+ },
999
+ {
1000
+ "id": "consonants_basic",
1001
+ "title": "Phụ âm cơ bản (Basic Consonants)",
1002
+ "description": "Các phụ âm dễ phát âm cho người Việt",
1003
+ "symbols": [
1004
+ s
1005
+ for s in IPA_SYMBOLS_DATA.keys()
1006
+ if IPA_SYMBOLS_DATA[s]["category"] == "consonant"
1007
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "easy"
1008
+ ],
1009
+ "difficulty": "easy",
1010
+ "estimated_time": 20,
1011
+ },
1012
+ {
1013
+ "id": "vowels_intermediate",
1014
+ "title": "Nguyên âm nâng cao (Intermediate Vowels)",
1015
+ "description": "Các nguyên âm khó hơn, cần luyện tập kỹ",
1016
+ "symbols": [
1017
+ s
1018
+ for s in IPA_SYMBOLS_DATA.keys()
1019
+ if IPA_SYMBOLS_DATA[s]["category"] == "vowel"
1020
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "medium"
1021
+ ],
1022
+ "difficulty": "medium",
1023
+ "estimated_time": 25,
1024
+ },
1025
+ {
1026
+ "id": "diphthongs",
1027
+ "title": "Nguyên âm đôi (Diphthongs)",
1028
+ "description": "Học cách phát âm nguyên âm đôi tự nhiên",
1029
+ "symbols": [
1030
+ s
1031
+ for s in IPA_SYMBOLS_DATA.keys()
1032
+ if IPA_SYMBOLS_DATA[s]["category"] == "diphthong"
1033
+ ],
1034
+ "difficulty": "medium",
1035
+ "estimated_time": 20,
1036
+ },
1037
+ {
1038
+ "id": "consonants_intermediate",
1039
+ "title": "Phụ âm trung cấp (Intermediate Consonants)",
1040
+ "description": "Các phụ âm cần luyện tập cho người Việt",
1041
+ "symbols": [
1042
+ s
1043
+ for s in IPA_SYMBOLS_DATA.keys()
1044
+ if IPA_SYMBOLS_DATA[s]["category"] == "consonant"
1045
+ and IPA_SYMBOLS_DATA[s]["difficulty"] == "medium"
1046
+ ],
1047
+ "difficulty": "medium",
1048
+ "estimated_time": 30,
1049
+ },
1050
+ {
1051
+ "id": "difficult_sounds",
1052
+ "title": "Âm khó (Difficult Sounds)",
1053
+ "description": "Những âm khó nhất cho người Việt: th, v, z",
1054
+ "symbols": [
1055
+ s
1056
+ for s in IPA_SYMBOLS_DATA.keys()
1057
+ if IPA_SYMBOLS_DATA[s]["difficulty"] == "hard"
1058
+ ],
1059
+ "difficulty": "hard",
1060
+ "estimated_time": 40,
1061
+ },
1062
+ ]
1063
+
1064
+ # Convert to proper lesson objects
1065
+ lesson_objects = []
1066
+ for lesson in lessons:
1067
+ symbol_objects = []
1068
+ for symbol_key in lesson["symbols"]:
1069
+ data = IPA_SYMBOLS_DATA[symbol_key]
1070
+ # Get character mapping for the example word
1071
+ character_mapping = map_ipa_to_characters(data["word"], symbol_key)
1072
+
1073
+ symbol_objects.append(
1074
+ IPASymbol(
1075
+ symbol=symbol_key,
1076
+ description=data["desc"],
1077
+ example_word=data["word"],
1078
+ category=data["category"],
1079
+ difficulty_level=data["difficulty"],
1080
+ vietnamese_tip=data["tip"],
1081
+ character_mapping=character_mapping,
1082
+ )
1083
+ )
1084
+
1085
+ lesson_objects.append(
1086
+ IPALesson(
1087
+ id=lesson["id"],
1088
+ title=lesson["title"],
1089
+ description=lesson["description"],
1090
+ symbols=symbol_objects,
1091
+ difficulty=lesson["difficulty"],
1092
+ estimated_time=lesson["estimated_time"],
1093
+ )
1094
+ )
1095
+
1096
+ return lesson_objects
1097
+ except Exception as e:
1098
+ logger.error(f"Error getting IPA lessons: {e}")
1099
+ raise HTTPException(status_code=500, detail=str(e))
1100
+
1101
+
1102
+ @router.get("/words", response_model=List[IPAWord])
1103
+ async def get_practice_words(
1104
+ difficulty: str = Query("easy", description="Difficulty level: easy, medium, hard")
1105
+ ):
1106
+ """Get practice words with IPA transcription and character mappings"""
1107
+ try:
1108
+ if difficulty not in ["easy", "medium", "hard"]:
1109
+ difficulty = "easy"
1110
+
1111
+ words_data = SAMPLE_WORDS.get(difficulty, SAMPLE_WORDS["easy"])
1112
+
1113
+ words = []
1114
+ for word_data in words_data:
1115
+ # Get phonemes using G2P
1116
+ try:
1117
+ phoneme_data = g2p.text_to_phonemes(word_data["word"])[0]
1118
+ phonemes = phoneme_data["phonemes"]
1119
+ except Exception:
1120
+ # Fallback to simple conversion
1121
+ phonemes = list(word_data["word"].lower())
1122
+
1123
+ # Calculate difficulty
1124
+ difficulty_score = 0.0
1125
+ for phoneme in phonemes:
1126
+ difficulty_score += g2p.get_difficulty_score(phoneme)
1127
+ avg_difficulty = difficulty_score / len(phonemes) if phonemes else 0.3
1128
+
1129
+ word_difficulty = (
1130
+ "hard"
1131
+ if avg_difficulty > 0.6
1132
+ else "medium" if avg_difficulty > 0.4 else "easy"
1133
+ )
1134
+
1135
+ # Get character mapping for the word
1136
+ character_mapping = map_word_to_phonemes(
1137
+ word_data["word"], word_data["ipa"]
1138
+ )
1139
+
1140
+ words.append(
1141
+ IPAWord(
1142
+ word=word_data["word"],
1143
+ ipa=word_data["ipa"],
1144
+ phonemes=phonemes,
1145
+ difficulty=word_difficulty,
1146
+ meaning=word_data["meaning"],
1147
+ example_sentence=word_data["sentence"],
1148
+ character_mapping=character_mapping,
1149
+ )
1150
+ )
1151
+
1152
+ return words
1153
+ except Exception as e:
1154
+ logger.error(f"Error getting practice words: {e}")
1155
+ raise HTTPException(status_code=500, detail=str(e))
1156
+
1157
+
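
The bucketing above averages per-phoneme difficulty and cuts at 0.6 and 0.4. A worked toy example (the scores are made up, standing in for g2p.get_difficulty_score):

    scores = [0.7, 0.2, 0.5]          # e.g. one hard, one easy, one middling phoneme
    avg = sum(scores) / len(scores)   # ~0.467
    bucket = "hard" if avg > 0.6 else "medium" if avg > 0.4 else "easy"
    assert bucket == "medium"
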
1158
+ @router.get("/exercises", response_model=List[IPAExercise])
1159
+ async def get_ipa_exercises(
1160
+ count: int = Query(5, ge=1, le=20), difficulty: str = Query("mixed")
1161
+ ):
1162
+ """Generate random IPA pronunciation exercises"""
1163
+ try:
1164
+ exercises = []
1165
+
1166
+ # Select words based on difficulty
1167
+ if difficulty == "mixed":
1168
+ all_words = []
1169
+ for level in SAMPLE_WORDS.values():
1170
+ all_words.extend(level)
1171
+ selected_words = random.sample(all_words, min(count, len(all_words)))
1172
+ else:
1173
+ if difficulty not in SAMPLE_WORDS:
1174
+ difficulty = "easy"
1175
+ word_pool = SAMPLE_WORDS[difficulty]
1176
+ selected_words = random.sample(word_pool, min(count, len(word_pool)))
1177
+
1178
+ for word_data in selected_words:
1179
+ # Get phonemes
1180
+ try:
1181
+ phoneme_data = g2p.text_to_phonemes(word_data["word"])[0]
1182
+ phonemes = phoneme_data["phonemes"]
1183
+ except Exception:
1184
+ phonemes = list(word_data["word"].lower())
1185
+
1186
+ # Generate hints
1187
+ hints = [
1188
+ f"Nghĩa: {word_data['meaning']}",
1189
+ f"Ví dụ: {word_data['sentence']}",
1190
+ f"Số âm tiết: {len(phonemes)}",
1191
+ ]
1192
+
1193
+ # Add specific pronunciation hints for difficult sounds
1194
+ difficult_sounds = []
1195
+ for phoneme in phonemes:
1196
+ if phoneme in ["θ", "ð", "v", "z", "ʒ", "r", "w"]:
1197
+ difficult_sounds.append(phoneme)
1198
+
1199
+ if difficult_sounds:
1200
+ for sound in difficult_sounds:
1201
+ if sound in IPA_SYMBOLS_DATA:
1202
+ hints.append(f"Âm /{sound}/: {IPA_SYMBOLS_DATA[sound]['tip']}")
1203
+
1204
+ exercises.append(
1205
+ IPAExercise(
1206
+ word=word_data["word"],
1207
+ ipa=word_data["ipa"],
1208
+ phonemes=phonemes,
1209
+ hints=hints,
1210
+ difficulty=difficulty if difficulty != "mixed" else "easy",
1211
+ )
1212
+ )
1213
+
1214
+ return exercises
1215
+ except Exception as e:
1216
+ logger.error(f"Error generating IPA exercises: {e}")
1217
+ raise HTTPException(status_code=500, detail=str(e))
1218
+
1219
+
1220
+ @router.get("/symbol/{symbol}")
1221
+ async def get_symbol_details(symbol: str):
1222
+ """Get detailed information about a specific IPA symbol"""
1223
+ try:
1224
+ if symbol not in IPA_SYMBOLS_DATA:
1225
+ raise HTTPException(
1226
+ status_code=404, detail=f"IPA symbol '{symbol}' not found"
1227
+ )
1228
+
1229
+ data = IPA_SYMBOLS_DATA[symbol]
1230
+
1231
+ # Find words containing this symbol
1232
+ example_words = []
1233
+ for difficulty_level, words in SAMPLE_WORDS.items():
1234
+ for word_data in words:
1235
+ if symbol in word_data["ipa"]:
1236
+ example_words.append(
1237
+ {
1238
+ "word": word_data["word"],
1239
+ "ipa": word_data["ipa"],
1240
+ "meaning": word_data["meaning"],
1241
+ "difficulty": difficulty_level,
1242
+ }
1243
+ )
1244
+ if len(example_words) >= 5: # Limit to 5 examples
1245
+ break
1246
+ if len(example_words) >= 5:
1247
+ break
1248
+
1249
+ return {
1250
+ "symbol": symbol,
1251
+ "description": data["desc"],
1252
+ "example_word": data["word"],
1253
+ "category": data["category"],
1254
+ "difficulty_level": data["difficulty"],
1255
+ "vietnamese_tip": data["tip"],
1256
+ "difficulty_score": g2p.get_difficulty_score(symbol),
1257
+ "example_words": example_words,
1258
+ "practice_tips": _get_practice_tips(symbol),
1259
+ }
1260
+ except HTTPException:
1261
+ raise
1262
+ except Exception as e:
1263
+ logger.error(f"Error getting symbol details: {e}")
1264
+ raise HTTPException(status_code=500, detail=str(e))
1265
+
1266
+
1267
+ def _get_practice_tips(symbol: str) -> List[str]:
1268
+ """Get specific practice tips for a symbol"""
1269
+ tips_map = {
1270
+ "θ": [
1271
+ "Đặt đầu lưỡi giữa răng trên và răng dưới",
1272
+ "Thổi khí nhẹ qua kẽ răng",
1273
+ "Không rung dây thanh âm",
1274
+ "Luyện với từ: think, three, thank",
1275
+ ],
1276
+ "ð": [
1277
+ "Vị trí lưỡi giống như âm θ",
1278
+ "Nhưng phải rung dây thanh âm",
1279
+ "Cảm nhận rung động ở cổ họng",
1280
+ "Luyện với từ: this, that, brother",
1281
+ ],
1282
+ "v": [
1283
+ "Môi dưới chạm vào răng trên",
1284
+ "Không dùng cả hai môi như tiếng Việt",
1285
+ "Rung dây thanh âm",
1286
+ "Luyện với từ: very, voice, love",
1287
+ ],
1288
+ "r": [
1289
+ "Cuộn lưỡi nhẹ nhàng",
1290
+ "Không để lưỡi chạm vào vòm miệng",
1291
+ "Không lăn lưỡi như tiếng Việt",
1292
+ "Luyện với từ: red, run, car",
1293
+ ],
1294
+ "w": [
1295
+ "Tròn môi như phát âm 'u'",
1296
+ "Không dùng răng như âm 'v'",
1297
+ "Môi tròn rồi mở ra nhanh",
1298
+ "Luyện với từ: we, water, window",
1299
+ ],
1300
+ }
1301
+
1302
+ return tips_map.get(
1303
+ symbol,
1304
+ [
1305
+ f"Luyện phát âm âm /{symbol}/ thường xuyên",
1306
+ "Nghe và bắt chước người bản ngữ",
1307
+ "Tập trung vào vị trí lưỡi và môi",
1308
+ "Luyện tập với từ đơn giản trước",
1309
+ ],
1310
+ )
1311
+
1312
+
1313
+ @router.get("/word-analysis/{word}")
1314
+ async def get_word_analysis(word: str):
1315
+ """Get comprehensive analysis of a word for IPA learning"""
1316
+ try:
1317
+ # Get phoneme data
1318
+ phoneme_data = g2p.text_to_phonemes(word)[0]
1319
+
1320
+ # Calculate difficulty
1321
+ difficulty_scores = [
1322
+ g2p.get_difficulty_score(p) for p in phoneme_data["phonemes"]
1323
+ ]
1324
+ avg_difficulty = (
1325
+ sum(difficulty_scores) / len(difficulty_scores)
1326
+ if difficulty_scores
1327
+ else 0.3
1328
+ )
1329
+
1330
+ word_difficulty = (
1331
+ "hard"
1332
+ if avg_difficulty > 0.6
1333
+ else "medium" if avg_difficulty > 0.4 else "easy"
1334
+ )
1335
+
1336
+ # Get detailed phoneme analysis
1337
+ phoneme_analysis = []
1338
+ for i, phoneme in enumerate(phoneme_data["phonemes"]):
1339
+ difficulty_score = g2p.get_difficulty_score(phoneme)
1340
+
1341
+ analysis = {
1342
+ "phoneme": phoneme,
1343
+ "position": i,
1344
+ "difficulty_score": difficulty_score,
1345
+ "difficulty_level": (
1346
+ "hard"
1347
+ if difficulty_score > 0.6
1348
+ else "medium" if difficulty_score > 0.4 else "easy"
1349
+ ),
1350
+ "category": IPA_SYMBOLS_DATA.get(phoneme, {}).get(
1351
+ "category", "unknown"
1352
+ ),
1353
+ "vietnamese_tip": IPA_SYMBOLS_DATA.get(phoneme, {}).get(
1354
+ "tip", f"Luyện âm {phoneme}"
1355
+ ),
1356
+ "practice_tips": _get_practice_tips(phoneme),
1357
+ }
1358
+ phoneme_analysis.append(analysis)
1359
+
1360
+ # Find similar words for practice
1361
+ similar_words = []
1362
+ for difficulty_level, words in SAMPLE_WORDS.items():
1363
+ for word_data in words:
1364
+ if word_data["word"] != word:
1365
+ # Check if shares difficult phonemes
1366
+ word_phonemes = g2p.text_to_phonemes(word_data["word"])[0][
1367
+ "phonemes"
1368
+ ]
1369
+ shared_difficult = [
1370
+ p
1371
+ for p in phoneme_data["phonemes"]
1372
+ if p in word_phonemes and g2p.get_difficulty_score(p) > 0.5
1373
+ ]
1374
+ if shared_difficult:
1375
+ similar_words.append(
1376
+ {
1377
+ "word": word_data["word"],
1378
+ "ipa": word_data["ipa"],
1379
+ "meaning": word_data["meaning"],
1380
+ "shared_sounds": shared_difficult,
1381
+ "difficulty": difficulty_level,
1382
+ }
1383
+ )
1384
+ if len(similar_words) >= 5:
1385
+ break
1386
+ if len(similar_words) >= 5:
1387
+ break
1388
+
1389
+ return {
1390
+ "word": word,
1391
+ "ipa": phoneme_data["ipa"],
1392
+ "phonemes": phoneme_data["phonemes"],
1393
+ "phoneme_string": phoneme_data["phoneme_string"],
1394
+ "difficulty": word_difficulty,
1395
+ "difficulty_score": avg_difficulty,
1396
+ "phoneme_analysis": phoneme_analysis,
1397
+ "similar_words": similar_words,
1398
+ "practice_sequence": _generate_practice_sequence(phoneme_analysis),
1399
+ "common_mistakes": _get_common_mistakes(phoneme_data["phonemes"]),
1400
+ }
1401
+
1402
+ except Exception as e:
1403
+ logger.error(f"Error analyzing word '{word}': {e}")
1404
+ raise HTTPException(status_code=500, detail=str(e))
1405
+
1406
+
1407
+ def _generate_practice_sequence(phoneme_analysis: List[Dict]) -> List[Dict]:
1408
+ """Generate a practice sequence starting with easier sounds"""
1409
+ # Sort by difficulty
1410
+ sorted_phonemes = sorted(phoneme_analysis, key=lambda x: x["difficulty_score"])
1411
+
1412
+ sequence = []
1413
+ for phoneme_data in sorted_phonemes:
1414
+ step = {
1415
+ "step": len(sequence) + 1,
1416
+ "phoneme": phoneme_data["phoneme"],
1417
+ "focus": "Tập trung vào âm này",
1418
+ "tip": phoneme_data["vietnamese_tip"],
1419
+ "practice_words": _get_practice_words_for_phoneme(phoneme_data["phoneme"]),
1420
+ }
1421
+ sequence.append(step)
1422
+
1423
+ return sequence
1424
+
1425
+
1426
+ def _get_practice_words_for_phoneme(phoneme: str) -> List[str]:
1427
+ """Get simple words containing the phoneme"""
1428
+ practice_words = {
1429
+ "θ": ["think", "three", "month", "tooth"],
1430
+ "ð": ["this", "that", "mother", "brother"],
1431
+ "v": ["very", "voice", "love", "give"],
1432
+ "r": ["red", "run", "car", "tree"],
1433
+ "w": ["we", "water", "window", "want"],
1434
+ "z": ["zoo", "zero", "buzz", "pizza"],
1435
+ "ʒ": ["measure", "pleasure", "treasure", "vision"],
1436
+ "æ": ["cat", "hat", "man", "bad"],
1437
+ "ɪ": ["sit", "big", "win", "ship"],
1438
+ "ʊ": ["put", "look", "book", "good"],
1439
+ }
1440
+
1441
+ return practice_words.get(phoneme, [])
1442
+
1443
+
1444
+ def _get_common_mistakes(phonemes: List[str]) -> List[Dict]:
1445
+ """Get common pronunciation mistakes for Vietnamese speakers"""
1446
+ mistakes = []
1447
+
1448
+ common_mistakes_map = {
1449
+ "θ": {
1450
+ "mistake": "Phát âm thành 'f' hoặc 's'",
1451
+ "correction": "Đặt lưỡi giữa răng, thổi nhẹ",
1452
+ "examples": ["think → fink/sink (sai), think (đúng)"],
1453
+ },
1454
+ "ð": {
1455
+ "mistake": "Phát âm thành 'd' hoặc 'z'",
1456
+ "correction": "Lưỡi giữa răng + rung dây thanh",
1457
+ "examples": ["this → dis/zis (sai), this (đúng)"],
1458
+ },
1459
+ "v": {
1460
+ "mistake": "Phát âm thành 'w' hoặc 'b'",
1461
+ "correction": "Môi dưới chạm răng trên",
1462
+ "examples": ["very → wery/bery (sai), very (đúng)"],
1463
+ },
1464
+ "r": {
1465
+ "mistake": "Lăn lưỡi như tiếng Việt",
1466
+ "correction": "Cuộn lưỡi nhẹ, không chạm vòm",
1467
+ "examples": ["red → rrred (sai), red (đúng)"],
1468
+ },
1469
+ "w": {
1470
+ "mistake": "Phát âm thành 'v'",
1471
+ "correction": "Tròn môi, không dùng răng",
1472
+ "examples": ["we → ve (sai), we (đúng)"],
1473
+ },
1474
+ }
1475
+
1476
+ for phoneme in phonemes:
1477
+ if phoneme in common_mistakes_map:
1478
+ mistake_info = common_mistakes_map[phoneme]
1479
+ mistakes.append(
1480
+ {
1481
+ "phoneme": phoneme,
1482
+ "common_mistake": mistake_info["mistake"],
1483
+ "correction": mistake_info["correction"],
1484
+ "examples": mistake_info["examples"],
1485
+ }
1486
+ )
1487
+
1488
+ return mistakes
1489
+
1490
+
1491
+ @router.post("/assess-pronunciation")
1492
+ async def assess_ipa_pronunciation(
1493
+ audio_file: UploadFile = File(
1494
+ ..., description="Audio file for IPA pronunciation assessment"
1495
+ ),
1496
+ word: str = Form(..., description="Target word to assess"),
1497
+ target_ipa: str = Form(None, description="Target IPA transcription (optional)"),
1498
+ focus_phonemes: str = Form(
1499
+ None, description="Comma-separated list of phonemes to focus on (optional)"
1500
+ ),
1501
+ ):
1502
+ """
1503
+ Specialized IPA pronunciation assessment with detailed phoneme analysis
1504
+ Optimized for IPA learning with Vietnamese speaker feedback
1505
+ """
1506
+
1509
+
1510
+ try:
1511
+ # Get the global assessor instance (singleton)
1512
+ assessor = get_assessor()
1513
+
1514
+ # Save uploaded audio file
1515
+ file_extension = ".wav"
1516
+ if audio_file.filename and "." in audio_file.filename:
1517
+ file_extension = f".{audio_file.filename.split('.')[-1]}"
1518
+
1519
+ with tempfile.NamedTemporaryFile(
1520
+ delete=False, suffix=file_extension
1521
+ ) as tmp_file:
1522
+ content = await audio_file.read()
1523
+ tmp_file.write(content)
1524
+ tmp_file.flush()
1525
+
1526
+ # Run standard pronunciation assessment
1527
+ result = assessor.assess_pronunciation(tmp_file.name, word, "word")
1528
+
1529
+ # Get target IPA and phonemes
1530
+ if not target_ipa:
1531
+ target_phonemes_data = g2p.text_to_phonemes(word)[0]
1532
+ target_ipa = target_phonemes_data["ipa"]
1533
+ target_phonemes = target_phonemes_data["phonemes"]
1534
+ else:
1535
+ # Parse IPA to phonemes (simplified)
1536
+ target_phonemes = target_ipa.replace("/", "").split()
1537
+
1538
+ # Focus phonemes analysis
1539
+ focus_phonemes_list = []
1540
+ if focus_phonemes:
1541
+ focus_phonemes_list = [p.strip() for p in focus_phonemes.split(",")]
1542
+
1543
+ # Enhanced IPA-specific analysis
1544
+ ipa_analysis = {
1545
+ "target_word": word,
1546
+ "target_ipa": target_ipa,
1547
+ "target_phonemes": target_phonemes,
1548
+ "user_transcript": result.get("transcript", ""),
1549
+ "user_ipa": result.get("user_ipa", ""),
1550
+ "user_phonemes": result.get("user_phonemes", ""),
1551
+ "overall_score": result.get("overall_score", 0.0),
1552
+ "phoneme_accuracy": result.get("phoneme_comparison", {}).get(
1553
+ "accuracy_percentage", 0
1554
+ ),
1555
+ "focus_phonemes_analysis": [],
1556
+ "vietnamese_specific_tips": [],
1557
+ "practice_recommendations": [],
1558
+ }
1559
+
1560
+ # Focus phonemes detailed analysis
1561
+ if focus_phonemes_list and result.get("phoneme_differences"):
1562
+ for phoneme_diff in result["phoneme_differences"]:
1563
+ ref_phoneme = phoneme_diff.get("reference_phoneme", "")
1564
+ if ref_phoneme in focus_phonemes_list:
1565
+ analysis = {
1566
+ "phoneme": ref_phoneme,
1567
+ "status": phoneme_diff.get("status", "unknown"),
1568
+ "score": phoneme_diff.get("score", 0.0),
1569
+ "difficulty": g2p.get_difficulty_score(ref_phoneme),
1570
+ "vietnamese_tip": IPA_SYMBOLS_DATA.get(ref_phoneme, {}).get(
1571
+ "tip", ""
1572
+ ),
1573
+ "practice_tip": _get_practice_tips(ref_phoneme),
1574
+ }
1575
+ ipa_analysis["focus_phonemes_analysis"].append(analysis)
1576
+
1577
+ # Vietnamese-specific pronunciation tips
1578
+ all_target_phonemes = target_phonemes + focus_phonemes_list
1579
+ vietnamese_tips = []
1580
+
1581
+ for phoneme in set(all_target_phonemes):
1582
+ if phoneme in [
1583
+ "θ",
1584
+ "ð",
1585
+ "v",
1586
+ "z",
1587
+ "ʒ",
1588
+ "r",
1589
+ "w",
1590
+ "æ",
1591
+ "ɪ",
1592
+ "ʊ",
1593
+ ]: # Difficult for Vietnamese
1594
+ tip_data = IPA_SYMBOLS_DATA.get(phoneme, {})
1595
+ if tip_data:
1596
+ vietnamese_tips.append(
1597
+ {
1598
+ "phoneme": phoneme,
1599
+ "tip": tip_data.get("tip", ""),
1600
+ "difficulty": tip_data.get("difficulty", "medium"),
1601
+ "category": tip_data.get("category", "unknown"),
1602
+ }
1603
+ )
1604
+
1605
+ ipa_analysis["vietnamese_specific_tips"] = vietnamese_tips
1606
+
1607
+ # Practice recommendations based on score
1608
+ if result.get("overall_score", 0) < 0.7:
1609
+ recommendations = [
1610
+ "Nghe từ mẫu nhiều lần trước khi phát âm",
1611
+ "Phát âm chậm và rõ ràng từng âm vị",
1612
+ "Chú ý đến vị trí lưỡi và môi khi phát âm",
1613
+ ]
1614
+
1615
+ # Add specific recommendations for low-scoring phonemes
1616
+ if result.get("wrong_words"):
1617
+ for wrong_word in result["wrong_words"][
1618
+ :2
1619
+ ]: # Top 2 problematic words
1620
+ for wrong_phoneme in wrong_word.get("wrong_phonemes", [])[:2]:
1621
+ phoneme = wrong_phoneme.get("expected", "")
1622
+ if phoneme in IPA_SYMBOLS_DATA:
1623
+ recommendations.append(
1624
+ f"Luyện đặc biệt âm /{phoneme}/: {IPA_SYMBOLS_DATA[phoneme]['tip']}"
1625
+ )
1626
+
1627
+ ipa_analysis["practice_recommendations"] = recommendations
1628
+
1629
+ # Combine with original result
1630
+ enhanced_result = {
1631
+ **result, # Original assessment result
1632
+ "ipa_analysis": ipa_analysis, # IPA-specific analysis
1633
+ "assessment_type": "ipa_focused",
1634
+ "target_ipa": target_ipa,
1635
+ "focus_phonemes": focus_phonemes_list,
1636
+ }
1637
+
1638
+ # Clean up temp file
1639
+ os.unlink(tmp_file.name)
1640
+
1641
+ logger.info(
1642
+ f"IPA assessment completed for word '{word}' with score {result.get('overall_score', 0):.2f}"
1643
+ )
1644
+
1645
+ return enhanced_result
1646
+
1647
+ except Exception as e:
1648
+ logger.error(f"IPA pronunciation assessment error: {e}")
1649
+ raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}")
1650
+
1651
+
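
A matching client call for the endpoint above (a sketch; host, port, and file name are assumptions):

    import httpx

    with open("think.wav", "rb") as f:
        resp = httpx.post(
            "http://localhost:8000/api/ipa/assess-pronunciation",
            files={"audio_file": ("think.wav", f, "audio/wav")},
            data={"word": "think", "focus_phonemes": "θ"},
            timeout=60.0,
        )
    resp.raise_for_status()
    print(resp.json()["ipa_analysis"]["overall_score"])
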
1652
+ @router.get("/practice-session/{lesson_id}")
1653
+ async def create_ipa_practice_session(lesson_id: str):
1654
+ """Create a structured IPA practice session"""
1655
+ try:
1656
+ # This would typically fetch from a database
1657
+ # For now, we'll create a sample session based on lesson_id
1658
+
1659
+ if lesson_id == "vowels_basic":
1660
+ session_words = [
1661
+ {
1662
+ "word": "cat",
1663
+ "ipa": "/kæt/",
1664
+ "focus_phonemes": ["æ"],
1665
+ "mapping": map_word_to_phonemes("cat", "/kæt/"),
1666
+ },
1667
+ {
1668
+ "word": "bed",
1669
+ "ipa": "/bɛd/",
1670
+ "focus_phonemes": ["ɛ"],
1671
+ "mapping": map_word_to_phonemes("bed", "/bɛd/"),
1672
+ },
1673
+ {
1674
+ "word": "see",
1675
+ "ipa": "/si/",
1676
+ "focus_phonemes": ["i"],
1677
+ "mapping": map_word_to_phonemes("see", "/si/"),
1678
+ },
1679
+ {
1680
+ "word": "cup",
1681
+ "ipa": "/kʌp/",
1682
+ "focus_phonemes": ["ʌ"],
1683
+ "mapping": map_word_to_phonemes("cup", "/kʌp/"),
1684
+ },
1685
+ {
1686
+ "word": "book",
1687
+ "ipa": "/bʊk/",
1688
+ "focus_phonemes": ["ʊ"],
1689
+ "mapping": map_word_to_phonemes("book", "/bʊk/"),
1690
+ },
1691
+ ]
1692
+ elif lesson_id == "difficult_sounds":
1693
+ session_words = [
1694
+ {
1695
+ "word": "think",
1696
+ "ipa": "/θɪŋk/",
1697
+ "focus_phonemes": ["θ"],
1698
+ "mapping": map_word_to_phonemes("think", "/θɪŋk/"),
1699
+ },
1700
+ {
1701
+ "word": "this",
1702
+ "ipa": "/ðɪs/",
1703
+ "focus_phonemes": ["ð"],
1704
+ "mapping": map_word_to_phonemes("this", "/ðɪs/"),
1705
+ },
1706
+ {
1707
+ "word": "very",
1708
+ "ipa": "/ˈvɛri/",
1709
+ "focus_phonemes": ["v"],
1710
+ "mapping": map_word_to_phonemes("very", "/ˈvɛri/"),
1711
+ },
1712
+ {
1713
+ "word": "water",
1714
+ "ipa": "/ˈwɔtər/",
1715
+ "focus_phonemes": ["w"],
1716
+ "mapping": map_word_to_phonemes("water", "/ˈwɔtər/"),
1717
+ },
1718
+ {
1719
+ "word": "red",
1720
+ "ipa": "/rɛd/",
1721
+ "focus_phonemes": ["r"],
1722
+ "mapping": map_word_to_phonemes("red", "/rɛd/"),
1723
+ },
1724
+ ]
1725
+ else:
1726
+ # Default session
1727
+ session_words = [
1728
+ {
1729
+ "word": "hello",
1730
+ "ipa": "/həˈloʊ/",
1731
+ "focus_phonemes": ["ə", "oʊ"],
1732
+ "mapping": map_word_to_phonemes("hello", "/həˈloʊ/"),
1733
+ },
1734
+ {
1735
+ "word": "world",
1736
+ "ipa": "/wɜrld/",
1737
+ "focus_phonemes": ["w", "ɜr"],
1738
+ "mapping": map_word_to_phonemes("world", "/wɜrld/"),
1739
+ },
1740
+ {
1741
+ "word": "practice",
1742
+ "ipa": "/ˈpræktɪs/",
1743
+ "focus_phonemes": ["æ", "ɪ"],
1744
+ "mapping": map_word_to_phonemes("practice", "/ˈpræktɪs/"),
1745
+ },
1746
+ ]
1747
+
1748
+ return {
1749
+ "session_id": lesson_id,
1750
+ "title": f"IPA Practice Session: {lesson_id.replace('_', ' ').title()}",
1751
+ "words": session_words,
1752
+ "estimated_time": len(session_words) * 3, # 3 minutes per word
1753
+ "instructions": [
1754
+ "Nghe mẫu từng từ carefully",
1755
+ "Tập trung vào âm vị được highlight",
1756
+ "Ghi âm nhiều lần cho đến khi đạt điểm tốt",
1757
+ "Đọc feedback để cải thiện",
1758
+ ],
1759
+ }
1760
+
1761
+ except Exception as e:
1762
+ logger.error(f"Error creating practice session: {e}")
1763
+ raise HTTPException(status_code=500, detail=str(e))
src/apis/routes/speaking_route.py CHANGED
@@ -9,7 +9,7 @@ from loguru import logger
9
  from src.utils.speaking_utils import convert_numpy_types
10
 
11
  # Import the new evaluation system
12
- from evalution import ProductionPronunciationAssessor, EnhancedG2P
12
+ from src.apis.controllers.speaking_controller import ProductionPronunciationAssessor, EnhancedG2P
13
  warnings.filterwarnings("ignore")
14
 
15
  router = APIRouter(prefix="/speaking", tags=["Speaking"])
@@ -36,7 +36,16 @@ class PronunciationAssessmentResult(BaseModel):
36
  assessment_mode: Optional[str] = None
37
  character_level_analysis: Optional[bool] = None
38
 
39
- assessor = ProductionPronunciationAssessor()
39
+ # Global assessor instance - singleton pattern for performance
40
+ global_assessor = None
41
+
42
+ def get_assessor():
43
+ """Get or create the global assessor instance"""
44
+ global global_assessor
45
+ if global_assessor is None:
46
+ logger.info("Creating global ProductionPronunciationAssessor instance...")
47
+ global_assessor = ProductionPronunciationAssessor()
48
+ return global_assessor
49
 
50
 
51
  @router.post("/assess", response_model=PronunciationAssessmentResult)
@@ -103,7 +112,8 @@ async def assess_pronunciation(
112
 
113
  logger.info(f"Processing audio file: {tmp_file.name} with mode: {mode}")
114
 
115
- # Run assessment using enhanced assessor
115
+ # Run assessment using enhanced assessor (singleton)
116
+ assessor = get_assessor()
117
  result = assessor.assess_pronunciation(tmp_file.name, reference_text, mode)
118
 
119
  # Get reference phonemes and IPA