HK0712 committed on
Commit
c2784ee
·
1 Parent(s): 6f1d412

final: fixed the 1 word > 2 IPA issue

Browse files
Files changed (1) hide show
  1. analyzer/ASR_en_us.py +125 -48
analyzer/ASR_en_us.py CHANGED
@@ -51,9 +51,9 @@ def _tokenize_ipa(ipa_string: str) -> list:
51
  """
52
  將 IPA 字串智能地切分為音素列表,能正確處理多字元音素。
53
  """
 
54
  phonemes = []
55
  i = 0
56
- s = ipa_string.replace(' ', '')
57
  while i < len(s):
58
  if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
59
  phonemes.append(s[i:i+2])
@@ -63,6 +63,74 @@ def _tokenize_ipa(ipa_string: str) -> list:
63
  i += 1
64
  return phonemes
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
67
  def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
68
  """
@@ -71,10 +139,8 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
71
  """
72
  # 檢查快取中是否已有模型,如果沒有則載入
73
  if "model" not in cache:
74
- print(f"快取未命中 (ASR_en_us_v2)。正在載入模型 '{MODEL_NAME}'...")
75
  try:
76
- # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
77
- # 載入模型並存入此函數的快取字典
78
  cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
79
  cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
80
  cache["model"].to(DEVICE)
@@ -87,15 +153,9 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
87
  processor = cache["processor"]
88
  model = cache["model"]
89
 
90
- # --- 以下為原始分析邏輯,保持不變 ---
91
- target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
92
 
93
- target_ipa_by_word = [
94
- _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
95
- for word in target_ipa_by_word_str
96
- ]
97
- target_words_original = target_sentence.split()
98
-
99
  try:
100
  speech, sample_rate = sf.read(audio_file_path)
101
  if sample_rate != 16000:
@@ -109,8 +169,6 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
109
  logits = model(input_values).logits
110
  predicted_ids = torch.argmax(logits, dim=-1)
111
 
112
- # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
113
- # 【保持不變】
114
  raw_user_ipa_str = processor.decode(predicted_ids[0])
115
  raw_user_phonemes = raw_user_ipa_str.split(' ')
116
  normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
@@ -160,23 +218,40 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
160
  word_start_idx_in_path = 0
161
  target_phoneme_counter_in_path = 0
162
 
 
 
 
 
 
 
163
  for path_idx, p in enumerate(target_path):
164
  if p != '-':
165
  if target_phoneme_counter_in_path in word_boundaries_indices:
166
- target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
167
- user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
168
-
169
- alignments_by_word.append({
170
- "target": target_alignment,
171
- "user": user_alignment
172
- })
173
-
174
- word_start_idx_in_path = path_idx + 1
175
-
 
 
176
  target_phoneme_counter_in_path += 1
177
-
 
 
 
 
 
 
 
 
178
  return alignments_by_word
179
 
 
180
  # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
181
  # 【保持不變】
182
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
@@ -192,23 +267,27 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
192
  word_is_correct = True
193
  phonemes_data = []
194
 
195
- for j in range(len(alignment['target'])):
196
- target_phoneme = alignment['target'][j]
197
- user_phoneme = alignment['user'][j]
198
- is_match = (user_phoneme == target_phoneme)
199
-
200
- phonemes_data.append({
201
- "target": target_phoneme,
202
- "user": user_phoneme,
203
- "isMatch": is_match
204
- })
205
-
206
- if not is_match:
207
- word_is_correct = False
208
- if not (user_phoneme == '-' and target_phoneme == '-'):
209
- total_errors += 1
210
-
211
- if word_is_correct:
 
 
 
 
212
  correct_words_count += 1
213
 
214
  words_data.append({
@@ -217,12 +296,10 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
217
  "phonemes": phonemes_data
218
  })
219
 
220
- total_phonemes += sum(1 for p in alignment['target'] if p != '-')
221
-
222
  total_words = len(original_words)
223
- if len(alignments) < total_words:
224
- for i in range(len(alignments), total_words):
225
- missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
226
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
227
  phonemes_data = []
228
  for p_ipa in missed_word_ipa:
@@ -253,4 +330,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
253
  "words": words_data
254
  }
255
 
256
- return final_result
 
51
  """
52
  將 IPA 字串智能地切分為音素列表,能正確處理多字元音素。
53
  """
54
+ s = ipa_string.replace(' ', '').replace('ˌ', '').replace('ˈ', '').replace('ː', '')
55
  phonemes = []
56
  i = 0
 
57
  while i < len(s):
58
  if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
59
  phonemes.append(s[i:i+2])
 
63
  i += 1
64
  return phonemes
65
 
66
# --- Smart G2P word-attribution logic (heuristic splitting, "Plan B") ---
def _get_target_ipa_by_word(sentence: str) -> tuple:
    """Attribute sentence-level G2P output to individual words.

    Phonemizes the whole sentence with espeak; when the number of IPA
    groups differs from the number of words, applies a heuristic split:
    a group longer than the word's reference (word-level G2P) length is
    assumed to also contain the next word's phonemes and is split at the
    reference length. If the heuristic still yields a count mismatch,
    falls back to word-by-word G2P results.

    Args:
        sentence: The target sentence to phonemize.

    Returns:
        A tuple ``(original_words, ipa_by_word)`` where ``ipa_by_word`` is
        a list (one entry per word) of tokenized phoneme lists.
    """
    original_words = sentence.strip().split()

    # 1. Sentence-level G2P result, one raw group per whitespace token.
    sentence_ipa_groups_raw = [s.strip('[]') for s in phonemize(sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()]
    sentence_ipa_groups = [_tokenize_ipa(group) for group in sentence_ipa_groups_raw]

    # Ideal case: exactly one IPA group per word — return directly.
    if len(original_words) == len(sentence_ipa_groups):
        print("G2P alignment perfect match. No heuristic needed.")
        return original_words, sentence_ipa_groups

    # 2. Counts differ — engage heuristic attribution.
    print(f"G2P Mismatch Detected: {len(original_words)} words vs {len(sentence_ipa_groups)} IPA groups. Applying heuristic splitting.")

    # Word-level G2P serves as the reference for expected phoneme counts.
    word_ipas_reference = [_tokenize_ipa(phonemize(word, language='en-us', backend='espeak', strip=True)) for word in original_words]

    final_ipa_by_word = []
    word_idx = 0
    ipa_group_idx = 0

    while word_idx < len(original_words):
        # Boundary check: sentence-level IPA groups exhausted early.
        if ipa_group_idx >= len(sentence_ipa_groups):
            print(f"Warning: Ran out of sentence IPA groups. Appending reference IPA for '{original_words[word_idx]}'.")
            final_ipa_by_word.append(word_ipas_reference[word_idx])
            word_idx += 1
            continue

        current_word = original_words[word_idx]
        current_ipa_group = sentence_ipa_groups[ipa_group_idx]
        ref_ipa_len = len(word_ipas_reference[word_idx])

        # Heuristic core: the current group is longer than the reference
        # length AND there is a following word to absorb the surplus.
        if len(current_ipa_group) > ref_ipa_len and word_idx + 1 < len(original_words):
            # Assume the surplus phonemes belong to the next word.
            print(f"Heuristic Split: Splitting IPA group for '{current_word}' and '{original_words[word_idx+1]}'.")

            # Split at the reference length.
            ipa_for_current_word = current_ipa_group[:ref_ipa_len]
            ipa_for_next_word = current_ipa_group[ref_ipa_len:]

            final_ipa_by_word.append(ipa_for_current_word)
            final_ipa_by_word.append(ipa_for_next_word)

            # One group covered two words, so advance the word index by 2.
            word_idx += 2
            ipa_group_idx += 1
        else:
            # Normal case: lengths match or the heuristic does not apply.
            final_ipa_by_word.append(current_ipa_group)
            word_idx += 1
            ipa_group_idx += 1

    # Final sanity check: if counts still disagree, the heuristic failed —
    # fall back to the safe word-by-word G2P results.
    if len(final_ipa_by_word) != len(original_words):
        print(f"Heuristic splitting failed (final count: {len(final_ipa_by_word)} vs {len(original_words)}). Falling back to word-by-word G2P for safety.")
        return original_words, word_ipas_reference

    print("Heuristic splitting successful.")
    return original_words, final_ipa_by_word
132
+
133
+
134
  # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
135
  def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
136
  """
 
139
  """
140
  # 檢查快取中是否已有模型,如果沒有則載入
141
  if "model" not in cache:
142
+ print(f"快取未命中 (ASR_en_us)。正在載入模型 '{MODEL_NAME}'...")
143
  try:
 
 
144
  cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
145
  cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
146
  cache["model"].to(DEVICE)
 
153
  processor = cache["processor"]
154
  model = cache["model"]
155
 
156
+ # --- 【【【【【 主要修改點:使用新的智慧 G2P 函式 】】】】】 ---
157
+ target_words_original, target_ipa_by_word = _get_target_ipa_by_word(target_sentence)
158
 
 
 
 
 
 
 
159
  try:
160
  speech, sample_rate = sf.read(audio_file_path)
161
  if sample_rate != 16000:
 
169
  logits = model(input_values).logits
170
  predicted_ids = torch.argmax(logits, dim=-1)
171
 
 
 
172
  raw_user_ipa_str = processor.decode(predicted_ids[0])
173
  raw_user_phonemes = raw_user_ipa_str.split(' ')
174
  normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
 
218
  word_start_idx_in_path = 0
219
  target_phoneme_counter_in_path = 0
220
 
221
+ num_words_to_align = len(target_words_ipa_tokenized)
222
+ current_word_idx = 0
223
+
224
+ if not target_path:
225
+ return []
226
+
227
  for path_idx, p in enumerate(target_path):
228
  if p != '-':
229
  if target_phoneme_counter_in_path in word_boundaries_indices:
230
+ if current_word_idx < num_words_to_align:
231
+ target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
232
+ user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
233
+
234
+ alignments_by_word.append({
235
+ "target": target_alignment,
236
+ "user": user_alignment
237
+ })
238
+
239
+ word_start_idx_in_path = path_idx + 1
240
+ current_word_idx += 1
241
+
242
  target_phoneme_counter_in_path += 1
243
+
244
+ if word_start_idx_in_path < len(target_path) and current_word_idx < num_words_to_align:
245
+ target_alignment = target_path[word_start_idx_in_path:]
246
+ user_alignment = user_path[word_start_idx_in_path:]
247
+ alignments_by_word.append({
248
+ "target": target_alignment,
249
+ "user": user_alignment
250
+ })
251
+
252
  return alignments_by_word
253
 
254
+
255
  # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
256
  # 【保持不變】
257
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
 
267
  word_is_correct = True
268
  phonemes_data = []
269
 
270
+ if not alignment or not alignment.get('target'):
271
+ word_is_correct = False
272
+ else:
273
+ for j in range(len(alignment['target'])):
274
+ target_phoneme = alignment['target'][j]
275
+ user_phoneme = alignment['user'][j]
276
+ is_match = (user_phoneme == target_phoneme)
277
+
278
+ phonemes_data.append({
279
+ "target": target_phoneme,
280
+ "user": user_phoneme,
281
+ "isMatch": is_match
282
+ })
283
+
284
+ if not is_match:
285
+ word_is_correct = False
286
+ if not (user_phoneme == '-' and target_phoneme == '-'):
287
+ total_errors += 1
288
+ total_phonemes += sum(1 for p in alignment['target'] if p != '-')
289
+
290
+ if word_is_correct and phonemes_data:
291
  correct_words_count += 1
292
 
293
  words_data.append({
 
296
  "phonemes": phonemes_data
297
  })
298
 
 
 
299
  total_words = len(original_words)
300
+ if len(words_data) < total_words:
301
+ for i in range(len(words_data), total_words):
302
+ missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True)
303
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
304
  phonemes_data = []
305
  for p_ipa in missed_word_ipa:
 
330
  "words": words_data
331
  }
332
 
333
+ return final_result