HK0712 committed on
Commit
a6526f0
·
1 Parent(s): cd315e7

CHANGE: keep load in ram

Browse files
analyzer/ASR_de_de.py CHANGED
@@ -1,239 +1,221 @@
1
- # ASR_en_us.py
2
-
3
- import torch
4
- import soundfile as sf
5
- import librosa
6
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
7
- import os
8
- from phonemizer import phonemize
9
- import numpy as np
10
- from datetime import datetime, timezone
11
-
12
- # 【【【【【 新增程式碼 #1:自動檢測可用設備 】】】】】
13
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
- print(f"INFO: ASR_de_de.py is configured to use device: {DEVICE}")
15
-
16
- # --- 1. 全域設定與模型載入函數 (保持不變) ---
17
- MODEL_NAME = "HK0712/Wav2Vec2_German_IPA"
18
-
19
- processor = None
20
- model = None
21
-
22
- def load_model():
23
- """
24
- (方案 A) 讓 transformers 自動處理模型的下載、快取和加載。
25
- 它會自動使用 Dockerfile 中設定的 HF_HOME 環境變數。
26
- """
27
- global processor, model
28
- if processor and model:
29
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
30
- return True
31
-
32
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
33
- print(f"Transformers 將自動在 HF_HOME 指定的快取中尋找或下載。")
34
- try:
35
- # 直接使用模型的線上名稱調用 from_pretrained
36
- # 這就是魔法發生的地方!
37
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
38
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
39
-
40
- model.to(DEVICE)
41
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
42
- return True
43
- except Exception as e:
44
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
45
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
46
-
47
- # --- 2. 智能 IPA 切分函數 (已更新) ---
48
- # 移除了包含 'ː' 的組合,因為我們將在源頭移除它
49
- MULTI_CHAR_PHONEMES = {
50
- 'aɪ', 'aʊ',
51
- 'dʒ', 'pf', 'ts', 'tʃ'
52
- }
53
-
54
- def _tokenize_ipa(ipa_string: str) -> list:
55
- """
56
- IPA 字串智能地切分為音素列表,能正確處理多字元音素。
57
- """
58
- phonemes = []
59
- i = 0
60
- s = ipa_string.replace(' ', '')
61
- while i < len(s):
62
- if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
63
- phonemes.append(s[i:i+2])
64
- i += 2
65
- else:
66
- phonemes.append(s[i])
67
- i += 1
68
- return phonemes
69
-
70
- # --- 3. 核心分析函數 (主入口) (已修改) ---
71
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
72
- """
73
- 接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
74
- 這是此模組的主要進入點。
75
- """
76
- if not processor or not model:
77
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
78
-
79
- target_ipa_by_word_str = phonemize(target_sentence, language='de', backend='espeak', with_stress=True, strip=True).split()
80
-
81
- # 【【【【【 關 鍵 修 改 在 這 裡 】】】】】
82
- # 在切分前,移除所有重音和長音符號,以匹配 ASR 的輸出特性
83
- target_ipa_by_word = [
84
- _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
85
- for word in target_ipa_by_word_str
86
- ]
87
- target_words_original = target_sentence.split()
88
-
89
- try:
90
- speech, sample_rate = sf.read(audio_file_path)
91
- if sample_rate != 16000:
92
- speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
93
- except Exception as e:
94
- raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
95
-
96
- input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
97
- input_values = input_values.to(DEVICE)
98
- with torch.no_grad():
99
- logits = model(input_values).logits
100
- predicted_ids = torch.argmax(logits, dim=-1)
101
- user_ipa_full = processor.decode(predicted_ids[0])
102
-
103
- word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
104
-
105
- return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
106
-
107
-
108
- # --- 4. 對齊函數 (與上一版相同) ---
109
- def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
110
- """
111
- (已修改) 使用新的切分邏輯執行音素對齊。
112
- """
113
- user_phonemes = _tokenize_ipa(user_phoneme_str)
114
-
115
- target_phonemes_flat = []
116
- word_boundaries_indices = []
117
- current_idx = 0
118
- for word_ipa_tokens in target_words_ipa_tokenized:
119
- target_phonemes_flat.extend(word_ipa_tokens)
120
- current_idx += len(word_ipa_tokens)
121
- word_boundaries_indices.append(current_idx - 1)
122
-
123
- dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
124
- for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
125
- for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
126
- for i in range(1, len(user_phonemes) + 1):
127
- for j in range(1, len(target_phonemes_flat) + 1):
128
- cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
129
- dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
130
-
131
- i, j = len(user_phonemes), len(target_phonemes_flat)
132
- user_path, target_path = [], []
133
- while i > 0 or j > 0:
134
- cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
135
- if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
136
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
137
- elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
138
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
139
- else:
140
- user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
141
-
142
- alignments_by_word = []
143
- word_start_idx_in_path = 0
144
- target_phoneme_counter_in_path = 0
145
-
146
- for path_idx, p in enumerate(target_path):
147
- if p != '-':
148
- if target_phoneme_counter_in_path in word_boundaries_indices:
149
- target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
150
- user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
151
-
152
- alignments_by_word.append({
153
- "target": target_alignment,
154
- "user": user_alignment
155
- })
156
-
157
- word_start_idx_in_path = path_idx + 1
158
-
159
- target_phoneme_counter_in_path += 1
160
-
161
- return alignments_by_word
162
-
163
- # --- 5. 格式化函數 (與上一版相同) ---
164
- def _format_to_json_structure(alignments, sentence, original_words) -> dict:
165
- total_phonemes = 0
166
- total_errors = 0
167
- correct_words_count = 0
168
- words_data = []
169
-
170
- num_words_to_process = min(len(alignments), len(original_words))
171
-
172
- for i in range(num_words_to_process):
173
- alignment = alignments[i]
174
- word_is_correct = True
175
- phonemes_data = []
176
-
177
- for j in range(len(alignment['target'])):
178
- target_phoneme = alignment['target'][j]
179
- user_phoneme = alignment['user'][j]
180
- is_match = (user_phoneme == target_phoneme)
181
-
182
- phonemes_data.append({
183
- "target": target_phoneme,
184
- "user": user_phoneme,
185
- "isMatch": is_match
186
- })
187
-
188
- if not is_match:
189
- word_is_correct = False
190
- if not (user_phoneme == '-' and target_phoneme == '-'):
191
- total_errors += 1
192
-
193
- if word_is_correct:
194
- correct_words_count += 1
195
-
196
- words_data.append({
197
- "word": original_words[i],
198
- "isCorrect": word_is_correct,
199
- "phonemes": phonemes_data
200
- })
201
-
202
- total_phonemes += sum(1 for p in alignment['target'] if p != '-')
203
-
204
- total_words = len(original_words)
205
- if len(alignments) < total_words:
206
- for i in range(len(alignments), total_words):
207
- # 確保這裡也移除 'ː'
208
- missed_word_ipa_str = phonemize(original_words[i], language='de', backend='espeak', strip=True).replace('ː', '')
209
- missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
210
- phonemes_data = []
211
- for p_ipa in missed_word_ipa:
212
- phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
213
- total_errors += 1
214
- total_phonemes += 1
215
-
216
- words_data.append({
217
- "word": original_words[i],
218
- "isCorrect": False,
219
- "phonemes": phonemes_data
220
- })
221
-
222
- overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
223
- phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
224
-
225
- final_result = {
226
- "sentence": sentence,
227
- "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
228
- "summary": {
229
- "overallScore": round(overall_score, 1),
230
- "totalWords": total_words,
231
- "correctWords": correct_words_count,
232
- "phonemeErrorRate": round(phoneme_error_rate, 2),
233
- "total_errors": total_errors,
234
- "total_target_phonemes": total_phonemes
235
- },
236
- "words": words_data
237
- }
238
-
239
- return final_result
 
1
+ import torch
2
+ import soundfile as sf
3
+ import librosa
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ import os
6
+ from phonemizer import phonemize
7
+ import numpy as np
8
+ from datetime import datetime, timezone
9
+
10
+ # --- 1. 全域設定與模型載入函數 (已修改) ---
11
+ # 移除了全域的 processor 和 model 變數。
12
+ # 刪除了舊的 load_model() 函數。
13
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
+ print(f"INFO: ASR_de_de.py is configured to use device: {DEVICE}")
15
+ MODEL_NAME = "HK0712/Wav2Vec2_German_IPA"
16
+
17
+ # --- 2. 智能 IPA 切分函數 (保持不變) ---
18
+ MULTI_CHAR_PHONEMES = {
19
+ 'aɪ', 'aʊ',
20
+ 'dʒ', 'pf', 'ts', 'tʃ'
21
+ }
22
+
23
+ def _tokenize_ipa(ipa_string: str) -> list:
24
+ """
25
+ IPA 字串智能地切分為音素列表,能正確處理多字元音素。
26
+ """
27
+ phonemes = []
28
+ i = 0
29
+ s = ipa_string.replace(' ', '')
30
+ while i < len(s):
31
+ if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
32
+ phonemes.append(s[i:i+2])
33
+ i += 2
34
+ else:
35
+ phonemes.append(s[i])
36
+ i += 1
37
+ return phonemes
38
+
39
+ # --- 3. 核心分析函數 (主入口) (已修改) ---
40
+ # 將模型載入和快取邏輯合併至此。
41
def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
    """Score the pronunciation in an audio file against a German target sentence.

    Args:
        audio_file_path: Path of an audio file readable by ``soundfile``.
        target_sentence: The sentence the speaker was asked to read.
        cache: Dictionary holding the loaded ``"processor"`` and ``"model"``.
            The mutable default is deliberate: the same dict object is reused
            on every call that omits the argument, so the model is loaded once
            and then kept in memory. Pass an explicit dict to control the
            cache yourself.

    Returns:
        The analysis dictionary produced by ``_format_to_json_structure``.

    Raises:
        RuntimeError: If the processor or model cannot be loaded.
        IOError: If the audio cannot be read or resampled.
    """
    if "model" not in cache or "processor" not in cache:
        print(f"快取未命中 (ASR_de_de)。正在載入模型 '{MODEL_NAME}'...")
        try:
            # Load into locals first and publish to the cache only once every
            # step has succeeded, so a failure (e.g. while moving the model to
            # DEVICE) cannot leave a half-initialised entry behind.
            loaded_processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
            loaded_model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
            loaded_model.to(DEVICE)
        except Exception as e:
            print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}") from e
        cache["processor"] = loaded_processor
        cache["model"] = loaded_model
        print(f"模型 '{MODEL_NAME}' 已載入並快取。")

    processor = cache["processor"]
    model = cache["model"]

    # Reference pronunciation: one IPA string per word from eSpeak.
    target_ipa_by_word_str = phonemize(
        target_sentence, language='de', backend='espeak', with_stress=True, strip=True
    ).split()

    # Stress and length marks are removed before tokenizing so the reference
    # is compared without them.
    target_ipa_by_word = [
        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
        for word in target_ipa_by_word_str
    ]
    target_words_original = target_sentence.split()

    try:
        speech, sample_rate = sf.read(audio_file_path)
        # soundfile returns (frames, channels) for multi-channel files.
        # Downmix to mono first: resampling works along the last axis and the
        # model expects a single 1-D waveform.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)
        if sample_rate != 16000:
            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
    except Exception as e:
        raise IOError(f"讀取或處理音訊時發生錯誤: {e}") from e

    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(DEVICE)
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: most likely token per frame, collapsed by the processor.
    predicted_ids = torch.argmax(logits, dim=-1)
    user_ipa_full = processor.decode(predicted_ids[0])

    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)

    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
89
+
90
+
91
+ # --- 4. 對齊函數 (保持不變) ---
92
def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
    """Align the recognised phonemes with the target phonemes, grouped by word.

    The user string is tokenized with ``_tokenize_ipa`` and aligned against
    the flattened target phonemes using an edit-distance table (unit cost for
    substitution, insertion and deletion). The aligned sequences are then cut
    at the last phoneme of every target word.

    Args:
        user_phoneme_str: IPA string decoded from the speaker's audio.
        target_words_ipa_tokenized: One list of phoneme tokens per target word.

    Returns:
        A list with one ``{"target": [...], "user": [...]}`` dict per target
        word. Both lists have equal length; ``'-'`` marks a gap on that side.
    """
    user_phonemes = _tokenize_ipa(user_phoneme_str)

    # Flatten the target and remember the flat index of each word's last
    # phoneme. A set keeps the boundary test O(1) in the grouping loop below.
    target_phonemes_flat = []
    word_end_positions = set()
    for word_ipa_tokens in target_words_ipa_tokenized:
        target_phonemes_flat.extend(word_ipa_tokens)
        word_end_positions.add(len(target_phonemes_flat) - 1)

    n_user = len(user_phonemes)
    n_target = len(target_phonemes_flat)

    # Edit-distance table: dp[i][j] = cost of aligning the first i user
    # phonemes with the first j target phonemes.
    dp = np.zeros((n_user + 1, n_target + 1))
    for i in range(1, n_user + 1):
        dp[i][0] = i
    for j in range(1, n_target + 1):
        dp[0][j] = j
    for i in range(1, n_user + 1):
        for j in range(1, n_target + 1):
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)

    # Backtrace from the bottom-right corner. Steps are appended and the
    # paths reversed once at the end, instead of inserting at index 0 on
    # every step. Preference order: diagonal, then user-only, then target-only.
    i, j = n_user, n_target
    user_path, target_path = [], []
    while i > 0 or j > 0:
        take_diagonal = False
        if i > 0 and j > 0:
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            take_diagonal = dp[i][j] == dp[i - 1][j - 1] + cost
        if take_diagonal:
            user_path.append(user_phonemes[i - 1])
            target_path.append(target_phonemes_flat[j - 1])
            i -= 1
            j -= 1
        elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            # Extra phoneme on the user side.
            user_path.append(user_phonemes[i - 1])
            target_path.append('-')
            i -= 1
        else:
            # Target phoneme with no counterpart on the user side.
            user_path.append('-')
            target_path.append(target_phonemes_flat[j - 1])
            j -= 1
    user_path.reverse()
    target_path.reverse()

    # Cut the aligned paths into per-word slices. A slice ends at the path
    # position holding the last real phoneme of a target word.
    # NOTE(review): path entries after the final target phoneme (extra user
    # phonemes at the very end) belong to no slice and are dropped — confirm
    # that this is intended.
    alignments_by_word = []
    word_start_idx_in_path = 0
    target_phoneme_counter_in_path = 0
    for path_idx, p in enumerate(target_path):
        if p == '-':
            continue
        if target_phoneme_counter_in_path in word_end_positions:
            alignments_by_word.append({
                "target": target_path[word_start_idx_in_path:path_idx + 1],
                "user": user_path[word_start_idx_in_path:path_idx + 1],
            })
            word_start_idx_in_path = path_idx + 1
        target_phoneme_counter_in_path += 1

    return alignments_by_word
145
+
146
+ # --- 5. 格式化函數 (保持不變) ---
147
+ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
148
+ total_phonemes = 0
149
+ total_errors = 0
150
+ correct_words_count = 0
151
+ words_data = []
152
+
153
+ num_words_to_process = min(len(alignments), len(original_words))
154
+
155
+ for i in range(num_words_to_process):
156
+ alignment = alignments[i]
157
+ word_is_correct = True
158
+ phonemes_data = []
159
+
160
+ for j in range(len(alignment['target'])):
161
+ target_phoneme = alignment['target'][j]
162
+ user_phoneme = alignment['user'][j]
163
+ is_match = (user_phoneme == target_phoneme)
164
+
165
+ phonemes_data.append({
166
+ "target": target_phoneme,
167
+ "user": user_phoneme,
168
+ "isMatch": is_match
169
+ })
170
+
171
+ if not is_match:
172
+ word_is_correct = False
173
+ if not (user_phoneme == '-' and target_phoneme == '-'):
174
+ total_errors += 1
175
+
176
+ if word_is_correct:
177
+ correct_words_count += 1
178
+
179
+ words_data.append({
180
+ "word": original_words[i],
181
+ "isCorrect": word_is_correct,
182
+ "phonemes": phonemes_data
183
+ })
184
+
185
+ total_phonemes += sum(1 for p in alignment['target'] if p != '-')
186
+
187
+ total_words = len(original_words)
188
+ if len(alignments) < total_words:
189
+ for i in range(len(alignments), total_words):
190
+ missed_word_ipa_str = phonemize(original_words[i], language='de', backend='espeak', strip=True).replace('ː', '')
191
+ missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
192
+ phonemes_data = []
193
+ for p_ipa in missed_word_ipa:
194
+ phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
195
+ total_errors += 1
196
+ total_phonemes += 1
197
+
198
+ words_data.append({
199
+ "word": original_words[i],
200
+ "isCorrect": False,
201
+ "phonemes": phonemes_data
202
+ })
203
+
204
+ overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
205
+ phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
206
+
207
+ final_result = {
208
+ "sentence": sentence,
209
+ "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
210
+ "summary": {
211
+ "overallScore": round(overall_score, 1),
212
+ "totalWords": total_words,
213
+ "correctWords": correct_words_count,
214
+ "phonemeErrorRate": round(phoneme_error_rate, 2),
215
+ "total_errors": total_errors,
216
+ "total_target_phonemes": total_phonemes
217
+ },
218
+ "words": words_data
219
+ }
220
+
221
+ return final_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
analyzer/ASR_en_us.py CHANGED
@@ -1,5 +1,3 @@
1
- # ASR_en_us.py
2
-
3
  import torch
4
  import soundfile as sf
5
  import librosa
@@ -9,43 +7,13 @@ from phonemizer import phonemize
9
  import numpy as np
10
  from datetime import datetime, timezone
11
 
12
- # 【【【【【 新增程式碼 #1:自動檢測可用設備 】】】】】
 
 
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
  print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
15
 
16
- # --- 1. 全域設定與模型載入函數 (保持不變) ---
17
- MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
18
-
19
- processor = None
20
- model = None
21
-
22
- def load_model():
23
- """
24
- (方案 A) 讓 transformers 自動處理模型的下載、快取和加載。
25
- 它會自動使用 Dockerfile 中設定的 HF_HOME 環境變數。
26
- """
27
- global processor, model
28
- if processor and model:
29
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
30
- return True
31
-
32
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
33
- print(f"Transformers 將自動在 HF_HOME 指定的快取中尋找或下載。")
34
- try:
35
- # 直接使用模型的線上名稱調用 from_pretrained
36
- # 這就是魔法發生的地方!
37
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
38
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
39
-
40
- model.to(DEVICE)
41
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
42
- return True
43
- except Exception as e:
44
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
45
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
46
-
47
- # --- 2. 智能 IPA 切分函數 (已更新) ---
48
- # 移除了包含 'ː' 的組合,因為我們將在源頭移除它
49
  MULTI_CHAR_PHONEMES = {
50
  'tʃ', 'dʒ', # 輔音 (Affricates)
51
  'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
@@ -69,18 +37,32 @@ def _tokenize_ipa(ipa_string: str) -> list:
69
  return phonemes
70
 
71
  # --- 3. 核心分析函數 (主入口) (已修改) ---
72
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
 
73
  """
74
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
75
- 這是此模組的主要進入點。
76
  """
77
- if not processor or not model:
78
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
79
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
81
 
82
- # 【【【【【 關 鍵 修 改 在 這 裡 】】】】】
83
- # 在切分前,移除所有重音和長音符號,以匹配 ASR 的輸出特性
84
  target_ipa_by_word = [
85
  _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
86
  for word in target_ipa_by_word_str
@@ -106,7 +88,7 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
106
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
107
 
108
 
109
- # --- 4. 對齊函數 (與上一版相同) ---
110
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
111
  """
112
  (已修改) 使用新的切分邏輯執行音素對齊。
@@ -161,7 +143,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
161
 
162
  return alignments_by_word
163
 
164
- # --- 5. 格式化函數 (與上一版相同) ---
165
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
166
  total_phonemes = 0
167
  total_errors = 0
@@ -205,7 +187,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
205
  total_words = len(original_words)
206
  if len(alignments) < total_words:
207
  for i in range(len(alignments), total_words):
208
- # 確保這裡也移除 'ː'
209
  missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
210
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
211
  phonemes_data = []
 
 
 
1
  import torch
2
  import soundfile as sf
3
  import librosa
 
7
  import numpy as np
8
  from datetime import datetime, timezone
9
 
10
+ # --- 1. 全域設定 (已修改) ---
11
+ # 移除了全域的 processor 和 model 變數,只保留常數。
12
+ MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
  print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
15
 
16
+ # --- 2. 智能 IPA 切分函數 (保持不變) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  MULTI_CHAR_PHONEMES = {
18
  'tʃ', 'dʒ', # 輔音 (Affricates)
19
  'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
 
37
  return phonemes
38
 
39
  # --- 3. 核心分析函數 (主入口) (已修改) ---
40
+ # 刪除了舊的 load_model() 函數,並將其邏輯合併至此。
41
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
42
  """
43
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
44
+ 模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
45
  """
46
+ # 檢查快取中是否已有模型,如果沒有則載入
47
+ if "model" not in cache:
48
+ print(f"快取未命中 (ASR_en_us)。正在載入模型 '{MODEL_NAME}'...")
49
+ try:
50
+ # 載入模型並存入此函數的快取字典
51
+ cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
52
+ cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
53
+ cache["model"].to(DEVICE)
54
+ print(f"模型 '{MODEL_NAME}' 已載入並快取。")
55
+ except Exception as e:
56
+ print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
57
+ raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
58
+
59
+ # 從此函數的獨立快取中獲取模型和處理器
60
+ processor = cache["processor"]
61
+ model = cache["model"]
62
+
63
+ # --- 以下為原始分析邏輯,保持不變 ---
64
  target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
65
 
 
 
66
  target_ipa_by_word = [
67
  _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
68
  for word in target_ipa_by_word_str
 
88
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
89
 
90
 
91
+ # --- 4. 對齊函數 (保持不變) ---
92
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
93
  """
94
  (已修改) 使用新的切分邏輯執行音素對齊。
 
143
 
144
  return alignments_by_word
145
 
146
+ # --- 5. 格式化函數 (保持不變) ---
147
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
148
  total_phonemes = 0
149
  total_errors = 0
 
187
  total_words = len(original_words)
188
  if len(alignments) < total_words:
189
  for i in range(len(alignments), total_words):
 
190
  missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
191
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
192
  phonemes_data = []
analyzer/ASR_en_us_v2.py CHANGED
@@ -1,277 +1,256 @@
1
- # ASR_en_us_v2.py
2
-
3
- import torch
4
- import soundfile as sf
5
- import librosa
6
- # 【【【【【 修改 #1:從 transformers 匯入 AutoProcessor 和 AutoModelForCTC 】】】】】
7
- # 這是為了更好地適應 KoelLabs 模型推薦的用法,功能上與 Wav2Vec2Processor/ForCTC 相同,但更通用。
8
- from transformers import AutoProcessor, AutoModelForCTC
9
- import os
10
- from phonemizer import phonemize
11
- import numpy as np
12
- from datetime import datetime, timezone
13
-
14
- # --- 全域設定 ---
15
- # 保持不變
16
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
- # 檔案名稱已更新,您可以自行修改
18
- print(f"INFO: ASR_en_us_koel_final.py is configured to use device: {DEVICE}")
19
-
20
- # 【【【【【 修改 #2:更新為最終選定的 KoelLabs 模型名稱 】】】】】
21
- MODEL_NAME = "KoelLabs/xlsr-english-01"
22
-
23
- processor = None
24
- model = None
25
-
26
- # 【【【【【 新增程式碼 #1:為 KoelLabs 模型設計的 IPA 正規化器 】】】】】
27
- def normalize_koel_ipa(raw_phonemes: list) -> list:
28
- """
29
- 將 KoelLabs 模型輸出的高級 IPA 序列,正規化為與 eSpeak 輸出可比的基礎 IPA 序列。
30
- """
31
- normalized_phonemes = []
32
- for phoneme in raw_phonemes:
33
- if not phoneme: # 跳過可能的空字串
34
- continue
35
-
36
- # 1. 去掉送氣、鼻化、清音等附加符號
37
- base_phoneme = phoneme.replace('ʰ', '').replace('̃', '').replace('̥', '')
38
-
39
- # 2. 處理極少數的外來音,將其映射到最接近的英語音
40
- if base_phoneme == 'β':
41
- base_phoneme = 'v'
42
- elif base_phoneme in ['x', 'ɣ', 'ɦ']:
43
- base_phoneme = 'h'
44
- # 根據需要可以增加更多規則,但這已經涵蓋了絕大部分
45
-
46
- normalized_phonemes.append(base_phoneme)
47
-
48
- return normalized_phonemes
49
-
50
- def load_model():
51
- """
52
- 載入 KoelLabs 的 ASR 模型。
53
- """
54
- global processor, model
55
- if processor and model:
56
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
57
- return True
58
-
59
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
60
- try:
61
- # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
62
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
63
- model = AutoModelForCTC.from_pretrained(MODEL_NAME)
64
-
65
- model.to(DEVICE)
66
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
67
- return True
68
- except Exception as e:
69
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
70
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
71
-
72
- # --- 2. 智能 IPA 切分函數 (與您的原版邏輯完全相同) ---
73
- MULTI_CHAR_PHONEMES = {
74
- 'tʃ', '', # 輔音 (Affricates)
75
- 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
76
- 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
77
- }
78
-
79
- def _tokenize_ipa(ipa_string: str) -> list:
80
- """
81
- IPA 字串智能地切分為音素列表,能正確處理多字元音素。
82
- """
83
- phonemes = []
84
- i = 0
85
- s = ipa_string.replace(' ', '')
86
- while i < len(s):
87
- if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
88
- phonemes.append(s[i:i+2])
89
- i += 2
90
- else:
91
- phonemes.append(s[i])
92
- i += 1
93
- return phonemes
94
-
95
- # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器) ---
96
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
97
- """
98
- 接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
99
- 這是此模組的主要進入點。
100
- """
101
- if not processor or not model:
102
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
103
-
104
- # 您的原始邏輯,保持不變
105
- target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
106
-
107
- # 您的原始邏輯,保持不變
108
- target_ipa_by_word = [
109
- _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
110
- for word in target_ipa_by_word_str
111
- ]
112
- target_words_original = target_sentence.split()
113
-
114
- # 您的原始邏輯,保持不變
115
- try:
116
- speech, sample_rate = sf.read(audio_file_path)
117
- if sample_rate != 16000:
118
- speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
119
- except Exception as e:
120
- raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
121
-
122
- # 您的原始邏輯,保持不變
123
- input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
124
- input_values = input_values.to(DEVICE)
125
- with torch.no_grad():
126
- logits = model(input_values).logits
127
- predicted_ids = torch.argmax(logits, dim=-1)
128
-
129
- # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
130
- # 1. 解碼得到原始的、帶有高級 IPA 的序列
131
- raw_user_ipa_str = processor.decode(predicted_ids[0])
132
- raw_user_phonemes = raw_user_ipa_str.split(' ')
133
-
134
- # 2. 調用新的正規化函式進行清理
135
- normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
136
-
137
- # 3. 將清理後的音素列表重新組合成字串,以適應後續的 _tokenize_ipa 函式
138
- user_ipa_full = "".join(normalized_user_phonemes)
139
-
140
- # 後續所有邏輯都與您的原版完全相同
141
- word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
142
-
143
- return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
144
-
145
-
146
- # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
147
- def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
148
- """
149
- (已修改) 使用新的切分邏輯執行音素對齊。
150
- """
151
- user_phonemes = _tokenize_ipa(user_phoneme_str)
152
-
153
- target_phonemes_flat = []
154
- word_boundaries_indices = []
155
- current_idx = 0
156
- for word_ipa_tokens in target_words_ipa_tokenized:
157
- target_phonemes_flat.extend(word_ipa_tokens)
158
- current_idx += len(word_ipa_tokens)
159
- word_boundaries_indices.append(current_idx - 1)
160
-
161
- dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
162
- for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
163
- for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
164
- for i in range(1, len(user_phonemes) + 1):
165
- for j in range(1, len(target_phonemes_flat) + 1):
166
- cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
167
- dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
168
-
169
- i, j = len(user_phonemes), len(target_phonemes_flat)
170
- user_path, target_path = [], []
171
- while i > 0 or j > 0:
172
- cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
173
- if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
174
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
175
- elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
176
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
177
- else:
178
- user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
179
-
180
- alignments_by_word = []
181
- word_start_idx_in_path = 0
182
- target_phoneme_counter_in_path = 0
183
-
184
- for path_idx, p in enumerate(target_path):
185
- if p != '-':
186
- if target_phoneme_counter_in_path in word_boundaries_indices:
187
- target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
188
- user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
189
-
190
- alignments_by_word.append({
191
- "target": target_alignment,
192
- "user": user_alignment
193
- })
194
-
195
- word_start_idx_in_path = path_idx + 1
196
-
197
- target_phoneme_counter_in_path += 1
198
-
199
- return alignments_by_word
200
-
201
- # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
202
- def _format_to_json_structure(alignments, sentence, original_words) -> dict:
203
- total_phonemes = 0
204
- total_errors = 0
205
- correct_words_count = 0
206
- words_data = []
207
-
208
- num_words_to_process = min(len(alignments), len(original_words))
209
-
210
- for i in range(num_words_to_process):
211
- alignment = alignments[i]
212
- word_is_correct = True
213
- phonemes_data = []
214
-
215
- for j in range(len(alignment['target'])):
216
- target_phoneme = alignment['target'][j]
217
- user_phoneme = alignment['user'][j]
218
- is_match = (user_phoneme == target_phoneme)
219
-
220
- phonemes_data.append({
221
- "target": target_phoneme,
222
- "user": user_phoneme,
223
- "isMatch": is_match
224
- })
225
-
226
- if not is_match:
227
- word_is_correct = False
228
- if not (user_phoneme == '-' and target_phoneme == '-'):
229
- total_errors += 1
230
-
231
- if word_is_correct:
232
- correct_words_count += 1
233
-
234
- words_data.append({
235
- "word": original_words[i],
236
- "isCorrect": word_is_correct,
237
- "phonemes": phonemes_data
238
- })
239
-
240
- total_phonemes += sum(1 for p in alignment['target'] if p != '-')
241
-
242
- total_words = len(original_words)
243
- if len(alignments) < total_words:
244
- for i in range(len(alignments), total_words):
245
- # 您的原始邏輯,保持不變
246
- missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
247
- missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
248
- phonemes_data = []
249
- for p_ipa in missed_word_ipa:
250
- phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
251
- total_errors += 1
252
- total_phonemes += 1
253
-
254
- words_data.append({
255
- "word": original_words[i],
256
- "isCorrect": False,
257
- "phonemes": phonemes_data
258
- })
259
-
260
- overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
261
- phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
262
-
263
- final_result = {
264
- "sentence": sentence,
265
- "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
266
- "summary": {
267
- "overallScore": round(overall_score, 1),
268
- "totalWords": total_words,
269
- "correctWords": correct_words_count,
270
- "phonemeErrorRate": round(phoneme_error_rate, 2),
271
- "total_errors": total_errors,
272
- "total_target_phonemes": total_phonemes
273
- },
274
- "words": words_data
275
- }
276
-
277
- return final_result
 
1
+ import torch
2
+ import soundfile as sf
3
+ import librosa
4
+ # 【【【【【 修改 #1:從 transformers 匯入 AutoProcessor 和 AutoModelForCTC 】】】】】
5
+ from transformers import AutoProcessor, AutoModelForCTC
6
+ import os
7
+ from phonemizer import phonemize
8
+ import numpy as np
9
+ from datetime import datetime, timezone
10
+
11
+ # --- 全域設定 (已修改) ---
12
+ # 移除了全域的 processor 和 model 變數。
13
+ # 刪除了舊的 load_model() 函數。
14
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
+ print(f"INFO: ASR_en_us_v2.py is configured to use device: {DEVICE}")
16
+
17
+ # 【【【【【 修改 #2:更新為最終選定的 KoelLabs 模型名稱 】】】】】
18
+ MODEL_NAME = "KoelLabs/xlsr-english-01"
19
+
20
+ # 【【【【【 新增程式碼 #1:為 KoelLabs 模型設計的 IPA 正規化器 】】】】】
21
+ # 【保持不變】
22
+ def normalize_koel_ipa(raw_phonemes: list) -> list:
23
+ """
24
+ 將 KoelLabs 模型輸出的高級 IPA 序列,正規化為與 eSpeak 輸出可比的基礎 IPA 序列。
25
+ """
26
+ normalized_phonemes = []
27
+ for phoneme in raw_phonemes:
28
+ if not phoneme:
29
+ continue
30
+
31
+ base_phoneme = phoneme.replace('ʰ', '').replace('̃', '').replace('̥', '')
32
+
33
+ if base_phoneme == 'β':
34
+ base_phoneme = 'v'
35
+ elif base_phoneme in ['x', 'ɣ', 'ɦ']:
36
+ base_phoneme = 'h'
37
+
38
+ normalized_phonemes.append(base_phoneme)
39
+
40
+ return normalized_phonemes
41
+
42
+ # --- 2. 智能 IPA 切分函數 (與您的原版邏輯完全相同) ---
43
+ # 【保持不變】
44
+ MULTI_CHAR_PHONEMES = {
45
+ 'tʃ', 'dʒ',
46
+ 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',
47
+ 'ɪə', 'eə', 'ʊə', 'ər'
48
+ }
49
+
50
+ def _tokenize_ipa(ipa_string: str) -> list:
51
+ """
52
+ 將 IPA 字串智能地切分為音素列表,能正確處理多字元音素。
53
+ """
54
+ phonemes = []
55
+ i = 0
56
+ s = ipa_string.replace(' ', '')
57
+ while i < len(s):
58
+ if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
59
+ phonemes.append(s[i:i+2])
60
+ i += 2
61
+ else:
62
+ phonemes.append(s[i])
63
+ i += 1
64
+ return phonemes
65
+
66
+ # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
67
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
68
+ """
69
+ 接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
70
+ 模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
71
+ """
72
+ # 檢查快取中是否已有模型,如果沒有則載入
73
+ if "model" not in cache:
74
+ print(f"快取未命中 (ASR_en_us_v2)。正在載入模型 '{MODEL_NAME}'...")
75
+ try:
76
+ # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
77
+ # 載入模型並存入此函數的快取字典
78
+ cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
79
+ cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
80
+ cache["model"].to(DEVICE)
81
+ print(f"模型 '{MODEL_NAME}' 已載入並快取。")
82
+ except Exception as e:
83
+ print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
84
+ raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
85
+
86
+ # 從此函數的獨立快取中獲取模型和處理器
87
+ processor = cache["processor"]
88
+ model = cache["model"]
89
+
90
+ # --- 以下為原始分析邏輯,保持不變 ---
91
+ target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
92
+
93
+ target_ipa_by_word = [
94
+ _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
95
+ for word in target_ipa_by_word_str
96
+ ]
97
+ target_words_original = target_sentence.split()
98
+
99
+ try:
100
+ speech, sample_rate = sf.read(audio_file_path)
101
+ if sample_rate != 16000:
102
+ speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
103
+ except Exception as e:
104
+ raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
105
+
106
+ input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
107
+ input_values = input_values.to(DEVICE)
108
+ with torch.no_grad():
109
+ logits = model(input_values).logits
110
+ predicted_ids = torch.argmax(logits, dim=-1)
111
+
112
+ # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
113
+ # 【保持不變】
114
+ raw_user_ipa_str = processor.decode(predicted_ids[0])
115
+ raw_user_phonemes = raw_user_ipa_str.split(' ')
116
+ normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
117
+ user_ipa_full = "".join(normalized_user_phonemes)
118
+
119
+ word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
120
+
121
+ return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
122
+
123
+
124
+ # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
125
+ # 【保持不變】
126
+ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
127
+ """
128
+ (已修改) 使用新的切分邏輯執行音素對齊。
129
+ """
130
+ user_phonemes = _tokenize_ipa(user_phoneme_str)
131
+
132
+ target_phonemes_flat = []
133
+ word_boundaries_indices = []
134
+ current_idx = 0
135
+ for word_ipa_tokens in target_words_ipa_tokenized:
136
+ target_phonemes_flat.extend(word_ipa_tokens)
137
+ current_idx += len(word_ipa_tokens)
138
+ word_boundaries_indices.append(current_idx - 1)
139
+
140
+ dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
141
+ for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
142
+ for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
143
+ for i in range(1, len(user_phonemes) + 1):
144
+ for j in range(1, len(target_phonemes_flat) + 1):
145
+ cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
146
+ dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
147
+
148
+ i, j = len(user_phonemes), len(target_phonemes_flat)
149
+ user_path, target_path = [], []
150
+ while i > 0 or j > 0:
151
+ cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
152
+ if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
153
+ user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
154
+ elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
155
+ user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
156
+ else:
157
+ user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
158
+
159
+ alignments_by_word = []
160
+ word_start_idx_in_path = 0
161
+ target_phoneme_counter_in_path = 0
162
+
163
+ for path_idx, p in enumerate(target_path):
164
+ if p != '-':
165
+ if target_phoneme_counter_in_path in word_boundaries_indices:
166
+ target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
167
+ user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
168
+
169
+ alignments_by_word.append({
170
+ "target": target_alignment,
171
+ "user": user_alignment
172
+ })
173
+
174
+ word_start_idx_in_path = path_idx + 1
175
+
176
+ target_phoneme_counter_in_path += 1
177
+
178
+ return alignments_by_word
179
+
180
+ # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
181
+ # 【保持不變】
182
+ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
183
+ total_phonemes = 0
184
+ total_errors = 0
185
+ correct_words_count = 0
186
+ words_data = []
187
+
188
+ num_words_to_process = min(len(alignments), len(original_words))
189
+
190
+ for i in range(num_words_to_process):
191
+ alignment = alignments[i]
192
+ word_is_correct = True
193
+ phonemes_data = []
194
+
195
+ for j in range(len(alignment['target'])):
196
+ target_phoneme = alignment['target'][j]
197
+ user_phoneme = alignment['user'][j]
198
+ is_match = (user_phoneme == target_phoneme)
199
+
200
+ phonemes_data.append({
201
+ "target": target_phoneme,
202
+ "user": user_phoneme,
203
+ "isMatch": is_match
204
+ })
205
+
206
+ if not is_match:
207
+ word_is_correct = False
208
+ if not (user_phoneme == '-' and target_phoneme == '-'):
209
+ total_errors += 1
210
+
211
+ if word_is_correct:
212
+ correct_words_count += 1
213
+
214
+ words_data.append({
215
+ "word": original_words[i],
216
+ "isCorrect": word_is_correct,
217
+ "phonemes": phonemes_data
218
+ })
219
+
220
+ total_phonemes += sum(1 for p in alignment['target'] if p != '-')
221
+
222
+ total_words = len(original_words)
223
+ if len(alignments) < total_words:
224
+ for i in range(len(alignments), total_words):
225
+ missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
226
+ missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
227
+ phonemes_data = []
228
+ for p_ipa in missed_word_ipa:
229
+ phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
230
+ total_errors += 1
231
+ total_phonemes += 1
232
+
233
+ words_data.append({
234
+ "word": original_words[i],
235
+ "isCorrect": False,
236
+ "phonemes": phonemes_data
237
+ })
238
+
239
+ overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
240
+ phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
241
+
242
+ final_result = {
243
+ "sentence": sentence,
244
+ "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
245
+ "summary": {
246
+ "overallScore": round(overall_score, 1),
247
+ "totalWords": total_words,
248
+ "correctWords": correct_words_count,
249
+ "phonemeErrorRate": round(phoneme_error_rate, 2),
250
+ "total_errors": total_errors,
251
+ "total_target_phonemes": total_phonemes
252
+ },
253
+ "words": words_data
254
+ }
255
+
256
+ return final_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
analyzer/ASR_en_us_v3.py DELETED
@@ -1,320 +0,0 @@
1
- # ASR_en_us_v3.py
2
-
3
- import torch
4
- import soundfile as sf
5
- import librosa
6
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
7
- import os
8
- from phonemizer import phonemize
9
- import numpy as np
10
- from datetime import datetime, timezone
11
-
12
- # --- 全域設定 ---
13
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
- print(f"INFO: ASR_en_us_v3.py is configured to use device: {DEVICE}")
15
-
16
- # 【【【【【 關鍵修改 #1:更新為最終選定的模型名稱 】】】】】
17
- MODEL_NAME = "facebook/wav2vec2-lv-60-espeak-cv-ft"
18
-
19
- processor = None
20
- model = None
21
-
22
- # 【【【【【 新增程式碼 #1:IPA 淨化器相關的字典 】】】】】
23
-
24
- # 步驟 1a:定義一個權威的、我們認可的「標準美式英語 IPA 符號集」
25
- # 這個集合是我們的「白名單」
26
- VALID_ENGLISH_IPA = {
27
- # 元音 (Vowels)
28
- 'i', 'ɪ', 'e', 'ɛ', 'æ', 'a', 'ɑ', 'ɔ', 'o', 'ʊ', 'u', 'ʌ', 'ə', 'ɐ', 'ᵻ',
29
- # R音化元音 (R-colored Vowels)
30
- 'ɚ', 'ɝ',
31
- # 雙元音 (Diphthongs)
32
- 'aɪ', 'aʊ', 'ɔɪ', 'eɪ', 'oʊ', 'iə', 'eə', 'ʊə', 'ɛɹ', 'ɪɹ', 'ʊɹ', 'aɪɚ', 'aɪə',
33
- # 輔音 (Consonants)
34
- 'p', 'b', 't', 'd', 'k', 'ɡ', 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h', 'm', 'n', 'ŋ', 'l', 'ɹ', 'w', 'j',
35
- # 塞擦音 (Affricates)
36
- 'tʃ', 'dʒ',
37
- # 其他常見變體
38
- 'ɾ', 'ʔ', 'ɫ', 'n̩', 'l̩', 'r̩'
39
- }
40
-
41
- # 步驟 1b:建立「外語到英語」的映射規則字典
42
- # 這是我們的「重點觀察名單」或「黑名單轉換規則」
43
- NON_ENGLISH_TO_ENGLISH_MAP = {
44
- # 歐洲語言常見變體
45
- 'ʁ': 'ɹ', 'r': 'ɹ', 'β': 'v', 'x': 'h', 'ɣ': 'ɡ', 'ç': 'h', 'y': 'i', 'ø': 'e', 'œ': 'ɛ', 'ɒ': 'ɑ', 'əʊ': 'oʊ',
46
- # 鼻化元音 (去掉鼻化)
47
- 'ɑ̃': 'ɑ', 'ɔ̃': 'ɔ', 'ɛ̃': 'ɛ', 'œ̃': 'ɛ', 'ɐ̃': 'ɐ', 'õ': 'o', 'ĩ': 'i', 'ũ': 'u',
48
- # 亞洲/斯拉夫語系常見音 (映射到最接近的英語音)
49
- 'ɕ': 'ʃ', 'tɕ': 'tʃ', 'ʂ': 'ʃ', 'ʐ': 'ʒ', 'dʑ': 'dʒ',
50
- # 印地語捲舌音 (去掉捲舌特徵)
51
- 'ʈ': 't', 'ɖ': 'd', 'ɳ': 'n', 'ɭ': 'l', 'ɽ': 'ɾ',
52
- # 阿拉伯語系音
53
- 'ʕ': 'ʔ', 'ħ': 'h', 'q': 'k',
54
- # 其他...
55
- 'ʎ': 'j', 'ɲ': 'n', 'ʋ': 'v', 'c': 'k', 'ɟ': 'ɡ', 'ɸ': 'f', 'χ': 'h',
56
- }
57
-
58
- def load_model():
59
- """
60
- 載入 Facebook 的 Wav2Vec2 espeak ASR 模型。
61
- """
62
- global processor, model
63
- if processor and model:
64
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
65
- return True
66
-
67
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
68
- try:
69
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
70
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
71
-
72
- model.to(DEVICE)
73
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
74
- return True
75
- except Exception as e:
76
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
77
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
78
-
79
- # 【【【【【 新增程式碼 #2:IPA 淨化器函式 】】】】】
80
- def purify_ipa_sequence(raw_phonemes: list) -> list:
81
- """
82
- 將一個可能包含外語 IPA 的音素序列,淨化為只包含標準英語 IPA 的序列。
83
- """
84
- purified_phonemes = []
85
- for phoneme in raw_phonemes:
86
- if not phoneme: # 跳過空字串
87
- continue
88
-
89
- # 1. 如果音素本身就是合法的英語 IPA,直接接受
90
- if phoneme in VALID_ENGLISH_IPA:
91
- purified_phonemes.append(phoneme)
92
- continue
93
-
94
- # 2. 如果音素在我們的映射字典中,進行替換
95
- if phoneme in NON_ENGLISH_TO_ENGLISH_MAP:
96
- replacement = NON_ENGLISH_TO_ENGLISH_MAP[phoneme]
97
- purified_phonemes.append(replacement)
98
- # print(f"INFO: Replaced non-English IPA '{phoneme}' with '{replacement}'.") # 可選的日誌
99
- continue
100
-
101
- # 3. 處理帶有附加符號的音素 (例如長音 'ː', 顎化 'ʲ')
102
- # 簡化處理:直接去掉附加符號,看剩下的部分是否合法
103
- base_phoneme = phoneme.replace('ː', '').replace('ʲ', '').replace('ʰ', '')
104
- if base_phoneme in VALID_ENGLISH_IPA:
105
- purified_phonemes.append(base_phoneme)
106
- # print(f"INFO: Stripped diacritics from '{phoneme}' to '{base_phoneme}'.") # 可選的日誌
107
- continue
108
-
109
- # 4. 如果經過以上所有步驟仍然無法識別,作為最後手段,忽略該音素
110
- # print(f"WARNING: Unknown IPA phoneme '{phoneme}' encountered and was ignored.") # 可選的日誌
111
-
112
- return purified_phonemes
113
-
114
- # --- 2. 智能 IPA 切分函數 (與您的原版邏輯完全相同) ---
115
- MULTI_CHAR_PHONEMES = {
116
- 'tʃ', 'dʒ', 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', 'ɪə', 'eə', 'ʊə', 'ər',
117
- # 為 Facebook 模型輸出新增的組合
118
- 'ɑː', 'iː', 'uː', 'ɔː', 'ɜː', 'oː', 'eː', 'yː', 'øː', 'œː', 'ɛː', 'æː',
119
- 'ɑːɹ', 'ɔːɹ', 'oːɹ', 'ɛɹ', 'ɪɹ', 'ʊɹ', 'aɪɚ', 'aɪə'
120
- }
121
-
122
- def _tokenize_ipa(ipa_string: str) -> list:
123
- """
124
- 將 IPA 字串智能地切分為音素列表,能正確處理多字元音素。
125
- """
126
- phonemes = []
127
- i = 0
128
- s = ipa_string.replace(' ', '')
129
- while i < len(s):
130
- # 優先檢查三個字符的組合 (例如 ɑːɹ)
131
- if i + 2 < len(s) and s[i:i+3] in MULTI_CHAR_PHONEMES:
132
- phonemes.append(s[i:i+3])
133
- i += 3
134
- # 再檢查兩個字符的組合
135
- elif i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
136
- phonemes.append(s[i:i+2])
137
- i += 2
138
- else:
139
- phonemes.append(s[i])
140
- i += 1
141
- return phonemes
142
-
143
- # --- 3. 核心分析函數 (主入口) (已修改以整合淨化器) ---
144
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
145
- """
146
- 接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
147
- """
148
- if not processor or not model:
149
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
150
-
151
- # 步驟 1:獲取目標 IPA (與原版邏輯相同)
152
- target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
153
-
154
- # 【【【【【 關鍵修改 #2:完全遵循您對目標 IPA 的清理邏輯 】】】】】
155
- # 在切分前,移除所有重音和長音符號
156
- target_ipa_by_word = [
157
- _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
158
- for word in target_ipa_by_word_str
159
- ]
160
- target_words_original = target_sentence.split()
161
-
162
- # 步驟 2:讀取和重採樣音訊 (與原版邏輯相同)
163
- try:
164
- speech, sample_rate = sf.read(audio_file_path)
165
- if sample_rate != 16000:
166
- speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
167
- except Exception as e:
168
- raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
169
-
170
- # 步驟 3:使用 Wav2Vec2 模型進行預測
171
- input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
172
- input_values = input_values.to(DEVICE)
173
- with torch.no_grad():
174
- logits = model(input_values).logits
175
- predicted_ids = torch.argmax(logits, dim=-1)
176
-
177
- # 步驟 4:解碼得到原始的、可能混雜的音素序列
178
- raw_user_ipa_str = processor.batch_decode(predicted_ids[0])[0]
179
- raw_user_phonemes = raw_user_ipa_str.split(' ')
180
-
181
- # 【【【【【 關鍵修改 #3:在此處插入淨化步驟 】】】】】
182
- purified_user_phonemes = purify_ipa_sequence(raw_user_phonemes)
183
- user_ipa_full = "".join(purified_user_phonemes)
184
-
185
- # 步驟 5:使用淨化後的 IPA 進行音素對齊 (後續邏輯與原版完全相同)
186
- word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
187
-
188
- # 步驟 6:格式化為最終的 JSON 結構 (與原版邏輯完全相同)
189
- return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
190
-
191
-
192
- # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
193
- def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
194
- user_phonemes = _tokenize_ipa(user_phoneme_str)
195
-
196
- target_phonemes_flat = []
197
- word_boundaries_indices = []
198
- current_idx = 0
199
- for word_ipa_tokens in target_words_ipa_tokenized:
200
- target_phonemes_flat.extend(word_ipa_tokens)
201
- current_idx += len(word_ipa_tokens)
202
- word_boundaries_indices.append(current_idx - 1)
203
-
204
- dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
205
- for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
206
- for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
207
- for i in range(1, len(user_phonemes) + 1):
208
- for j in range(1, len(target_phonemes_flat) + 1):
209
- cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
210
- dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
211
-
212
- i, j = len(user_phonemes), len(target_phonemes_flat)
213
- user_path, target_path = [], []
214
- while i > 0 or j > 0:
215
- cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
216
- if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
217
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
218
- elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
219
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
220
- else:
221
- user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
222
-
223
- alignments_by_word = []
224
- word_start_idx_in_path = 0
225
- target_phoneme_counter_in_path = 0
226
-
227
- for path_idx, p in enumerate(target_path):
228
- if p != '-':
229
- if target_phoneme_counter_in_path in word_boundaries_indices:
230
- target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
231
- user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
232
-
233
- alignments_by_word.append({
234
- "target": target_alignment,
235
- "user": user_alignment
236
- })
237
-
238
- word_start_idx_in_path = path_idx + 1
239
-
240
- target_phoneme_counter_in_path += 1
241
-
242
- return alignments_by_word
243
-
244
- # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
245
- def _format_to_json_structure(alignments, sentence, original_words) -> dict:
246
- total_phonemes = 0
247
- total_errors = 0
248
- correct_words_count = 0
249
- words_data = []
250
-
251
- num_words_to_process = min(len(alignments), len(original_words))
252
-
253
- for i in range(num_words_to_process):
254
- alignment = alignments[i]
255
- word_is_correct = True
256
- phonemes_data = []
257
-
258
- for j in range(len(alignment['target'])):
259
- target_phoneme = alignment['target'][j]
260
- user_phoneme = alignment['user'][j]
261
- is_match = (user_phoneme == target_phoneme)
262
-
263
- phonemes_data.append({
264
- "target": target_phoneme,
265
- "user": user_phoneme,
266
- "isMatch": is_match
267
- })
268
-
269
- if not is_match:
270
- word_is_correct = False
271
- if not (user_phoneme == '-' and target_phoneme == '-'):
272
- total_errors += 1
273
-
274
- if word_is_correct:
275
- correct_words_count += 1
276
-
277
- words_data.append({
278
- "word": original_words[i],
279
- "isCorrect": word_is_correct,
280
- "phonemes": phonemes_data
281
- })
282
-
283
- total_phonemes += sum(1 for p in alignment['target'] if p != '-')
284
-
285
- total_words = len(original_words)
286
- if len(alignments) < total_words:
287
- for i in range(len(alignments), total_words):
288
- # 【【【【【 關鍵修改 #4:完全遵循您對遺漏單詞的清理邏輯 】】】】】
289
- missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
290
- missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
291
- phonemes_data = []
292
- for p_ipa in missed_word_ipa:
293
- phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
294
- total_errors += 1
295
- total_phonemes += 1
296
-
297
- words_data.append({
298
- "word": original_words[i],
299
- "isCorrect": False,
300
- "phonemes": phonemes_data
301
- })
302
-
303
- overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
304
- phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
305
-
306
- final_result = {
307
- "sentence": sentence,
308
- "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
309
- "summary": {
310
- "overallScore": round(overall_score, 1),
311
- "totalWords": total_words,
312
- "correctWords": correct_words_count,
313
- "phonemeErrorRate": round(phoneme_error_rate, 2),
314
- "total_errors": total_errors,
315
- "total_target_phonemes": total_phonemes
316
- },
317
- "words": words_data
318
- }
319
-
320
- return final_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
analyzer/ASR_fr_fr.py CHANGED
@@ -1,5 +1,3 @@
1
- # ASR_fr_fr.py
2
-
3
  import torch
4
  import soundfile as sf
5
  import librosa
@@ -17,86 +15,66 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
  print(f"INFO: ASR_fr_fr.py is configured to use device: {DEVICE}")
18
 
19
  # --- 1. 全域設定與模型載入函數 (已修改為法語模型) ---
 
 
20
  MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"
21
 
22
- processor = None
23
- model = None
24
-
25
- def load_model():
26
- """
27
- (方案 A) 讓 transformers 自動處理模型的下載、快取和加載。
28
- 它會自動使用 Dockerfile 中設定的 HF_HOME 環境變數。
29
- """
30
- global processor, model
31
- if processor and model:
32
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
33
- return True
34
-
35
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
36
- print(f"Transformers 將自動在 HF_HOME 指定的快取中尋找或下載。")
37
- try:
38
- # 直接使用模型的線上名稱調用 from_pretrained
39
- # 這就是魔法發生的地方!
40
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
41
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
42
-
43
- model.to(DEVICE)
44
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
45
- return True
46
- except Exception as e:
47
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
48
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
49
-
50
  def _tokenize_unicode_ipa(ipa_string: str) -> list:
51
  """
52
  智能地切分包含 Unicode 組合字元的 IPA 字串。
53
  """
54
  phonemes = []
55
- # 移除所有空格
56
  s = ipa_string.replace(' ', '')
57
 
58
  i = 0
59
  while i < len(s):
60
- # 獲取當前字元
61
  current_char = s[i]
62
  i += 1
63
- # 檢查後續是否有連續的組合字元
64
- while i < len(s) and unicodedata.category(s[i]) == 'Mn': # 'Mn' 代表非間距標記 (Non-Spacing Mark)
65
  current_char += s[i]
66
  i += 1
67
  phonemes.append(current_char)
68
  return phonemes
69
 
70
  # --- 2. 核心分析函數 (主入口) (已修改為法語邏輯) ---
71
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
 
72
  """
73
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
74
- 這是此模組的主要進入點。
75
  """
76
- if not processor or not model:
77
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
78
-
79
- # 【【【【【 關鍵修改 1:更智能地處理原始句子 】】】】】
80
- # 使用正則表達式來準確地分割單詞,並自動忽略標點符號
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
82
- # 將分割好的、乾淨的單詞重新組合,再傳給 phonemize
83
  cleaned_sentence = " ".join(target_words_original)
84
 
85
- # 使用 espeak 獲取法語目標音素
86
  epi_fr = epitran.Epitran('fra-Latn')
87
  target_ipa_full = epi_fr.transliterate(cleaned_sentence)
88
  target_ipa_by_word_str = target_ipa_full.split()
89
 
90
- # 【【【【【 確保兩個列表長度一致 】】】】】
91
  if len(target_ipa_by_word_str) != len(target_words_original):
92
  target_words_original = target_words_original[:len(target_ipa_by_word_str)]
93
 
94
- # 對於法語,我們將特殊符號移除,並使用簡單的字元切分
95
  target_ipa_by_word = [
96
  _tokenize_unicode_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('‿', '').replace("'", ""))
97
  for word in target_ipa_by_word_str
98
  ]
99
- # target_words_original 已經在上面被正確賦值了
100
 
101
  try:
102
  speech, sample_rate = sf.read(audio_file_path)
@@ -122,7 +100,6 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
122
  """
123
  執行音素對齊,對法語使用簡單的字元切分。
124
  """
125
- # 對於 user 的音素字串,也使用簡單的字元切分
126
  user_phonemes = _tokenize_unicode_ipa(user_phoneme_str)
127
 
128
  target_phonemes_flat = []
@@ -217,7 +194,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
217
  total_words = len(original_words)
218
  if len(alignments) < total_words:
219
  for i in range(len(alignments), total_words):
220
- # 確保這裡也移除相關符號
221
  missed_word_ipa_str = phonemize(original_words[i], language='fr-fr', backend='espeak', strip=True).replace('ˈ', '').replace('ˌ', '').replace('‿', '')
222
  missed_word_ipa = _tokenize_unicode_ipa(missed_word_ipa_str)
223
  phonemes_data = []
@@ -249,4 +225,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
249
  "words": words_data
250
  }
251
 
252
- return final_result
 
 
 
1
  import torch
2
  import soundfile as sf
3
  import librosa
 
15
  print(f"INFO: ASR_fr_fr.py is configured to use device: {DEVICE}")
16
 
17
  # --- 1. 全域設定與模型載入函數 (已修改為法語模型) ---
18
+ # 移除了全域的 processor 和 model 變數,只保留常數。
19
+ # 刪除了舊的 load_model() 函數。
20
  MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def _tokenize_unicode_ipa(ipa_string: str) -> list:
23
  """
24
  智能地切分包含 Unicode 組合字元的 IPA 字串。
25
  """
26
  phonemes = []
 
27
  s = ipa_string.replace(' ', '')
28
 
29
  i = 0
30
  while i < len(s):
 
31
  current_char = s[i]
32
  i += 1
33
+ while i < len(s) and unicodedata.category(s[i]) == 'Mn':
 
34
  current_char += s[i]
35
  i += 1
36
  phonemes.append(current_char)
37
  return phonemes
38
 
39
  # --- 2. 核心分析函數 (主入口) (已修改為法語邏輯) ---
40
+ # 將模型載入和快取邏輯合併至此。
41
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
42
  """
43
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
44
+ 模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
45
  """
46
+ # 檢查快取中是否已有模型,如果沒有則載入
47
+ if "model" not in cache:
48
+ print(f"快取未命中 (ASR_fr_fr)。正在載入模型 '{MODEL_NAME}'...")
49
+ try:
50
+ # 載入模型並存入此函數的快取字典
51
+ cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
52
+ cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
53
+ cache["model"].to(DEVICE)
54
+ print(f"模型 '{MODEL_NAME}' 已載入並快取。")
55
+ except Exception as e:
56
+ print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
57
+ raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
58
+
59
+ # 從此函數的獨立快取中獲取模型和處理器
60
+ processor = cache["processor"]
61
+ model = cache["model"]
62
+
63
+ # --- 以下為原始分析邏輯,保持不變 ---
64
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
 
65
  cleaned_sentence = " ".join(target_words_original)
66
 
 
67
  epi_fr = epitran.Epitran('fra-Latn')
68
  target_ipa_full = epi_fr.transliterate(cleaned_sentence)
69
  target_ipa_by_word_str = target_ipa_full.split()
70
 
 
71
  if len(target_ipa_by_word_str) != len(target_words_original):
72
  target_words_original = target_words_original[:len(target_ipa_by_word_str)]
73
 
 
74
  target_ipa_by_word = [
75
  _tokenize_unicode_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('‿', '').replace("'", ""))
76
  for word in target_ipa_by_word_str
77
  ]
 
78
 
79
  try:
80
  speech, sample_rate = sf.read(audio_file_path)
 
100
  """
101
  執行音素對齊,對法語使用簡單的字元切分。
102
  """
 
103
  user_phonemes = _tokenize_unicode_ipa(user_phoneme_str)
104
 
105
  target_phonemes_flat = []
 
194
  total_words = len(original_words)
195
  if len(alignments) < total_words:
196
  for i in range(len(alignments), total_words):
 
197
  missed_word_ipa_str = phonemize(original_words[i], language='fr-fr', backend='espeak', strip=True).replace('ˈ', '').replace('ˌ', '').replace('‿', '')
198
  missed_word_ipa = _tokenize_unicode_ipa(missed_word_ipa_str)
199
  phonemes_data = []
 
225
  "words": words_data
226
  }
227
 
228
+ return final_result
analyzer/ASR_jp_jp.py CHANGED
@@ -1,5 +1,3 @@
1
- # ASR_jp_jp.py
2
-
3
  # =======================================================================
4
  # 1. 匯入區 (Imports)
5
  # - 新增了 pyopenjtalk 和 MeCab
@@ -17,6 +15,7 @@ import re
17
 
18
  # =======================================================================
19
  # 2. 全域變數與配置區 (Global Variables & Config)
 
20
  # =======================================================================
21
  # 自動檢測可用設備
22
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -25,9 +24,6 @@ print(f"INFO: ASR_jp_jp.py is configured to use device: {DEVICE}")
25
  # 設定為日語 ASR 模型
26
  MODEL_NAME = "prj-beatrice/japanese-hubert-base-phoneme-ctc-v3"
27
 
28
- processor = None
29
- model = None
30
-
31
  # 初始化 MeCab 分詞器
32
  # -Owakati 選項能直接輸出以空格分隔的單詞,非常方便
33
  try:
@@ -42,30 +38,12 @@ except RuntimeError:
42
 
43
  # -----------------------------------------------------------------------
44
  # 3.1. 模型載入函數
45
- # - Wav2Vec2ForCTC 更換為 HubertForCTC
46
  # -----------------------------------------------------------------------
47
- def load_model():
48
- """
49
- 載入日語 ASR 模型 (HubertForCTC) 和對應的處理器。
50
- """
51
- global processor, model
52
- if processor and model:
53
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
54
- return True
55
-
56
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
57
- try:
58
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
59
- model = HubertForCTC.from_pretrained(MODEL_NAME) # <-- 使用 HubertForCTC
60
- model.to(DEVICE)
61
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
62
- return True
63
- except Exception as e:
64
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
65
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
66
 
67
  # -----------------------------------------------------------------------
68
  # 3.2. 日語 G2P 輔助函數 (此檔案最核心的修改)
 
69
  # -----------------------------------------------------------------------
70
  def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
71
  if not mecab_tagger:
@@ -82,8 +60,6 @@ def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]
82
 
83
  phonemes_str = pyopenjtalk.g2p(word, kana=False)
84
 
85
- # 【最終修正】完全不清理任何音素,直接使用原始輸出
86
- # 只做基本的空格標準化
87
  cleaned_phonemes = re.sub(r'\s+', ' ', phonemes_str).strip()
88
 
89
  phoneme_list = cleaned_phonemes.split()
@@ -96,6 +72,7 @@ def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]
96
 
97
  # -----------------------------------------------------------------------
98
  # 3.3. 音素切分函數 (用於處理 ASR 的輸出)
 
99
  # -----------------------------------------------------------------------
100
  def _tokenize_asr_output(phoneme_string: str) -> list:
101
  """
@@ -106,26 +83,40 @@ def _tokenize_asr_output(phoneme_string: str) -> list:
106
 
107
  # -----------------------------------------------------------------------
108
  # 3.4. 核心分析函數 (主入口)
 
109
  # -----------------------------------------------------------------------
110
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
111
  """
112
  接收音訊檔案路徑和目標日語句子,回傳詳細的發音分析字典。
 
113
  """
114
- if not processor or not model:
115
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  # 【關鍵步驟 1: G2P】
118
- # 使用新的 G2P 函數獲取目標單詞和音素
119
  target_words_original, target_ipa_by_word = _get_target_phonemes_by_word(target_sentence)
120
 
121
- # 處理音訊檔案為空或句子為空的邊界情況
122
  if not target_words_original:
123
  print("警告: G2P 處理後目標句子為空。")
124
- # 建立一個空的骨架結構返回
125
  return _format_to_json_structure([], target_sentence, [])
126
 
127
  # 【關鍵步驟 2: ASR】
128
- # 載入並處理音訊
129
  try:
130
  speech, sample_rate = sf.read(audio_file_path)
131
  if len(speech) == 0:
@@ -135,7 +126,6 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
135
  if sample_rate != 16000:
136
  speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
137
 
138
- # 進行 ASR 推論
139
  input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
140
  input_values = input_values.to(DEVICE)
141
  with torch.no_grad():
@@ -147,50 +137,34 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
147
  raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
148
 
149
  # 【關鍵步驟 3: 對齊】
150
- # 執行音素對齊
151
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
152
 
153
  # 【關鍵步驟 4: 格式化】
154
- # 格式化為最終的 JSON 輸出
155
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
156
 
157
  # =======================================================================
158
  # 4. 對齊與格式化函數區 (Alignment & Formatting)
159
- # 【注意】這些函數是語言無關的,直接從 en_us/fr_fr 版本複製而來。
160
  # =======================================================================
161
 
162
  # -----------------------------------------------------------------------
163
  # 4.1. 對齊函數 (語言無關)
164
  # -----------------------------------------------------------------------
165
- # 【【【【【 最終的、決定性的日文版邏輯修正 】】】】】
166
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
167
  """
168
  使用動態規劃執行音素對齊。此函數是語言無關的。
169
  """
170
- # 【【【【【 關鍵修改 】】】】】
171
- # 舊的錯誤做法:user_phonemes = user_phoneme_str.split()
172
- # 這只會得到 ['a', 'sh', 'i', 't', 'a'] 這樣的列表。
173
-
174
- # 新的正確做法:
175
- # 1. 先按空格分割成 "音素單詞"。
176
- # 2. 再將每個 "音素單詞" 徹底地展開成單個音素字元。
177
- # 例如,"a sh i t a" -> ['a', 'sh', 'i', 't', 'a'] -> ['a', 's', 'h', 'i', 't', 'a']
178
- # 這與英文版的 _tokenize_ipa() 達成了相同的效果:在對齊前就切分到最小單元。
179
  user_phonemes = [char for word in user_phoneme_str.split() for char in word]
180
 
181
- # --- 後續的對齊邏輯完全保持不變 ---
182
-
183
  target_phonemes_flat = []
184
  word_boundaries_indices = []
185
  current_idx = 0
186
  for word_ipa_tokens in target_words_ipa_tokenized:
187
- # 對於 target,我們也需要確保它是最小單元
188
  flat_tokens = [char for word in word_ipa_tokens for char in word]
189
  target_phonemes_flat.extend(flat_tokens)
190
  current_idx += len(flat_tokens)
191
  word_boundaries_indices.append(current_idx - 1)
192
 
193
- # 如果目標音素為空,返回空對齊
194
  if not target_phonemes_flat:
195
  return []
196
 
@@ -261,7 +235,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
261
  word_is_correct = True
262
  phonemes_data = []
263
 
264
- # 確保 alignment['target'] 和 alignment['user'] 長度相同
265
  min_len = min(len(alignment['target']), len(alignment['user']))
266
  for j in range(min_len):
267
  target_phoneme = alignment['target'][j]
@@ -276,7 +249,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
276
 
277
  if not is_match:
278
  word_is_correct = False
279
- # 只有在 target 和 user 不都為 '-' 時才算作錯誤
280
  if not (user_phoneme == '-' and target_phoneme == '-'):
281
  total_errors += 1
282
 
@@ -291,14 +263,12 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
291
 
292
  total_phonemes += sum(1 for p in alignment['target'] if p != '-')
293
 
294
- # 【Fuse Logic】處理 ASR 結果比目標單詞少的情況 (使用者漏講了單詞)
295
  if len(alignments) < len(original_words):
296
  for i in range(len(alignments), len(original_words)):
297
- # 重新獲取漏掉單詞的音素
298
  _, missed_word_ipa_list = _get_target_phonemes_by_word(original_words[i])
299
 
300
  phonemes_data = []
301
- if missed_word_ipa_list: # 確保列表不是空的
302
  for p_ipa in missed_word_ipa_list[0]:
303
  phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
304
  total_errors += 1
 
 
 
1
  # =======================================================================
2
  # 1. 匯入區 (Imports)
3
  # - 新增了 pyopenjtalk 和 MeCab
 
15
 
16
  # =======================================================================
17
  # 2. 全域變數與配置區 (Global Variables & Config)
18
+ # 【已修改】移除了全域的 processor 和 model 變數。
19
  # =======================================================================
20
  # 自動檢測可用設備
21
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
24
  # 設定為日語 ASR 模型
25
  MODEL_NAME = "prj-beatrice/japanese-hubert-base-phoneme-ctc-v3"
26
 
 
 
 
27
  # 初始化 MeCab 分詞器
28
  # -Owakati 選項能直接輸出以空格分隔的單詞,非常方便
29
  try:
 
38
 
39
  # -----------------------------------------------------------------------
40
  # 3.1. 模型載入函數
41
+ # 【已刪除】舊的 load_model() 函數已被移除。
42
  # -----------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # -----------------------------------------------------------------------
45
  # 3.2. 日語 G2P 輔助函數 (此檔案最核心的修改)
46
+ # 【保持不變】
47
  # -----------------------------------------------------------------------
48
  def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
49
  if not mecab_tagger:
 
60
 
61
  phonemes_str = pyopenjtalk.g2p(word, kana=False)
62
 
 
 
63
  cleaned_phonemes = re.sub(r'\s+', ' ', phonemes_str).strip()
64
 
65
  phoneme_list = cleaned_phonemes.split()
 
72
 
73
  # -----------------------------------------------------------------------
74
  # 3.3. 音素切分函數 (用於處理 ASR 的輸出)
75
+ # 【保持不變】
76
  # -----------------------------------------------------------------------
77
  def _tokenize_asr_output(phoneme_string: str) -> list:
78
  """
 
83
 
84
  # -----------------------------------------------------------------------
85
  # 3.4. 核心分析函數 (主入口)
86
+ # 【已修改】將模型載入和快取邏輯合併至此。
87
  # -----------------------------------------------------------------------
88
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
89
  """
90
  接收音訊檔案路徑和目標日語句子,回傳詳細的發音分析字典。
91
+ 模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
92
  """
93
+ # 檢查快取中是否已有模型,如果沒有則載入
94
+ if "model" not in cache:
95
+ print(f"快取未命中 (ASR_jp_jp)。正在載入模型 '{MODEL_NAME}'...")
96
+ try:
97
+ # 載入模型並存入此函數的快取字典
98
+ cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
99
+ cache["model"] = HubertForCTC.from_pretrained(MODEL_NAME) # <-- 使用 HubertForCTC
100
+ cache["model"].to(DEVICE)
101
+ print(f"模型 '{MODEL_NAME}' 已載入並快取。")
102
+ except Exception as e:
103
+ print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
104
+ raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
105
+
106
+ # 從此函數的獨立快取中獲取模型和處理器
107
+ processor = cache["processor"]
108
+ model = cache["model"]
109
+
110
+ # --- 以下為原始分析邏輯,保持不變 ---
111
 
112
  # 【關鍵步驟 1: G2P】
 
113
  target_words_original, target_ipa_by_word = _get_target_phonemes_by_word(target_sentence)
114
 
 
115
  if not target_words_original:
116
  print("警告: G2P 處理後目標句子為空。")
 
117
  return _format_to_json_structure([], target_sentence, [])
118
 
119
  # 【關鍵步驟 2: ASR】
 
120
  try:
121
  speech, sample_rate = sf.read(audio_file_path)
122
  if len(speech) == 0:
 
126
  if sample_rate != 16000:
127
  speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
128
 
 
129
  input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
130
  input_values = input_values.to(DEVICE)
131
  with torch.no_grad():
 
137
  raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
138
 
139
  # 【關鍵步驟 3: 對齊】
 
140
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
141
 
142
  # 【關鍵步驟 4: 格式化】
 
143
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
144
 
145
  # =======================================================================
146
  # 4. 對齊與格式化函數區 (Alignment & Formatting)
147
+ # 【保持不變】
148
  # =======================================================================
149
 
150
  # -----------------------------------------------------------------------
151
  # 4.1. 對齊函數 (語言無關)
152
  # -----------------------------------------------------------------------
 
153
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
154
  """
155
  使用動態規劃執行音素對齊。此函數是語言無關的。
156
  """
 
 
 
 
 
 
 
 
 
157
  user_phonemes = [char for word in user_phoneme_str.split() for char in word]
158
 
 
 
159
  target_phonemes_flat = []
160
  word_boundaries_indices = []
161
  current_idx = 0
162
  for word_ipa_tokens in target_words_ipa_tokenized:
 
163
  flat_tokens = [char for word in word_ipa_tokens for char in word]
164
  target_phonemes_flat.extend(flat_tokens)
165
  current_idx += len(flat_tokens)
166
  word_boundaries_indices.append(current_idx - 1)
167
 
 
168
  if not target_phonemes_flat:
169
  return []
170
 
 
235
  word_is_correct = True
236
  phonemes_data = []
237
 
 
238
  min_len = min(len(alignment['target']), len(alignment['user']))
239
  for j in range(min_len):
240
  target_phoneme = alignment['target'][j]
 
249
 
250
  if not is_match:
251
  word_is_correct = False
 
252
  if not (user_phoneme == '-' and target_phoneme == '-'):
253
  total_errors += 1
254
 
 
263
 
264
  total_phonemes += sum(1 for p in alignment['target'] if p != '-')
265
 
 
266
  if len(alignments) < len(original_words):
267
  for i in range(len(alignments), len(original_words)):
 
268
  _, missed_word_ipa_list = _get_target_phonemes_by_word(original_words[i])
269
 
270
  phonemes_data = []
271
+ if missed_word_ipa_list:
272
  for p_ipa in missed_word_ipa_list[0]:
273
  phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
274
  total_errors += 1
analyzer/ASR_nl_nl.py CHANGED
@@ -20,40 +20,17 @@ import unicodedata # 【保留】這是處理多語言音素的更優方案
20
  import re # 【保留】用於更準確地切分單詞
21
 
22
  # --- 2. 全域設定與模型載入 ---
 
 
23
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"INFO: ASR_nl_nl.py is configured to use device: {DEVICE}")
25
 
26
  # 【關鍵修改 1:設定為荷蘭語 ASR 模型】
27
  MODEL_NAME = "Clementapa/wav2vec2-base-960h-phoneme-reco-dutch"
28
 
29
- processor = None
30
- model = None
31
-
32
- def load_model():
33
- """
34
- 載入荷蘭語 ASR 模型和對應的處理器。
35
- (此函數邏輯與 en_us.py 完全相同)
36
- """
37
- global processor, model
38
- if processor and model:
39
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
40
- return True
41
-
42
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
43
- try:
44
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
45
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
46
- model.to(DEVICE)
47
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
48
- return True
49
- except Exception as e:
50
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
51
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
52
-
53
  # --- 3. 智能 IPA 切分函數 ---
54
  # 【關鍵修改 2:保留更優越的通用切分邏輯】
55
- # 雖然此函數的實現比英文版的更複雜,但它更健壯且適用於包括荷蘭語在內的多種語言。
56
- # 這是為了「fit with Dutch」而必須保留的優化。
57
  def _tokenize_ipa(ipa_string: str) -> list:
58
  """
59
  將 IPA 字串智能地切分為音素列表,能正確處理帶有附加符號的組合字符。
@@ -64,7 +41,6 @@ def _tokenize_ipa(ipa_string: str) -> list:
64
  while i < len(s):
65
  current_char = s[i]
66
  i += 1
67
- # 檢查並組合後續的非間距標記 (例如變音符)
68
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
69
  current_char += s[i]
70
  i += 1
@@ -72,16 +48,31 @@ def _tokenize_ipa(ipa_string: str) -> list:
72
  return phonemes
73
 
74
  # --- 4. 核心分析函數 (主入口) ---
75
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
 
76
  """
77
  接收音訊檔案路徑和目標荷蘭語句子,回傳詳細的發音分析字典。
78
- (此函數結構與 en_us.py 完全對齊)
79
  """
80
- if not processor or not model:
81
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
 
83
  # 1. 準備目標音素 (G2P)
84
- # 使用正則表達式準確切分單詞,這比簡單的 .split() 更穩健
85
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
86
  cleaned_sentence = " ".join(target_words_original)
87
 
@@ -94,7 +85,6 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
94
  strip=True
95
  ).split()
96
 
97
- # 健壯性檢查:確保單詞和音素列表長度一致
98
  if len(target_words_original) != len(target_ipa_by_word_str):
99
  print(f"警告: G2P 後單詞數量 ({len(target_ipa_by_word_str)}) 與原始單詞數量 ({len(target_words_original)}) 不匹配。將進行截斷。")
100
  min_len = min(len(target_words_original), len(target_ipa_by_word_str))
@@ -122,7 +112,6 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
122
  predicted_ids = torch.argmax(logits, dim=-1)
123
 
124
  # 【關鍵修改 5:與 en_us.py 對齊,假設模型輸出是乾淨的,或在必要時清理】
125
- # 移除模型可能產生的分隔符 |,並確保也移除長音符號,以匹配目標音素的處理方式
126
  user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '').replace('ː', '')
127
 
128
  # 3. 執行對齊並格式化輸出
@@ -131,6 +120,7 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
131
 
132
 
133
  # --- 5. 對齊函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
134
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
135
  """
136
  使用動態規劃執行音素對齊。
@@ -157,16 +147,12 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
157
  i, j = len(user_phonemes), len(target_phonemes_flat)
158
  user_path, target_path = [], []
159
  while i > 0 or j > 0:
160
- # 使用與 en_us.py 相同的、更簡潔的回溯邏輯
161
  cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
162
 
163
- # 優先匹配/替換
164
  if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
165
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
166
- # 其次是刪除 (user 多)
167
  elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
168
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
169
- # 最後是插入 (target 多)
170
  else:
171
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
172
 
@@ -192,6 +178,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
192
  return alignments_by_word
193
 
194
  # --- 6. 格式化函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
195
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
196
  """
197
  將對齊結果格式化為最終的 JSON 結構。
@@ -222,7 +209,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
222
 
223
  if not is_match:
224
  word_is_correct = False
225
- # 只有在不是「目標和用戶都為空」的情況下才計為錯誤
226
  if not (user_phoneme == '-' and target_phoneme == '-'):
227
  total_errors += 1
228
 
@@ -237,7 +223,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
237
 
238
  total_phonemes += sum(1 for p in alignment['target'] if p != '-')
239
 
240
- # 處理使用者漏講單詞的情況
241
  if len(alignments) < len(original_words):
242
  for i in range(len(alignments), len(original_words)):
243
  # 【關鍵修改 6:確保此處的 G2P 語言和符號清理也保持一致】
 
20
  import re # 【保留】用於更準確地切分單詞
21
 
22
  # --- 2. 全域設定與模型載入 ---
23
+ # 【已修改】移除了全域的 processor 和 model 變數。
24
+ # 【已修改】刪除了舊的 load_model() 函數。
25
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
26
  print(f"INFO: ASR_nl_nl.py is configured to use device: {DEVICE}")
27
 
28
  # 【關鍵修改 1:設定為荷蘭語 ASR 模型】
29
  MODEL_NAME = "Clementapa/wav2vec2-base-960h-phoneme-reco-dutch"
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # --- 3. 智能 IPA 切分函數 ---
32
  # 【關鍵修改 2:保留更優越的通用切分邏輯】
33
+ # 【保持不變】
 
34
  def _tokenize_ipa(ipa_string: str) -> list:
35
  """
36
  將 IPA 字串智能地切分為音素列表,能正確處理帶有附加符號的組合字符。
 
41
  while i < len(s):
42
  current_char = s[i]
43
  i += 1
 
44
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
45
  current_char += s[i]
46
  i += 1
 
48
  return phonemes
49
 
50
  # --- 4. 核心分析函數 (主入口) ---
51
+ # 【已修改】將模型載入和快取邏輯合併至此。
52
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
53
  """
54
  接收音訊檔案路徑和目標荷蘭語句子,回傳詳細的發音分析字典。
55
+ 模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
56
  """
57
+ # 檢查快取中是否已有模型,如果沒有則載入
58
+ if "model" not in cache:
59
+ print(f"快取未命中 (ASR_nl_nl)。正在載入模型 '{MODEL_NAME}'...")
60
+ try:
61
+ # 載入模型並存入此函數的快取字典
62
+ cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
63
+ cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
64
+ cache["model"].to(DEVICE)
65
+ print(f"模型 '{MODEL_NAME}' 已載入並快取。")
66
+ except Exception as e:
67
+ print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
68
+ raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
69
+
70
+ # 從此函數的獨立快取中獲取模型和處理器
71
+ processor = cache["processor"]
72
+ model = cache["model"]
73
 
74
+ # --- 以下為原始分析邏輯,保持不變 ---
75
  # 1. 準備目標音素 (G2P)
 
76
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
77
  cleaned_sentence = " ".join(target_words_original)
78
 
 
85
  strip=True
86
  ).split()
87
 
 
88
  if len(target_words_original) != len(target_ipa_by_word_str):
89
  print(f"警告: G2P 後單詞數量 ({len(target_ipa_by_word_str)}) 與原始單詞數量 ({len(target_words_original)}) 不匹配。將進行截斷。")
90
  min_len = min(len(target_words_original), len(target_ipa_by_word_str))
 
112
  predicted_ids = torch.argmax(logits, dim=-1)
113
 
114
  # 【關鍵修改 5:與 en_us.py 對齊,假設模型輸出是乾淨的,或在必要時清理】
 
115
  user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '').replace('ː', '')
116
 
117
  # 3. 執行對齊並格式化輸出
 
120
 
121
 
122
  # --- 5. 對齊函數 (與 en_us.py 的實現邏輯完全對齊) ---
123
+ # 【保持不變】
124
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
125
  """
126
  使用動態規劃執行音素對齊。
 
147
  i, j = len(user_phonemes), len(target_phonemes_flat)
148
  user_path, target_path = [], []
149
  while i > 0 or j > 0:
 
150
  cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
151
 
 
152
  if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
153
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
 
154
  elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
155
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
 
156
  else:
157
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
158
 
 
178
  return alignments_by_word
179
 
180
  # --- 6. 格式化函數 (與 en_us.py 的實現邏輯完全對齊) ---
181
+ # 【保持不變】
182
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
183
  """
184
  將對齊結果格式化為最終的 JSON 結構。
 
209
 
210
  if not is_match:
211
  word_is_correct = False
 
212
  if not (user_phoneme == '-' and target_phoneme == '-'):
213
  total_errors += 1
214
 
 
223
 
224
  total_phonemes += sum(1 for p in alignment['target'] if p != '-')
225
 
 
226
  if len(alignments) < len(original_words):
227
  for i in range(len(alignments), len(original_words)):
228
  # 【關鍵修改 6:確保此處的 G2P 語言和符號清理也保持一致】
analyzer/ASR_pt_br.py CHANGED
@@ -20,40 +20,17 @@ import unicodedata # 【保留】這是處理葡萄牙語鼻音等音素的更
20
  import re # 【保留】用於更準確地切分單詞
21
 
22
  # --- 2. 全域設定與模型載入 ---
 
 
23
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"INFO: ASR_pt_br.py is configured to use device: {DEVICE}")
25
 
26
  # 【關鍵修改 1:設定為葡萄牙語 ASR 模型】
27
  MODEL_NAME = "caiocrocha/wav2vec2-large-xlsr-53-phoneme-portuguese"
28
 
29
- processor = None
30
- model = None
31
-
32
- def load_model():
33
- """
34
- 載入葡萄牙語 ASR 模型和對應的處理器。
35
- (此函數邏輯與 en_us.py 完全相同)
36
- """
37
- global processor, model
38
- if processor and model:
39
- print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
40
- return True
41
-
42
- print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
43
- try:
44
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
45
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
46
- model.to(DEVICE)
47
- print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
48
- return True
49
- except Exception as e:
50
- print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
51
- raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
52
-
53
  # --- 3. 智能 IPA 切分函數 ---
54
  # 【關鍵修改 2:保留更優越的通用切分邏輯】
55
- # 為了正確處理葡萄牙語的鼻化元音 (如 ɐ̃) 和塞擦音 (如 dʒ),
56
- # 必須保留這個比英文版更強大的切分函數。
57
  def _tokenize_ipa(ipa_string: str) -> list:
58
  """
59
  將 IPA 字串智能地切分為音素列表,能正確處理帶有附加符號的組合字符。
@@ -62,13 +39,11 @@ def _tokenize_ipa(ipa_string: str) -> list:
62
  s = ipa_string.replace(' ', '')
63
  i = 0
64
  while i < len(s):
65
- # 優先處理葡萄牙語中常見的雙字符塞擦音
66
  if i + 1 < len(s) and s[i:i+2] in {'dʒ', 'tʃ'}:
67
  phonemes.append(s[i:i+2])
68
  i += 2
69
  continue
70
 
71
- # 處理基礎字符及其後續的非間距標記 (例如鼻化符 ~)
72
  current_char = s[i]
73
  i += 1
74
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
@@ -78,14 +53,30 @@ def _tokenize_ipa(ipa_string: str) -> list:
78
  return phonemes
79
 
80
  # --- 4. 核心分析函數 (主入口) ---
81
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
 
82
  """
83
  接收音訊檔案路徑和目標葡萄牙語句子,回傳詳細的發音分析字典。
84
- (此函數結構與 en_us.py 完全對齊)
85
  """
86
- if not processor or not model:
87
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
88
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  # 1. 準備目標音素 (G2P)
90
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
91
  cleaned_sentence = " ".join(target_words_original)
@@ -134,6 +125,7 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
134
 
135
 
136
  # --- 5. 對齊函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
137
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
138
  """
139
  使用動態規劃執行音素對齊。
@@ -185,6 +177,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
185
  return alignments_by_word
186
 
187
  # --- 6. 格式化函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
188
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
189
  """
190
  將對齊結果格式化為最終的 JSON 結構。
@@ -242,4 +235,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
242
  "total_target_phonemes": total_phonemes
243
  },
244
  "words": words_data
245
- }
 
20
  import re # 【保留】用於更準確地切分單詞
21
 
22
  # --- 2. 全域設定與模型載入 ---
23
+ # 【已修改】移除了全域的 processor 和 model 變數。
24
+ # 【已修改】刪除了舊的 load_model() 函數。
25
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
26
  print(f"INFO: ASR_pt_br.py is configured to use device: {DEVICE}")
27
 
28
  # 【關鍵修改 1:設定為葡萄牙語 ASR 模型】
29
  MODEL_NAME = "caiocrocha/wav2vec2-large-xlsr-53-phoneme-portuguese"
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # --- 3. 智能 IPA 切分函數 ---
32
  # 【關鍵修改 2:保留更優越的通用切分邏輯】
33
+ # 【保持不變】
 
34
  def _tokenize_ipa(ipa_string: str) -> list:
35
  """
36
  將 IPA 字串智能地切分為音素列表,能正確處理帶有附加符號的組合字符。
 
39
  s = ipa_string.replace(' ', '')
40
  i = 0
41
  while i < len(s):
 
42
  if i + 1 < len(s) and s[i:i+2] in {'dʒ', 'tʃ'}:
43
  phonemes.append(s[i:i+2])
44
  i += 2
45
  continue
46
 
 
47
  current_char = s[i]
48
  i += 1
49
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
 
53
  return phonemes
54
 
55
  # --- 4. 核心分析函數 (主入口) ---
56
+ # 【已修改】將模型載入和快取邏輯合併至此。
57
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
58
  """
59
  接收音訊檔案路徑和目標葡萄牙語句子,回傳詳細的發音分析字典。
60
+ 模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
61
  """
62
+ # 檢查快取中是否已有模型,如果沒有則載入
63
+ if "model" not in cache:
64
+ print(f"快取未命中 (ASR_pt_br)。正在載入模型 '{MODEL_NAME}'...")
65
+ try:
66
+ # 載入模型並存入此函數的快取字典
67
+ cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
68
+ cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
69
+ cache["model"].to(DEVICE)
70
+ print(f"模型 '{MODEL_NAME}' 已載入並快取。")
71
+ except Exception as e:
72
+ print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
73
+ raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
74
+
75
+ # 從此函數的獨立快取中獲取模型和處理器
76
+ processor = cache["processor"]
77
+ model = cache["model"]
78
+
79
+ # --- 以下為原始分析邏輯,保持不變 ---
80
  # 1. 準備目標音素 (G2P)
81
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
82
  cleaned_sentence = " ".join(target_words_original)
 
125
 
126
 
127
  # --- 5. 對齊函數 (與 en_us.py 的實現邏輯完全對齊) ---
128
+ # 【保持不變】
129
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
130
  """
131
  使用動態規劃執行音素對齊。
 
177
  return alignments_by_word
178
 
179
  # --- 6. 格式化函數 (與 en_us.py 的實現邏輯完全對齊) ---
180
+ # 【保持不變】
181
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
182
  """
183
  將對齊結果格式化為最終的 JSON 結構。
 
235
  "total_target_phonemes": total_phonemes
236
  },
237
  "words": words_data
238
+ }
main.py CHANGED
@@ -77,7 +77,6 @@ async def lifespan(app: FastAPI):
77
  try:
78
  print(f"--- Loading model for language: {lang} ---")
79
  analyzer_module = importlib.import_module(f"analyzer.ASR_{lang}")
80
- analyzer_module.load_model()
81
  ANALYZERS[lang] = analyzer_module
82
  print(f"--- Model for {lang} loaded successfully. ---")
83
  except Exception as e:
@@ -127,7 +126,6 @@ def get_analyzer_module(language: str):
127
  print(f"'{language}' not in cache. Loading on-demand (development mode)...")
128
  try:
129
  analyzer_module = importlib.import_module(f"analyzer.ASR_{language}")
130
- analyzer_module.load_model()
131
  ANALYZERS[language] = analyzer_module
132
  print(f"'{language}' analyzer loaded and cached successfully.")
133
  return analyzer_module
 
77
  try:
78
  print(f"--- Loading model for language: {lang} ---")
79
  analyzer_module = importlib.import_module(f"analyzer.ASR_{lang}")
 
80
  ANALYZERS[lang] = analyzer_module
81
  print(f"--- Model for {lang} loaded successfully. ---")
82
  except Exception as e:
 
126
  print(f"'{language}' not in cache. Loading on-demand (development mode)...")
127
  try:
128
  analyzer_module = importlib.import_module(f"analyzer.ASR_{language}")
 
129
  ANALYZERS[language] = analyzer_module
130
  print(f"'{language}' analyzer loaded and cached successfully.")
131
  return analyzer_module