HK0712 committed on
Commit 67ff0ae · 1 Parent(s): 4701923

ADD: dutch

Files changed (1)
  1. analyzer/ASR_nl_nl.py +205 -0
analyzer/ASR_nl_nl.py ADDED
@@ -0,0 +1,205 @@
# =======================================================================
# analyzer/ASR_nl_nl.py
# Dutch pronunciation analyzer
# Final revised version - uses the correct, user-specified model
# =======================================================================

# 1. Imports
import torch
import soundfile as sf
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import os
from phonemizer import phonemize
import numpy as np
from datetime import datetime, timezone
import re
import unicodedata

# =======================================================================
# 2. Global variables and configuration
# =======================================================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"INFO: ASR_nl_nl.py is configured to use device: {DEVICE}")

# Final, decisive fix: use the correct, user-specified Dutch phoneme model.
MODEL_NAME = "Clementapa/wav2vec2-base-960h-phoneme-reco-dutch"

processor = None
model = None

# =======================================================================
# 3. Core business logic
# =======================================================================

# -----------------------------------------------------------------------
# 3.1. Model loading function (logic unchanged)
# -----------------------------------------------------------------------
def load_model():
    """
    Load the Dutch ASR model and its corresponding processor.
    """
    global processor, model
    if processor and model:
        print(f"Model '{MODEL_NAME}' is already loaded, skipping.")
        return True

    print(f"Preparing ASR model '{MODEL_NAME}'...")
    try:
        processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
        model.to(DEVICE)
        print(f"Model '{MODEL_NAME}' and processor loaded successfully!")
        return True
    except Exception as e:
        print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
        raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")

# -----------------------------------------------------------------------
# 3.2. Generic IPA tokenization function (logic unchanged)
# -----------------------------------------------------------------------
def _tokenize_ipa(ipa_string: str) -> list:
    """
    Split an IPA string into a list of phonemes, correctly handling
    combining characters for any language.
    """
    phonemes = []
    s = ipa_string.replace(' ', '')
    i = 0
    while i < len(s):
        current_char = s[i]
        i += 1
        # Attach any trailing combining marks (Unicode category 'Mn') to the base character.
        while i < len(s) and unicodedata.category(s[i]) == 'Mn':
            current_char += s[i]
            i += 1
        phonemes.append(current_char)
    return phonemes

# -----------------------------------------------------------------------
# 3.3. Core analysis function (logic unchanged)
# -----------------------------------------------------------------------
def analyze(audio_file_path: str, target_sentence: str) -> dict:
    """
    Take an audio file path and a target Dutch sentence, and return a
    detailed pronunciation analysis dictionary.
    """
    if not processor or not model:
        raise RuntimeError("Model not loaded yet. Make sure load_model() has completed successfully before calling analyze().")

    target_words_original = re.findall(r"[\w'-]+", target_sentence)
    cleaned_sentence = " ".join(target_words_original)

    target_ipa_by_word_str = phonemize(cleaned_sentence, language='nl', backend='espeak', with_stress=True, strip=True).split()

    if len(target_words_original) != len(target_ipa_by_word_str):
        print(f"Warning: the word count after G2P ({len(target_ipa_by_word_str)}) does not match the original word count ({len(target_words_original)}).")
        min_len = min(len(target_words_original), len(target_ipa_by_word_str))
        target_words_original = target_words_original[:min_len]
        target_ipa_by_word_str = target_ipa_by_word_str[:min_len]

    target_ipa_by_word = [
        _tokenize_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('ː', ''))
        for word in target_ipa_by_word_str
    ]

    try:
        speech, sample_rate = sf.read(audio_file_path)
        if sample_rate != 16000:
            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
    except Exception as e:
        raise IOError(f"Error while reading or processing the audio: {e}")

    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(DEVICE)
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '')

    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)


# =======================================================================
# 4. Alignment and formatting functions (language-agnostic, logic unchanged)
# =======================================================================

def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
    user_phonemes = _tokenize_ipa(user_phoneme_str)
    target_phonemes_flat = [p for word in target_words_ipa_tokenized for p in word]
    word_boundaries_indices = np.cumsum([len(word) for word in target_words_ipa_tokenized]) - 1
    # Levenshtein-style dynamic programming table over user vs. target phonemes.
    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
    for i in range(1, len(user_phonemes) + 1):
        for j in range(1, len(target_phonemes_flat) + 1):
            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
    # Backtrack to recover the aligned paths; '-' marks an insertion or deletion.
    i, j = len(user_phonemes), len(target_phonemes_flat)
    user_path, target_path = [], []
    while i > 0 or j > 0:
        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
        elif i > 0 and (j == 0 or dp[i][j] == dp[i-1][j] + 1):
            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
        elif j > 0 and (i == 0 or dp[i][j] == dp[i][j-1] + 1):
            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
        else: break
    # Split the aligned paths back into per-word segments at the target word boundaries.
    alignments_by_word = []
    word_start_idx_in_path = 0
    target_phoneme_counter_in_path = 0
    word_boundary_iter = iter(word_boundaries_indices)
    current_word_boundary = next(word_boundary_iter, -1)
    for path_idx, p in enumerate(target_path):
        if p != '-':
            if target_phoneme_counter_in_path == current_word_boundary:
                alignments_by_word.append({
                    "target": target_path[word_start_idx_in_path : path_idx + 1],
                    "user": user_path[word_start_idx_in_path : path_idx + 1]
                })
                word_start_idx_in_path = path_idx + 1
                current_word_boundary = next(word_boundary_iter, -1)
            target_phoneme_counter_in_path += 1
    return alignments_by_word

def _format_to_json_structure(alignments, sentence, original_words) -> dict:
    total_phonemes, total_errors, correct_words_count = 0, 0, 0
    words_data = []
    num_words_to_process = min(len(alignments), len(original_words))
    for i in range(num_words_to_process):
        alignment = alignments[i]
        word_is_correct = True
        phonemes_data = []
        min_len = min(len(alignment['target']), len(alignment['user']))
        for j in range(min_len):
            target_phoneme, user_phoneme = alignment['target'][j], alignment['user'][j]
            is_match = (user_phoneme == target_phoneme)
            phonemes_data.append({"target": target_phoneme, "user": user_phoneme, "isMatch": is_match})
            if not is_match:
                word_is_correct = False
                if not (user_phoneme == '-' and target_phoneme == '-'): total_errors += 1
        if word_is_correct: correct_words_count += 1
        words_data.append({"word": original_words[i], "isCorrect": word_is_correct, "phonemes": phonemes_data})
        total_phonemes += sum(1 for p in alignment['target'] if p != '-')
    if len(alignments) < len(original_words):
        for i in range(len(alignments), len(original_words)):
            missed_word_ipa_str = phonemize(original_words[i], language='nl', backend='espeak', strip=True).replace('ː', '')
            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
            phonemes_data = []
            for p_ipa in missed_word_ipa:
                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
                total_errors += 1
                total_phonemes += 1
            words_data.append({"word": original_words[i], "isCorrect": False, "phonemes": phonemes_data})
    total_words = len(original_words)
    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
    return {
        "sentence": sentence,
        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
        "summary": {
            "overallScore": round(overall_score, 1), "totalWords": total_words, "correctWords": correct_words_count,
            "phonemeErrorRate": round(phoneme_error_rate, 2), "total_errors": total_errors, "total_target_phonemes": total_phonemes
        },
        "words": words_data
    }
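
For context, a minimal usage sketch of this module (illustrative only, not part of the commit): the package import path, audio file, sentence, and the example values in the output comment are hypothetical, and it assumes espeak-ng and the Hugging Face model download are available.

# Hypothetical caller for analyzer/ASR_nl_nl.py -- illustrative sketch, not part of this commit.
from analyzer import ASR_nl_nl

ASR_nl_nl.load_model()  # loads the wav2vec2 phoneme model and processor once
result = ASR_nl_nl.analyze(
    "recordings/sample_nl.wav",   # hypothetical mono recording; analyze() resamples to 16 kHz if needed
    "De kat zit op de mat",       # hypothetical target sentence
)

# The returned dict follows the structure built by _format_to_json_structure, e.g. (values illustrative):
# {
#   "sentence": "De kat zit op de mat",
#   "analysisTimestampUTC": "2024-01-01 12:00:00 (UTC)",
#   "summary": {"overallScore": 83.3, "totalWords": 6, "correctWords": 5,
#               "phonemeErrorRate": 7.14, "total_errors": 1, "total_target_phonemes": 14},
#   "words": [{"word": "De", "isCorrect": True,
#              "phonemes": [{"target": "d", "user": "d", "isMatch": True}, ...]}, ...]
# }
print(result["summary"]["overallScore"])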