HK0712 committed on
Commit
b0b3dfc
·
1 Parent(s): 13d62bd

added french version

Browse files
Files changed (2) hide show
  1. analyzer/ASR_fr_fr.py +251 -0
  2. requirements.txt +3 -1
analyzer/ASR_fr_fr.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import librosa
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ import os
6
+ from phonemizer import phonemize
7
+ import numpy as np
8
+ from datetime import datetime, timezone
9
+ import unicodedata
10
+ import re
11
+ import epitran
12
+
13
# --- 1. Global configuration and model-loading state (French model) ---

# Hugging Face hub identifier of the French phoneme-recognition checkpoint.
MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"

# Local directory the checkpoint is saved to and later reloaded from.
MODEL_SAVE_PATH = "./ASRs/Cnam-LMSSC-wav2vec2-french-phonemizer-local"

# Both stay None until load_model() has populated them.
processor = model = None
19
+
20
def load_model():
    """Load the French ASR model and its processor into the module globals.

    If ``MODEL_SAVE_PATH`` does not exist yet, the processor and model are
    first fetched with ``from_pretrained(MODEL_NAME)`` and written to that
    directory. The module-level ``processor`` and ``model`` are then loaded
    from the local directory. When both globals are already set the call
    returns immediately.

    Returns:
        True once ``processor`` and ``model`` are available.

    Raises:
        RuntimeError: if downloading, saving or loading fails. The
            underlying exception is chained as ``__cause__``.
    """
    global processor, model
    if processor is not None and model is not None:
        print("法語模型已載入,跳過。")
        return True

    print(f"正在準備法語 (fr-fr) ASR 模型 '{MODEL_NAME}'...")
    try:
        if not os.path.exists(MODEL_SAVE_PATH):
            print("本地找不到模型,正在從 Hugging Face 下載並儲存...")
            processor_to_save = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
            model_to_save = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
            processor_to_save.save_pretrained(MODEL_SAVE_PATH)
            model_to_save.save_pretrained(MODEL_SAVE_PATH)
            print("模型已成功下載並儲存。")
        else:
            print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")

        # Load into locals first so a failure on the second load cannot
        # leave the globals half-initialised (processor set, model unset).
        loaded_processor = Wav2Vec2Processor.from_pretrained(MODEL_SAVE_PATH)
        loaded_model = Wav2Vec2ForCTC.from_pretrained(MODEL_SAVE_PATH)
        processor, model = loaded_processor, loaded_model
        print("法語 (fr-fr) 模型和處理器載入成功!")
        return True
    except Exception as e:
        print(f"處理或載入 fr-fr 模型時發生錯誤: {e}")
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to load fr-fr model: {e}") from e
49
+
50
def _tokenize_unicode_ipa(ipa_string: str) -> list:
    """Split an IPA string into phoneme tokens, keeping combining marks attached.

    ASCII spaces are ignored. Every other character starts a new token,
    except a non-spacing mark (Unicode category ``Mn``), which is appended
    to the token before it. A mark with no preceding token starts a token
    of its own.

    Returns:
        The list of token strings, in input order.
    """
    tokens = []
    for ch in ipa_string:
        if ch == ' ':
            continue
        is_combining = unicodedata.category(ch) == 'Mn'
        if is_combining and tokens:
            tokens[-1] += ch
        else:
            tokens.append(ch)
    return tokens
69
+
70
+ # --- 2. 核心分析函數 (主入口) (已修改為法語邏輯) ---
71
def analyze(audio_file_path: str, target_sentence: str) -> dict:
    """Score the pronunciation of ``target_sentence`` in an audio recording.

    This is the module's main entry point. The target sentence is converted
    to IPA per word with Epitran (``fra-Latn``), the audio is transcribed to
    phonemes by the loaded model, and the two phoneme sequences are aligned
    and formatted by ``_get_phoneme_alignments_by_word`` and
    ``_format_to_json_structure``.

    Args:
        audio_file_path: Path to an audio file readable by ``soundfile``.
        target_sentence: The sentence the speaker was asked to read.

    Returns:
        The analysis dictionary built by ``_format_to_json_structure``.

    Raises:
        RuntimeError: if ``load_model()`` has not populated the globals.
        IOError: if the audio cannot be read or resampled; the underlying
            exception is chained as ``__cause__``.
    """
    if processor is None or model is None:
        raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")

    # Extract words with a regex so punctuation is dropped; apostrophes and
    # hyphens are kept as part of a word.
    target_words_original = re.findall(r"[\w'-]+", target_sentence)
    cleaned_sentence = " ".join(target_words_original)

    # Target phonemes come from Epitran's French grapheme-to-phoneme mapping.
    epi_fr = epitran.Epitran('fra-Latn')
    target_ipa_full = epi_fr.transliterate(cleaned_sentence)
    target_ipa_by_word_str = target_ipa_full.split()

    # Keep the word list no longer than the IPA list (a no-op when it is
    # already the same length or shorter).
    target_words_original = target_words_original[:len(target_ipa_by_word_str)]

    # Drop stress marks, the liaison tie and apostrophes before tokenising.
    strip_table = str.maketrans('', '', "ˈˌ‿'")
    target_ipa_by_word = [
        _tokenize_unicode_ipa(word.translate(strip_table))
        for word in target_ipa_by_word_str
    ]

    try:
        speech, sample_rate = sf.read(audio_file_path)
        # soundfile returns multi-channel audio as (frames, channels).
        # Down-mix to mono first: resampling works on the last axis, which
        # for such an array would be the channel axis, not time.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)
        if sample_rate != 16000:
            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
    except Exception as e:
        raise IOError(f"讀取或處理音訊時發生錯誤: {e}") from e

    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    user_ipa_full = processor.decode(predicted_ids[0]).replace(' ', '')

    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)

    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
117
+
118
+
119
+ # --- 3. 對齊函數 (已簡化切分邏輯) ---
120
def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
    """Align recognised phonemes with the target phonemes and regroup by word.

    The user string is tokenised with ``_tokenize_unicode_ipa`` and aligned
    against the flattened target tokens with an edit-distance table (unit
    cost for substitution, insertion and deletion). The aligned path is then
    cut into one segment per target word.

    Args:
        user_phoneme_str: Recognised phonemes as a single string.
        target_words_ipa_tokenized: One list of phoneme tokens per target word.

    Returns:
        A list with exactly one ``{"target": [...], "user": [...]}`` entry
        per target word, in order. Within an entry both lists have the same
        length and ``'-'`` marks a gap. User phonemes with no target
        counterpart are attached to the word that follows them; those after
        the final target phoneme are attached to the last word. A word with
        no tokens yields two empty lists so indices stay in step with the
        word list.
    """
    gap = '-'
    user_phonemes = _tokenize_unicode_ipa(user_phoneme_str)
    target_phonemes_flat = [
        token for word_tokens in target_words_ipa_tokenized for token in word_tokens
    ]
    n_user, n_target = len(user_phonemes), len(target_phonemes_flat)

    # Edit-distance table: dp[i][j] is the cost of aligning the first i user
    # phonemes with the first j target phonemes.
    dp = np.zeros((n_user + 1, n_target + 1))
    dp[:, 0] = np.arange(n_user + 1)
    dp[0, :] = np.arange(n_target + 1)
    for i in range(1, n_user + 1):
        for j in range(1, n_target + 1):
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)

    # Trace back from the bottom-right corner, preferring a diagonal step,
    # then an extra user phoneme, then a missing one. Pairs are collected in
    # reverse and flipped once; None marks a gap so that a literal '-' token
    # is never mistaken for one while regrouping.
    path = []
    i, j = n_user, n_target
    while i > 0 or j > 0:
        if i > 0 and j > 0:
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            if dp[i][j] == dp[i - 1][j - 1] + cost:
                path.append((user_phonemes[i - 1], target_phonemes_flat[j - 1]))
                i -= 1
                j -= 1
                continue
        if i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            path.append((user_phonemes[i - 1], None))
            i -= 1
        else:
            path.append((None, target_phonemes_flat[j - 1]))
            j -= 1
    path.reverse()

    # Cut the path into per-word segments by counting target phonemes. The
    # path holds every target phoneme exactly once, so the inner loop cannot
    # run past its end.
    segments = []
    pos = 0
    for word_tokens in target_words_ipa_tokenized:
        start = pos
        remaining = len(word_tokens)
        while remaining > 0:
            if path[pos][1] is not None:
                remaining -= 1
            pos += 1
        segments.append(path[start:pos])

    # Extra user phonemes after the last target phoneme would otherwise be
    # dropped; keep them with the final word.
    if segments and pos < len(path):
        segments[-1].extend(path[pos:])

    return [
        {
            "target": [gap if target is None else target for _, target in segment],
            "user": [gap if user is None else user for user, _ in segment],
        }
        for segment in segments
    ]
174
+
175
+ # --- 4. 格式化函數 (語言無關,保持不變) ---
176
def _format_to_json_structure(alignments, sentence, original_words) -> dict:
    """Build the result dictionary from per-word phoneme alignments.

    Args:
        alignments: One ``{"target": [...], "user": [...]}`` dict per aligned
            word, with ``'-'`` marking a gap.
        sentence: The target sentence, copied into the result unchanged.
        original_words: The words of the target sentence, in the same order
            as ``alignments``.

    Returns:
        A dict with ``sentence``, ``analysisTimestampUTC``, ``summary`` and
        ``words``. A word is correct when every aligned pair matches. Each
        mismatching pair counts as one error, and only non-gap target
        phonemes count towards the phoneme total. Words beyond the end of
        ``alignments`` are reported as entirely missed.
    """
    total_phonemes = 0
    total_errors = 0
    correct_words_count = 0
    words_data = []

    # zip stops at the shorter input, so surplus alignments are ignored and
    # surplus words are handled by the missed-word pass below.
    for word, alignment in zip(original_words, alignments):
        phonemes_data = [
            {"target": target_phoneme, "user": user_phoneme,
             "isMatch": user_phoneme == target_phoneme}
            for target_phoneme, user_phoneme in zip(alignment['target'], alignment['user'])
        ]
        word_errors = sum(1 for entry in phonemes_data if not entry["isMatch"])
        word_is_correct = word_errors == 0

        total_errors += word_errors
        if word_is_correct:
            correct_words_count += 1
        total_phonemes += sum(1 for p in alignment['target'] if p != '-')

        words_data.append({
            "word": word,
            "isCorrect": word_is_correct,
            "phonemes": phonemes_data
        })

    total_words = len(original_words)
    if len(alignments) < total_words:
        # Words that received no alignment: every expected phoneme is an
        # error. Use the same Epitran mapping and symbol stripping that
        # analyze() uses for targets, so the phoneme inventory is consistent.
        epi_fr = epitran.Epitran('fra-Latn')
        strip_table = str.maketrans('', '', "ˈˌ‿'")
        for word in original_words[len(alignments):]:
            missed_word_ipa_str = epi_fr.transliterate(word).translate(strip_table)
            missed_word_ipa = _tokenize_unicode_ipa(missed_word_ipa_str)
            phonemes_data = [
                {"target": p_ipa, "user": "-", "isMatch": False}
                for p_ipa in missed_word_ipa
            ]
            total_errors += len(phonemes_data)
            total_phonemes += len(phonemes_data)

            words_data.append({
                "word": word,
                "isCorrect": False,
                "phonemes": phonemes_data
            })

    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0

    return {
        "sentence": sentence,
        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
        "summary": {
            "overallScore": round(overall_score, 1),
            "totalWords": total_words,
            "correctWords": correct_words_count,
            "phonemeErrorRate": round(phoneme_error_rate, 2),
            "total_errors": total_errors,
            "total_target_phonemes": total_phonemes
        },
        "words": words_data
    }
requirements.txt CHANGED
@@ -7,4 +7,6 @@ soundfile
7
  librosa
8
  transformers
9
  phonemizer[espeak]
10
- numpy
 
 
 
7
  librosa
8
  transformers
9
  phonemizer[espeak]
10
+ numpy
11
+ epitran
12
+ g2p