HK0712 committed on
Commit
aa9eeec
·
1 Parent(s): c36c7d7
Files changed (1) hide show
  1. analyzer/ASR_en_us.py +56 -132
analyzer/ASR_en_us.py CHANGED
@@ -1,89 +1,50 @@
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 
5
  import os
6
- import json
7
- import epitran
8
  from phonemizer import phonemize
9
  import numpy as np
10
  from datetime import datetime, timezone
11
- import re
12
 
13
- # --- 1. 全域設定 (已修改) ---
14
- # 移除了全域的 processor 和 model 變數,只保留常數。
15
- MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
- print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
18
 
19
- # 在檔案頂端或接近全域設定區新增 lexicon Epitran 初始化
20
- LEXICON_PATH = os.path.join(os.path.dirname(__file__), "lexicon_en_us.json")
21
- try:
22
- if os.path.exists(LEXICON_PATH):
23
- with open(LEXICON_PATH, "r", encoding="utf-8") as f:
24
- LEXICON = json.load(f)
25
- else:
26
- LEXICON = {}
27
- except Exception:
28
- LEXICON = {}
29
 
30
def _save_lexicon():
    """Persist the in-memory LEXICON dict to LEXICON_PATH as UTF-8 JSON.

    Best-effort: any I/O or serialization error is swallowed, since the
    lexicon file is only an optional on-disk cache.
    """
    try:
        with open(LEXICON_PATH, "w", encoding="utf-8") as f:
            json.dump(LEXICON, f, ensure_ascii=False, indent=2)
    except Exception:
        pass
36
-
37
- # 初始化 Epitran(記憶體 lexicon,不寫 JSON)
38
- try:
39
- epi = epitran.Epitran("eng-Latn")
40
- print("INFO: Epitran initialized for English (eng-Latn)")
41
- except Exception as e:
42
- print(f"WARN: Epitran init failed for en_us: {e}")
43
- epi = None
44
-
45
def _get_word_ipa(word: str, cache: dict) -> str:
    """
    Return the IPA transcription for a single word.

    `cache` is an in-memory cache container (key: 'lexicon_en_us').
    Lookup order: cached lexicon first, then Epitran, then eSpeak
    (via phonemizer) as a backup. Nothing is written to disk; results
    are only memoized in `cache`. Guarantees one IPA string (which may
    contain multi-character tokens) per word.
    """
    if not word or not word.strip():
        return ""

    lex = cache.setdefault("lexicon_en_us", {})
    key = word.strip().lower()
    if key in lex:
        return lex[key]

    ipa = ""
    # 1) Prefer Epitran (per-word transliteration); `epi` may be None
    #    if module-level initialization failed.
    try:
        if epi:
            ipa = epi.transliterate(word).strip()
    except Exception:
        ipa = ""

    # 2) If Epitran is unavailable or returned an empty string, fall back
    #    to a single-word phonemizer/espeak call.
    if not ipa:
        try:
            ipa = phonemize(word, language='en-us', backend='espeak', with_stress=True, strip=True)
            ipa = ipa.strip()
        except Exception:
            ipa = ""

    # 3) Last-resort fallback: use the word's own characters
    #    (guarantees a non-None, non-empty return for non-empty input).
    if ipa is None or ipa == "":
        ipa = "".join(list(word))

    lex[key] = ipa
    return ipa
81
 
82
- # --- 2. 智能 IPA 切分函數 (保持不變) ---
 
83
  MULTI_CHAR_PHONEMES = {
84
- 'tʃ', 'dʒ', # 輔音 (Affricates)
85
- 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
86
- 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
87
  }
88
 
89
  def _tokenize_ipa(ipa_string: str) -> list:
@@ -102,23 +63,20 @@ def _tokenize_ipa(ipa_string: str) -> list:
102
  i += 1
103
  return phonemes
104
 
105
- # --- 3. 核心分析函數 (主入口) (已修改) ---
106
- # 刪除了舊的 load_model() 函數,並將其邏輯合併至此。
107
- def analyze(audio_file_path: str, target_sentence: str, cache: dict = None) -> dict:
108
  """
109
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
110
  模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
111
  """
112
- if cache is None:
113
- cache = {}
114
-
115
  # 檢查快取中是否已有模型,如果沒有則載入
116
  if "model" not in cache:
117
- print(f"快取未命中 (ASR_en_us)。正在載入模型 '{MODEL_NAME}'...")
118
  try:
 
119
  # 載入模型並存入此函數的快取字典
120
- cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
121
- cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
122
  cache["model"].to(DEVICE)
123
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
124
  except Exception as e:
@@ -130,15 +88,13 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = None) -> d
130
  model = cache["model"]
131
 
132
  # --- 以下為原始分析邏輯,保持不變 ---
133
- # 取每個詞的 IPA(逐字呼叫),保證 1 word = 1 IPA entry(no sentence-level phonemize)
134
- words = target_sentence.split()
135
- target_ipa_by_word = []
136
- for w in words:
137
- ipa_str = _get_word_ipa(w, cache)
138
- cleaned = ipa_str.replace('ˌ', '').replace('ˈ', '').replace('ː', '')
139
- target_ipa_by_word.append(_tokenize_ipa(cleaned))
140
-
141
- target_words_original = words
142
 
143
  try:
144
  speech, sample_rate = sf.read(audio_file_path)
@@ -152,26 +108,21 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = None) -> d
152
  with torch.no_grad():
153
  logits = model(input_values).logits
154
  predicted_ids = torch.argmax(logits, dim=-1)
155
- # 1) 將 ASR 解碼成文字
156
- decoded_text = processor.decode(predicted_ids[0]).strip()
157
- # 2) 基本清理(去掉標點,但保留單字內的撇號與連字)
158
- decoded_text = re.sub(r"[^\w\s'-]", "", decoded_text)
159
- # 3) 逐字轉 IPA(使用記憶體 cache、epitran 優先、espeak 備援)
160
- asr_words = decoded_text.split()
161
- user_ipa_word_tokens = []
162
- for w in asr_words:
163
- ipa_str = _get_word_ipa(w, cache)
164
- cleaned = ipa_str.replace('ˌ', '').replace('ˈ', '').replace('ː', '')
165
- user_ipa_word_tokens.append(_tokenize_ipa(cleaned))
166
- # 4) 合併成供對齊使用的單一 IPA 字串(不含空格)
167
- user_ipa_full = "".join("".join(toks) for toks in user_ipa_word_tokens)
168
 
169
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
170
 
171
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
172
 
173
 
174
- # --- 4. 對齊函數 (保持不變) ---
 
175
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
176
  """
177
  (已修改) 使用新的切分邏輯執行音素對齊。
@@ -226,7 +177,8 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
226
 
227
  return alignments_by_word
228
 
229
- # --- 5. 格式化函數 (保持不變) ---
 
230
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
231
  total_phonemes = 0
232
  total_errors = 0
@@ -302,31 +254,3 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
302
  }
303
 
304
  return final_result
305
-
306
- # 將原本的 _get_target_phonemes_by_word (或相等功能) 改為使用 lexicon 優先 + epitran 備援
307
- def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
308
- """
309
- 針對 English (en_us) 的詞到音素處理:字典優先、Epitran 備援、快取至 lexicon_en_us.json。
310
- 回傳 (原始詞列表, 每個詞的音素列表)
311
- """
312
- if not text or not text.strip():
313
- return [], []
314
-
315
- # 簡單以空白分詞;若輸入無空白則逐字
316
- words = text.split() if ' ' in text.strip() else list(text.strip())
317
-
318
- target_words_original = []
319
- target_ipa_by_word = []
320
-
321
- for w in words:
322
- w_stripped = w.strip()
323
- if not w_stripped:
324
- continue
325
- try:
326
- phonemes = _get_phonemes_for_word_en(w_stripped)
327
- except Exception:
328
- phonemes = list(w_stripped)
329
- target_words_original.append(w_stripped)
330
- target_ipa_by_word.append(phonemes)
331
-
332
- return target_words_original, target_ipa_by_word
 
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
+ # 【【【【【 修改 #1:從 transformers 匯入 AutoProcessor 和 AutoModelForCTC 】】】】】
5
+ from transformers import AutoProcessor, AutoModelForCTC
6
  import os
 
 
7
  from phonemizer import phonemize
8
  import numpy as np
9
  from datetime import datetime, timezone
 
10
 
11
+ # --- 全域設定 (已修改) ---
12
+ # 移除了全域的 processor 和 model 變數。
13
+ # 刪除了舊的 load_model() 函數。
14
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
+ print(f"INFO: ASR_en_us_v2.py is configured to use device: {DEVICE}")
16
 
17
+ # 【【【【【 修改 #2:更新為最終選定的 KoelLabs 模型名稱 】】】】】
18
+ MODEL_NAME = "KoelLabs/xlsr-english-01"
 
 
 
 
 
 
 
 
19
 
20
# --- IPA normalizer designed for the KoelLabs model's output ---
def normalize_koel_ipa(raw_phonemes: list) -> list:
    """
    Normalize the fine-grained IPA sequence emitted by the KoelLabs model
    into a base IPA sequence comparable with eSpeak output.

    Strips aspiration (ʰ), nasalization (combining tilde) and
    voicelessness (combining ring) diacritics, then remaps a few
    non-English consonants: 'β' -> 'v'; 'x', 'ɣ', 'ɦ' -> 'h'.
    Empty entries in the input are skipped.
    """
    remap = {'β': 'v', 'x': 'h', 'ɣ': 'h', 'ɦ': 'h'}

    def _strip_marks(symbol: str) -> str:
        # Drop the diacritic marks one by one.
        for mark in ('ʰ', '\u0303', '\u0325'):
            symbol = symbol.replace(mark, '')
        return symbol

    normalized = []
    for raw in raw_phonemes:
        if not raw:
            continue
        base = _strip_marks(raw)
        normalized.append(remap.get(base, base))

    return normalized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
# --- 2. Smart IPA tokenization (identical to the original version's logic) ---
# [Unchanged]
# Two-character IPA units that must be kept together when tokenizing:
# affricates, diphthongs, and r-colored / other vowel combinations.
MULTI_CHAR_PHONEMES = {
    'tʃ', 'dʒ',
    'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',
    'ɪə', 'eə', 'ʊə', 'ər'
}
49
 
50
  def _tokenize_ipa(ipa_string: str) -> list:
 
63
  i += 1
64
  return phonemes
65
 
66
+ # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
67
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
 
68
  """
69
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
70
  模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
71
  """
 
 
 
72
  # 檢查快取中是否已有模型,如果沒有則載入
73
  if "model" not in cache:
74
+ print(f"快取未命中 (ASR_en_us_v2)。正在載入模型 '{MODEL_NAME}'...")
75
  try:
76
+ # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
77
  # 載入模型並存入此函數的快取字典
78
+ cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
79
+ cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
80
  cache["model"].to(DEVICE)
81
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
82
  except Exception as e:
 
88
  model = cache["model"]
89
 
90
  # --- 以下為原始分析邏輯,保持不變 ---
91
+ target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
92
+
93
+ target_ipa_by_word = [
94
+ _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
95
+ for word in target_ipa_by_word_str
96
+ ]
97
+ target_words_original = target_sentence.split()
 
 
98
 
99
  try:
100
  speech, sample_rate = sf.read(audio_file_path)
 
108
  with torch.no_grad():
109
  logits = model(input_values).logits
110
  predicted_ids = torch.argmax(logits, dim=-1)
111
+
112
+ # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
113
+ # 【保持不變】
114
+ raw_user_ipa_str = processor.decode(predicted_ids[0])
115
+ raw_user_phonemes = raw_user_ipa_str.split(' ')
116
+ normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
117
+ user_ipa_full = "".join(normalized_user_phonemes)
 
 
 
 
 
 
118
 
119
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
120
 
121
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
122
 
123
 
124
+ # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
125
+ # 【保持不變】
126
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
127
  """
128
  (已修改) 使用新的切分邏輯執行音素對齊。
 
177
 
178
  return alignments_by_word
179
 
180
+ # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
181
+ # 【保持不變】
182
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
183
  total_phonemes = 0
184
  total_errors = 0
 
254
  }
255
 
256
  return final_result