HK0712 committed on
Commit
bb7c3cd
·
1 Parent(s): aa3ee73
Files changed (2) hide show
  1. analyzer/ASR_en_us.py +53 -18
  2. analyzer/ASR_en_us_v2.py +17 -52
analyzer/ASR_en_us.py CHANGED
@@ -1,23 +1,50 @@
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 
5
  import os
6
  from phonemizer import phonemize
7
  import numpy as np
8
  from datetime import datetime, timezone
9
 
10
- # --- 1. 全域設定 (已修改) ---
11
- # 移除了全域的 processor 和 model 變數,只保留常數。
12
- MODEL_NAME = "KoelLabs/xlsr-english-01"
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
- print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
 
 
 
15
 
16
- # --- 2. 智能 IPA 切分函數 (保持不變) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  MULTI_CHAR_PHONEMES = {
18
- 'tʃ', 'dʒ', # 輔音 (Affricates)
19
- 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
20
- 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
21
  }
22
 
23
  def _tokenize_ipa(ipa_string: str) -> list:
@@ -36,8 +63,7 @@ def _tokenize_ipa(ipa_string: str) -> list:
36
  i += 1
37
  return phonemes
38
 
39
- # --- 3. 核心分析函數 (主入口) (已修改) ---
40
- # 刪除了舊的 load_model() 函數,並將其邏輯合併至此。
41
  def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
42
  """
43
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
@@ -45,11 +71,12 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
45
  """
46
  # 檢查快取中是否已有模型,如果沒有則載入
47
  if "model" not in cache:
48
- print(f"快取未命中 (ASR_en_us)。正在載入模型 '{MODEL_NAME}'...")
49
  try:
 
50
  # 載入模型並存入此函數的快取字典
51
- cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
52
- cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
53
  cache["model"].to(DEVICE)
54
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
55
  except Exception as e:
@@ -81,14 +108,21 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
81
  with torch.no_grad():
82
  logits = model(input_values).logits
83
  predicted_ids = torch.argmax(logits, dim=-1)
84
- user_ipa_full = processor.decode(predicted_ids[0])
 
 
 
 
 
 
85
 
86
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
87
 
88
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
89
 
90
 
91
- # --- 4. 對齊函數 (保持不變) ---
 
92
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
93
  """
94
  (已修改) 使用新的切分邏輯執行音素對齊。
@@ -143,7 +177,8 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
143
 
144
  return alignments_by_word
145
 
146
- # --- 5. 格式化函數 (保持不變) ---
 
147
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
148
  total_phonemes = 0
149
  total_errors = 0
@@ -218,4 +253,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
218
  "words": words_data
219
  }
220
 
221
- return final_result
 
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
+ # 【【【【【 修改 #1:從 transformers 匯入 AutoProcessor 和 AutoModelForCTC 】】】】】
5
+ from transformers import AutoProcessor, AutoModelForCTC
6
  import os
7
  from phonemizer import phonemize
8
  import numpy as np
9
  from datetime import datetime, timezone
10
 
11
+ # --- 全域設定 (已修改) ---
12
+ # 移除了全域的 processor 和 model 變數。
13
+ # 刪除了舊的 load_model() 函數。
14
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
+ print(f"INFO: ASR_en_us_v2.py is configured to use device: {DEVICE}")
16
+
17
+ # 【【【【【 修改 #2:更新為最終選定的 KoelLabs 模型名稱 】】】】】
18
+ MODEL_NAME = "KoelLabs/xlsr-english-01"
19
 
20
+ # 【【【【【 新增程式碼 #1:為 KoelLabs 模型設計的 IPA 正規化器 】】】】】
21
+ # 【保持不變】
22
+ def normalize_koel_ipa(raw_phonemes: list) -> list:
23
+ """
24
+ 將 KoelLabs 模型輸出的高級 IPA 序列,正規化為與 eSpeak 輸出可比的基礎 IPA 序列。
25
+ """
26
+ normalized_phonemes = []
27
+ for phoneme in raw_phonemes:
28
+ if not phoneme:
29
+ continue
30
+
31
+ base_phoneme = phoneme.replace('ʰ', '').replace('̃', '').replace('̥', '')
32
+
33
+ if base_phoneme == 'β':
34
+ base_phoneme = 'v'
35
+ elif base_phoneme in ['x', 'ɣ', 'ɦ']:
36
+ base_phoneme = 'h'
37
+
38
+ normalized_phonemes.append(base_phoneme)
39
+
40
+ return normalized_phonemes
41
+
42
+ # --- 2. 智能 IPA 切分函數 (與您的原版邏輯完全相同) ---
43
+ # 【保持不變】
44
  MULTI_CHAR_PHONEMES = {
45
+ 'tʃ', 'dʒ',
46
+ 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',
47
+ 'ɪə', 'eə', 'ʊə', 'ər'
48
  }
49
 
50
  def _tokenize_ipa(ipa_string: str) -> list:
 
63
  i += 1
64
  return phonemes
65
 
66
+ # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
 
67
  def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
68
  """
69
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
 
71
  """
72
  # 檢查快取中是否已有模型,如果沒有則載入
73
  if "model" not in cache:
74
+ print(f"快取未命中 (ASR_en_us_v2)。正在載入模型 '{MODEL_NAME}'...")
75
  try:
76
+ # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
77
  # 載入模型並存入此函數的快取字典
78
+ cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
79
+ cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
80
  cache["model"].to(DEVICE)
81
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
82
  except Exception as e:
 
108
  with torch.no_grad():
109
  logits = model(input_values).logits
110
  predicted_ids = torch.argmax(logits, dim=-1)
111
+
112
+ # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
113
+ # 【保持不變】
114
+ raw_user_ipa_str = processor.decode(predicted_ids[0])
115
+ raw_user_phonemes = raw_user_ipa_str.split(' ')
116
+ normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
117
+ user_ipa_full = "".join(normalized_user_phonemes)
118
 
119
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
120
 
121
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
122
 
123
 
124
+ # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
125
+ # 【保持不變】
126
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
127
  """
128
  (已修改) 使用新的切分邏輯執行音素對齊。
 
177
 
178
  return alignments_by_word
179
 
180
+ # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
181
+ # 【保持不變】
182
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
183
  total_phonemes = 0
184
  total_errors = 0
 
253
  "words": words_data
254
  }
255
 
256
+ return final_result
analyzer/ASR_en_us_v2.py CHANGED
@@ -1,50 +1,23 @@
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
- # 【【【【【 修改 #1:從 transformers 匯入 AutoProcessor 和 AutoModelForCTC 】】】】】
5
- from transformers import AutoProcessor, AutoModelForCTC
6
  import os
7
  from phonemizer import phonemize
8
  import numpy as np
9
  from datetime import datetime, timezone
10
 
11
- # --- 全域設定 (已修改) ---
12
- # 移除了全域的 processor 和 model 變數。
13
- # 刪除了舊的 load_model() 函數。
14
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
- print(f"INFO: ASR_en_us_v2.py is configured to use device: {DEVICE}")
16
 
17
- # 【【【【【 修改 #2:更新為最終選定的 KoelLabs 模型名稱 】】】】】
18
- MODEL_NAME = "KoelLabs/xlsr-english-01"
19
-
20
- # 【【【【【 新增程式碼 #1:為 KoelLabs 模型設計的 IPA 正規化器 】】】】】
21
- # 【保持不變】
22
- def normalize_koel_ipa(raw_phonemes: list) -> list:
23
- """
24
- 將 KoelLabs 模型輸出的高級 IPA 序列,正規化為與 eSpeak 輸出可比的基礎 IPA 序列。
25
- """
26
- normalized_phonemes = []
27
- for phoneme in raw_phonemes:
28
- if not phoneme:
29
- continue
30
-
31
- base_phoneme = phoneme.replace('ʰ', '').replace('̃', '').replace('̥', '')
32
-
33
- if base_phoneme == 'β':
34
- base_phoneme = 'v'
35
- elif base_phoneme in ['x', 'ɣ', 'ɦ']:
36
- base_phoneme = 'h'
37
-
38
- normalized_phonemes.append(base_phoneme)
39
-
40
- return normalized_phonemes
41
-
42
- # --- 2. 智能 IPA 切分函數 (與您的原版邏輯完全相同) ---
43
- # 【保持不變】
44
  MULTI_CHAR_PHONEMES = {
45
- 'tʃ', 'dʒ',
46
- 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',
47
- 'ɪə', 'eə', 'ʊə', 'ər'
48
  }
49
 
50
  def _tokenize_ipa(ipa_string: str) -> list:
@@ -63,7 +36,8 @@ def _tokenize_ipa(ipa_string: str) -> list:
63
  i += 1
64
  return phonemes
65
 
66
- # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
 
67
  def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
68
  """
69
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
@@ -71,12 +45,11 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
71
  """
72
  # 檢查快取中是否已有模型,如果沒有則載入
73
  if "model" not in cache:
74
- print(f"快取未命中 (ASR_en_us_v2)。正在載入模型 '{MODEL_NAME}'...")
75
  try:
76
- # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
77
  # 載入模型並存入此函數的快取字典
78
- cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
79
- cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
80
  cache["model"].to(DEVICE)
81
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
82
  except Exception as e:
@@ -108,21 +81,14 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dic
108
  with torch.no_grad():
109
  logits = model(input_values).logits
110
  predicted_ids = torch.argmax(logits, dim=-1)
111
-
112
- # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
113
- # 【保持不變】
114
- raw_user_ipa_str = processor.decode(predicted_ids[0])
115
- raw_user_phonemes = raw_user_ipa_str.split(' ')
116
- normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
117
- user_ipa_full = "".join(normalized_user_phonemes)
118
 
119
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
120
 
121
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
122
 
123
 
124
- # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
125
- # 【保持不變】
126
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
127
  """
128
  (已修改) 使用新的切分邏輯執行音素對齊。
@@ -177,8 +143,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
177
 
178
  return alignments_by_word
179
 
180
- # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
181
- # 【保持不變】
182
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
183
  total_phonemes = 0
184
  total_errors = 0
 
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 
5
  import os
6
  from phonemizer import phonemize
7
  import numpy as np
8
  from datetime import datetime, timezone
9
 
10
+ # --- 1. 全域設定 (已修改) ---
11
+ # 移除了全域的 processor 和 model 變數,只保留常數。
12
+ MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
+ print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
15
 
16
+ # --- 2. 智能 IPA 切分函數 (保持不變) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  MULTI_CHAR_PHONEMES = {
18
+ 'tʃ', 'dʒ', # 輔音 (Affricates)
19
+ 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
20
+ 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
21
  }
22
 
23
  def _tokenize_ipa(ipa_string: str) -> list:
 
36
  i += 1
37
  return phonemes
38
 
39
+ # --- 3. 核心分析函數 (主入口) (已修改) ---
40
+ # 刪除了舊的 load_model() 函數,並將其邏輯合併至此。
41
  def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
42
  """
43
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
 
45
  """
46
  # 檢查快取中是否已有模型,如果沒有則載入
47
  if "model" not in cache:
48
+ print(f"快取未命中 (ASR_en_us)。正在載入模型 '{MODEL_NAME}'...")
49
  try:
 
50
  # 載入模型並存入此函數的快取字典
51
+ cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
52
+ cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
53
  cache["model"].to(DEVICE)
54
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
55
  except Exception as e:
 
81
  with torch.no_grad():
82
  logits = model(input_values).logits
83
  predicted_ids = torch.argmax(logits, dim=-1)
84
+ user_ipa_full = processor.decode(predicted_ids[0])
 
 
 
 
 
 
85
 
86
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
87
 
88
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
89
 
90
 
91
+ # --- 4. 對齊函數 (保持不變) ---
 
92
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
93
  """
94
  (已修改) 使用新的切分邏輯執行音素對齊。
 
143
 
144
  return alignments_by_word
145
 
146
+ # --- 5. 格式化函數 (保持不變) ---
 
147
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
148
  total_phonemes = 0
149
  total_errors = 0