HK0712 committed on
Commit
aa9eeec
·
1 Parent(s): c36c7d7
Files changed (1) hide show
  1. analyzer/ASR_en_us.py +56 -132
analyzer/ASR_en_us.py CHANGED
@@ -1,89 +1,50 @@
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 
5
  import os
6
- import json
7
- import epitran
8
  from phonemizer import phonemize
9
  import numpy as np
10
  from datetime import datetime, timezone
11
- import re
12
 
13
- # --- 1. 全域設定 (已修改) ---
14
- # 移除了全域的 processor 和 model 變數,只保留常數。
15
- MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
- print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
18
 
19
- # 在檔案頂端或接近全域設定區新增 lexicon Epitran 初始化
20
- LEXICON_PATH = os.path.join(os.path.dirname(__file__), "lexicon_en_us.json")
21
- try:
22
- if os.path.exists(LEXICON_PATH):
23
- with open(LEXICON_PATH, "r", encoding="utf-8") as f:
24
- LEXICON = json.load(f)
25
- else:
26
- LEXICON = {}
27
- except Exception:
28
- LEXICON = {}
29
 
30
def _save_lexicon():
    """Persist the in-memory LEXICON dict to LEXICON_PATH as UTF-8 JSON.

    Best-effort: any I/O or serialization error is swallowed, since the
    lexicon file is only an optional on-disk cache.
    """
    try:
        with open(LEXICON_PATH, "w", encoding="utf-8") as f:
            json.dump(LEXICON, f, ensure_ascii=False, indent=2)
    except Exception:
        pass
36
-
37
- # 初始化 Epitran(記憶體 lexicon,不寫 JSON)
38
- try:
39
- epi = epitran.Epitran("eng-Latn")
40
- print("INFO: Epitran initialized for English (eng-Latn)")
41
- except Exception as e:
42
- print(f"WARN: Epitran init failed for en_us: {e}")
43
- epi = None
44
-
45
def _get_word_ipa(word: str, cache: dict) -> str:
    """
    Return the IPA transcription for a single word.

    `cache` is an in-memory cache container (key: 'lexicon_en_us').
    Lookup order: cached lexicon first, then Epitran, then eSpeak
    (via phonemizer) as a backup. Nothing is written to disk; results
    are only memoized in `cache`. Guarantees one IPA string (which may
    contain multi-character tokens) per word.
    """
    if not word or not word.strip():
        return ""

    lex = cache.setdefault("lexicon_en_us", {})
    key = word.strip().lower()
    if key in lex:
        return lex[key]

    ipa = ""
    # 1) Prefer Epitran (per-word transliteration); `epi` may be None
    #    if module-level initialization failed.
    try:
        if epi:
            ipa = epi.transliterate(word).strip()
    except Exception:
        ipa = ""

    # 2) If Epitran is unavailable or returned an empty string, fall back
    #    to a single-word phonemizer/espeak call.
    if not ipa:
        try:
            ipa = phonemize(word, language='en-us', backend='espeak', with_stress=True, strip=True)
            ipa = ipa.strip()
        except Exception:
            ipa = ""

    # 3) Last-resort fallback: use the word's own characters
    #    (guarantees a non-None, non-empty return for non-empty input).
    if ipa is None or ipa == "":
        ipa = "".join(list(word))

    lex[key] = ipa
    return ipa
81
 
82
- # --- 2. 智能 IPA 切分函數 (保持不變) ---
 
83
  MULTI_CHAR_PHONEMES = {
84
- 'tʃ', 'dʒ', # 輔音 (Affricates)
85
- 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
86
- 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
87
  }
88
 
89
  def _tokenize_ipa(ipa_string: str) -> list:
@@ -102,23 +63,20 @@ def _tokenize_ipa(ipa_string: str) -> list:
102
  i += 1
103
  return phonemes
104
 
105
- # --- 3. 核心分析函數 (主入口) (已修改) ---
106
- # 刪除了舊的 load_model() 函數,並將其邏輯合併至此。
107
- def analyze(audio_file_path: str, target_sentence: str, cache: dict = None) -> dict:
108
  """
109
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
110
  模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
111
  """
112
- if cache is None:
113
- cache = {}
114
-
115
  # 檢查快取中是否已有模型,如果沒有則載入
116
  if "model" not in cache:
117
- print(f"快取未命中 (ASR_en_us)。正在載入模型 '{MODEL_NAME}'...")
118
  try:
 
119
  # 載入模型並存入此函數的快取字典
120
- cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
121
- cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
122
  cache["model"].to(DEVICE)
123
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
124
  except Exception as e:
@@ -130,15 +88,13 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = None) -> d
130
  model = cache["model"]
131
 
132
  # --- 以下為原始分析邏輯,保持不變 ---
133
- # 取每個詞的 IPA(逐字呼叫),保證 1 word = 1 IPA entry(no sentence-level phonemize)
134
- words = target_sentence.split()
135
- target_ipa_by_word = []
136
- for w in words:
137
- ipa_str = _get_word_ipa(w, cache)
138
- cleaned = ipa_str.replace('ˌ', '').replace('ˈ', '').replace('ː', '')
139
- target_ipa_by_word.append(_tokenize_ipa(cleaned))
140
-
141
- target_words_original = words
142
 
143
  try:
144
  speech, sample_rate = sf.read(audio_file_path)
@@ -152,26 +108,21 @@ def analyze(audio_file_path: str, target_sentence: str, cache: dict = None) -> d
152
  with torch.no_grad():
153
  logits = model(input_values).logits
154
  predicted_ids = torch.argmax(logits, dim=-1)
155
- # 1) 將 ASR 解碼成文字
156
- decoded_text = processor.decode(predicted_ids[0]).strip()
157
- # 2) 基本清理(去掉標點,但保留單字內的撇號與連字)
158
- decoded_text = re.sub(r"[^\w\s'-]", "", decoded_text)
159
- # 3) 逐字轉 IPA(使用記憶體 cache、epitran 優先、espeak 備援)
160
- asr_words = decoded_text.split()
161
- user_ipa_word_tokens = []
162
- for w in asr_words:
163
- ipa_str = _get_word_ipa(w, cache)
164
- cleaned = ipa_str.replace('ˌ', '').replace('ˈ', '').replace('ː', '')
165
- user_ipa_word_tokens.append(_tokenize_ipa(cleaned))
166
- # 4) 合併成供對齊使用的單一 IPA 字串(不含空格)
167
- user_ipa_full = "".join("".join(toks) for toks in user_ipa_word_tokens)
168
 
169
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
170
 
171
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
172
 
173
 
174
- # --- 4. 對齊函數 (保持不變) ---
 
175
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
176
  """
177
  (已修改) 使用新的切分邏輯執行音素對齊。
@@ -226,7 +177,8 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
226
 
227
  return alignments_by_word
228
 
229
- # --- 5. 格式化函數 (保持不變) ---
 
230
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
231
  total_phonemes = 0
232
  total_errors = 0
@@ -302,31 +254,3 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
302
  }
303
 
304
  return final_result
305
-
306
- # 將原本的 _get_target_phonemes_by_word (或相等功能) 改為使用 lexicon 優先 + epitran 備援
307
- def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
308
- """
309
- 針對 English (en_us) 的詞到音素處理:字典優先、Epitran 備援、快取至 lexicon_en_us.json。
310
- 回傳 (原始詞列表, 每個詞的音素列表)
311
- """
312
- if not text or not text.strip():
313
- return [], []
314
-
315
- # 簡單以空白分詞;若輸入無空白則逐字
316
- words = text.split() if ' ' in text.strip() else list(text.strip())
317
-
318
- target_words_original = []
319
- target_ipa_by_word = []
320
-
321
- for w in words:
322
- w_stripped = w.strip()
323
- if not w_stripped:
324
- continue
325
- try:
326
- phonemes = _get_phonemes_for_word_en(w_stripped)
327
- except Exception:
328
- phonemes = list(w_stripped)
329
- target_words_original.append(w_stripped)
330
- target_ipa_by_word.append(phonemes)
331
-
332
- return target_words_original, target_ipa_by_word
 
1
  import torch
2
  import soundfile as sf
3
  import librosa
4
+ # 【【【【【 修改 #1:從 transformers 匯入 AutoProcessor 和 AutoModelForCTC 】】】】】
5
+ from transformers import AutoProcessor, AutoModelForCTC
6
  import os
 
 
7
  from phonemizer import phonemize
8
  import numpy as np
9
  from datetime import datetime, timezone
 
10
 
11
+ # --- 全域設定 (已修改) ---
12
+ # 移除了全域的 processor 和 model 變數。
13
+ # 刪除了舊的 load_model() 函數。
14
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
+ print(f"INFO: ASR_en_us_v2.py is configured to use device: {DEVICE}")
16
 
17
+ # 【【【【【 修改 #2:更新為最終選定的 KoelLabs 模型名稱 】】】】】
18
+ MODEL_NAME = "KoelLabs/xlsr-english-01"
 
 
 
 
 
 
 
 
19
 
20
# --- IPA normalizer designed for the KoelLabs model's output ---
def normalize_koel_ipa(raw_phonemes: list) -> list:
    """
    Normalize the fine-grained IPA sequence emitted by the KoelLabs model
    into a base IPA sequence comparable with eSpeak output.

    Strips aspiration (ʰ), nasalization (combining tilde) and
    voicelessness (combining ring) diacritics, then remaps a few
    non-English consonants: 'β' -> 'v'; 'x', 'ɣ', 'ɦ' -> 'h'.
    Empty entries in the input are skipped.
    """
    remap = {'β': 'v', 'x': 'h', 'ɣ': 'h', 'ɦ': 'h'}

    def _strip_marks(symbol: str) -> str:
        # Drop the diacritic marks one by one.
        for mark in ('ʰ', '\u0303', '\u0325'):
            symbol = symbol.replace(mark, '')
        return symbol

    normalized = []
    for raw in raw_phonemes:
        if not raw:
            continue
        base = _strip_marks(raw)
        normalized.append(remap.get(base, base))

    return normalized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
# --- 2. Smart IPA tokenization (identical to the original version's logic) ---
# [Unchanged]
# Two-character IPA units that must be kept together when tokenizing:
# affricates, diphthongs, and r-colored / other vowel combinations.
MULTI_CHAR_PHONEMES = {
    'tʃ', 'dʒ',
    'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',
    'ɪə', 'eə', 'ʊə', 'ər'
}
49
 
50
  def _tokenize_ipa(ipa_string: str) -> list:
 
63
  i += 1
64
  return phonemes
65
 
66
+ # --- 3. 核心分析函數 (主入口) (已修改以整合正規化器和快取邏輯) ---
67
+ def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
 
68
  """
69
  接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
70
  模型會被載入並儲存在此函數獨立的 'cache' 中,實現狀態隔離。
71
  """
 
 
 
72
  # 檢查快取中是否已有模型,如果沒有則載入
73
  if "model" not in cache:
74
+ print(f"快取未命中 (ASR_en_us_v2)。正在載入模型 '{MODEL_NAME}'...")
75
  try:
76
+ # 【【【【【 修改 #3:使用 AutoProcessor 和 AutoModelForCTC 載入模型 】】】】】
77
  # 載入模型並存入此函數的快取字典
78
+ cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
79
+ cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
80
  cache["model"].to(DEVICE)
81
  print(f"模型 '{MODEL_NAME}' 已載入並快取。")
82
  except Exception as e:
 
88
  model = cache["model"]
89
 
90
  # --- 以下為原始分析邏輯,保持不變 ---
91
+ target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
92
+
93
+ target_ipa_by_word = [
94
+ _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
95
+ for word in target_ipa_by_word_str
96
+ ]
97
+ target_words_original = target_sentence.split()
 
 
98
 
99
  try:
100
  speech, sample_rate = sf.read(audio_file_path)
 
108
  with torch.no_grad():
109
  logits = model(input_values).logits
110
  predicted_ids = torch.argmax(logits, dim=-1)
111
+
112
+ # 【【【【【 修改 #4:在此處插入正規化步驟 】】】】】
113
+ # 【保持不變】
114
+ raw_user_ipa_str = processor.decode(predicted_ids[0])
115
+ raw_user_phonemes = raw_user_ipa_str.split(' ')
116
+ normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
117
+ user_ipa_full = "".join(normalized_user_phonemes)
 
 
 
 
 
 
118
 
119
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
120
 
121
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
122
 
123
 
124
+ # --- 4. 對齊函數 (與您的原版邏輯完全相同) ---
125
+ # 【保持不變】
126
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
127
  """
128
  (已修改) 使用新的切分邏輯執行音素對齊。
 
177
 
178
  return alignments_by_word
179
 
180
+ # --- 5. 格式化函數 (與您的原版邏輯完全相同) ---
181
+ # 【保持不變】
182
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
183
  total_phonemes = 0
184
  total_errors = 0
 
254
  }
255
 
256
  return final_result