CHANGE: keep load in RAM
- analyzer/ASR_de_de.py +221 -239
- analyzer/ASR_en_us.py +27 -46
- analyzer/ASR_en_us_v2.py +256 -277
- analyzer/ASR_en_us_v3.py +0 -320
- analyzer/ASR_fr_fr.py +25 -49
- analyzer/ASR_jp_jp.py +27 -57
- analyzer/ASR_nl_nl.py +25 -40
- analyzer/ASR_pt_br.py +27 -34
- main.py +0 -2
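
The pattern behind "keep load in RAM", repeated in every analyzer module below: the module-level processor/model globals and the load_model() function are dropped, and each analyze() instead keeps its loaded model in a mutable default-argument dict that survives across calls. A minimal standalone sketch of that pattern (load_heavy_model is a stub standing in for the real from_pretrained calls):

def load_heavy_model():
    # Stand-in for the real Wav2Vec2Processor/Wav2Vec2ForCTC.from_pretrained calls
    print("loading model (expensive)...")
    return {"name": "stub-model"}

def analyze(audio_path: str, cache: dict = {}) -> str:
    # The `cache={}` default is created once, at function definition time, so the
    # same dict is reused by every call: the model stays in RAM after first load.
    if "model" not in cache:
        print("cache miss")
        cache["model"] = load_heavy_model()
    return f"analyzed {audio_path} with {cache['model']['name']}"

analyze("a.wav")  # prints "cache miss" and "loading model (expensive)..."
analyze("b.wav")  # prints nothing: the cached model is reused

Each module gets its own default dict, so the caches are isolated per language; this is the "state isolation" the docstrings below refer to.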
analyzer/ASR_de_de.py
CHANGED
@@ -1,239 +1,221 @@
- … (239 lines removed; the file is rewritten in full below)
+import torch
+import soundfile as sf
+import librosa
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import os
+from phonemizer import phonemize
+import numpy as np
+from datetime import datetime, timezone
+
+# --- 1. Global settings and model loading (modified) ---
+# Removed the global processor and model variables.
+# Deleted the old load_model() function.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"INFO: ASR_de_de.py is configured to use device: {DEVICE}")
+MODEL_NAME = "HK0712/Wav2Vec2_German_IPA"
+
+# --- 2. Smart IPA tokenization function (unchanged) ---
+MULTI_CHAR_PHONEMES = {
+    'aɪ', 'aʊ',
+    'dʒ', 'pf', 'ts', 'tʃ'
+}
+
+def _tokenize_ipa(ipa_string: str) -> list:
+    """
+    Smartly splits an IPA string into a list of phonemes, handling multi-character phonemes correctly.
+    """
+    phonemes = []
+    i = 0
+    s = ipa_string.replace(' ', '')
+    while i < len(s):
+        if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
+            phonemes.append(s[i:i+2])
+            i += 2
+        else:
+            phonemes.append(s[i])
+            i += 1
+    return phonemes
+
+# --- 3. Core analysis function (main entry point) (modified) ---
+# Model loading and caching logic are merged in here.
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
+    """
+    Takes an audio file path and a target sentence and returns a detailed pronunciation-analysis dict.
+    The model is loaded and kept in this function's own 'cache', isolating its state.
+    """
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_de_de). Loading model '{MODEL_NAME}'...")
+        try:
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while processing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's private cache
+    processor = cache["processor"]
+    model = cache["model"]
+
+    # --- The original analysis logic below is unchanged ---
+    target_ipa_by_word_str = phonemize(target_sentence, language='de', backend='espeak', with_stress=True, strip=True).split()
+
+    target_ipa_by_word = [
+        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
+        for word in target_ipa_by_word_str
+    ]
+    target_words_original = target_sentence.split()
+
+    try:
+        speech, sample_rate = sf.read(audio_file_path)
+        if sample_rate != 16000:
+            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
+    except Exception as e:
+        raise IOError(f"Error while reading or processing the audio: {e}")
+
+    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
+    input_values = input_values.to(DEVICE)
+    with torch.no_grad():
+        logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    user_ipa_full = processor.decode(predicted_ids[0])
+
+    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
+
+    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
+
+
+# --- 4. Alignment function (unchanged) ---
+def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
+    """
+    (Modified) Performs phoneme alignment using the new tokenization logic.
+    """
+    user_phonemes = _tokenize_ipa(user_phoneme_str)
+
+    target_phonemes_flat = []
+    word_boundaries_indices = []
+    current_idx = 0
+    for word_ipa_tokens in target_words_ipa_tokenized:
+        target_phonemes_flat.extend(word_ipa_tokens)
+        current_idx += len(word_ipa_tokens)
+        word_boundaries_indices.append(current_idx - 1)
+
+    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
+    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
+    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
+    for i in range(1, len(user_phonemes) + 1):
+        for j in range(1, len(target_phonemes_flat) + 1):
+            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
+            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
+
+    i, j = len(user_phonemes), len(target_phonemes_flat)
+    user_path, target_path = [], []
+    while i > 0 or j > 0:
+        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
+        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
+            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
+        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
+            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
+        else:
+            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
+
+    alignments_by_word = []
+    word_start_idx_in_path = 0
+    target_phoneme_counter_in_path = 0
+
+    for path_idx, p in enumerate(target_path):
+        if p != '-':
+            if target_phoneme_counter_in_path in word_boundaries_indices:
+                target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
+                user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
+
+                alignments_by_word.append({
+                    "target": target_alignment,
+                    "user": user_alignment
+                })
+
+                word_start_idx_in_path = path_idx + 1
+
+            target_phoneme_counter_in_path += 1
+
+    return alignments_by_word
+
+# --- 5. Formatting function (unchanged) ---
+def _format_to_json_structure(alignments, sentence, original_words) -> dict:
+    total_phonemes = 0
+    total_errors = 0
+    correct_words_count = 0
+    words_data = []
+
+    num_words_to_process = min(len(alignments), len(original_words))
+
+    for i in range(num_words_to_process):
+        alignment = alignments[i]
+        word_is_correct = True
+        phonemes_data = []
+
+        for j in range(len(alignment['target'])):
+            target_phoneme = alignment['target'][j]
+            user_phoneme = alignment['user'][j]
+            is_match = (user_phoneme == target_phoneme)
+
+            phonemes_data.append({
+                "target": target_phoneme,
+                "user": user_phoneme,
+                "isMatch": is_match
+            })
+
+            if not is_match:
+                word_is_correct = False
+                if not (user_phoneme == '-' and target_phoneme == '-'):
+                    total_errors += 1
+
+        if word_is_correct:
+            correct_words_count += 1
+
+        words_data.append({
+            "word": original_words[i],
+            "isCorrect": word_is_correct,
+            "phonemes": phonemes_data
+        })
+
+        total_phonemes += sum(1 for p in alignment['target'] if p != '-')
+
+    total_words = len(original_words)
+    if len(alignments) < total_words:
+        for i in range(len(alignments), total_words):
+            missed_word_ipa_str = phonemize(original_words[i], language='de', backend='espeak', strip=True).replace('ː', '')
+            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
+            phonemes_data = []
+            for p_ipa in missed_word_ipa:
+                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
+                total_errors += 1
+                total_phonemes += 1
+
+            words_data.append({
+                "word": original_words[i],
+                "isCorrect": False,
+                "phonemes": phonemes_data
+            })
+
+    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
+    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
+
+    final_result = {
+        "sentence": sentence,
+        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
+        "summary": {
+            "overallScore": round(overall_score, 1),
+            "totalWords": total_words,
+            "correctWords": correct_words_count,
+            "phonemeErrorRate": round(phoneme_error_rate, 2),
+            "total_errors": total_errors,
+            "total_target_phonemes": total_phonemes
+        },
+        "words": words_data
+    }
+
+    return final_result
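The _get_phoneme_alignments_by_word function above is a standard Levenshtein alignment with backtracking, where '-' marks an insertion or deletion gap. A standalone run on made-up phoneme lists (the phonemes are purely illustrative, not real model output):

import numpy as np

user   = ['g', 'u', 't']          # e.g. a clipped utterance
target = ['g', 'uː', 't', 'n']    # hypothetical target phonemes

dp = np.zeros((len(user) + 1, len(target) + 1))
for i in range(1, len(user) + 1): dp[i][0] = i
for j in range(1, len(target) + 1): dp[0][j] = j
for i in range(1, len(user) + 1):
    for j in range(1, len(target) + 1):
        cost = 0 if user[i-1] == target[j-1] else 1
        dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)

i, j = len(user), len(target)
user_path, target_path = [], []
while i > 0 or j > 0:
    cost = float('inf') if i == 0 or j == 0 else (0 if user[i-1] == target[j-1] else 1)
    if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
        user_path.insert(0, user[i-1]); target_path.insert(0, target[j-1]); i -= 1; j -= 1
    elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
        user_path.insert(0, user[i-1]); target_path.insert(0, '-'); i -= 1
    else:
        user_path.insert(0, '-'); target_path.insert(0, target[j-1]); j -= 1

print(user_path)    # ['g', 'u', 't', '-']  ('-' = the user never produced 'n')
print(target_path)  # ['g', 'uː', 't', 'n']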
analyzer/ASR_en_us.py
CHANGED
@@ -1,5 +1,3 @@
-# ASR_en_us.py
-
 import torch
 import soundfile as sf
 import librosa
@@ -9,43 +7,13 @@ from phonemizer import phonemize
 import numpy as np
 from datetime import datetime, timezone
 
-# …
+# --- 1. Global settings (modified) ---
+# Removed the global processor and model variables; only the constant remains.
+MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"INFO: ASR_en_us.py is configured to use device: {DEVICE}")
 
-# ---
-MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
-
-processor = None
-model = None
-
-def load_model():
-    """
-    (Option A) Let transformers handle the model download, caching, and loading automatically.
-    It automatically uses the HF_HOME environment variable set in the Dockerfile.
-    """
-    global processor, model
-    if processor and model:
-        print(f"Model '{MODEL_NAME}' already loaded; skipping.")
-        return True
-
-    print(f"Preparing ASR model '{MODEL_NAME}'...")
-    print(f"Transformers will automatically look it up in, or download it to, the cache given by HF_HOME.")
-    try:
-        # Call from_pretrained directly with the model's hub name
-        # This is where the magic happens!
-        processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-        model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-
-        model.to(DEVICE)
-        print(f"Model '{MODEL_NAME}' and its processor loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error while processing or loading model '{MODEL_NAME}': {e}")
-        raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
-
-# --- 2. Smart IPA tokenization function (updated) ---
-# Removed the combinations containing 'ː', since we now strip it at the source
+# --- 2. Smart IPA tokenization function (unchanged) ---
 MULTI_CHAR_PHONEMES = {
     'tʃ', 'dʒ',  # affricates
     'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',  # diphthongs
@@ -69,18 +37,32 @@ def _tokenize_ipa(ipa_string: str) -> list:
     return phonemes
 
 # --- 3. Core analysis function (main entry point) (modified) ---
-def analyze(audio_file_path: str, target_sentence: str) -> dict:
+# Deleted the old load_model() function and merged its logic in here.
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
     """
     Takes an audio file path and a target sentence and returns a detailed pronunciation-analysis dict.
-    …
+    The model is loaded and kept in this function's own 'cache', isolating its state.
     """
-    …
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_en_us). Loading model '{MODEL_NAME}'...")
+        try:
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while processing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's private cache
+    processor = cache["processor"]
+    model = cache["model"]
+
+    # --- The original analysis logic below is unchanged ---
     target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
 
-    # 【KEY CHANGE HERE】
-    # Strip all stress and length marks before tokenizing, to match the ASR output
     target_ipa_by_word = [
         _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
        for word in target_ipa_by_word_str
@@ -106,7 +88,7 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
     return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
 
 
-# --- 4. Alignment function (…
+# --- 4. Alignment function (unchanged) ---
 def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     """
     (Modified) Performs phoneme alignment using the new tokenization logic.
@@ -161,7 +143,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
 
     return alignments_by_word
 
-# --- 5. Formatting function (…
+# --- 5. Formatting function (unchanged) ---
 def _format_to_json_structure(alignments, sentence, original_words) -> dict:
     total_phonemes = 0
     total_errors = 0
@@ -205,7 +187,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
     total_words = len(original_words)
     if len(alignments) < total_words:
         for i in range(len(alignments), total_words):
-            # Make sure 'ː' is stripped here as well
             missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
             missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
             phonemes_data = []
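
The two-character lookahead in _tokenize_ipa is what keeps affricates and diphthongs together as single phonemes. A standalone check, copying the phoneme set from the file above (the sample string is made up):

MULTI_CHAR_PHONEMES = {'tʃ', 'dʒ', 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', 'ɪə', 'eə', 'ʊə', 'ər'}

def _tokenize_ipa(ipa_string: str) -> list:
    # Greedy left-to-right scan: try a two-character phoneme first, else one character
    phonemes, i = [], 0
    s = ipa_string.replace(' ', '')
    while i < len(s):
        if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
            phonemes.append(s[i:i+2]); i += 2
        else:
            phonemes.append(s[i]); i += 1
    return phonemes

print(_tokenize_ipa('tʃeɪndʒ'))  # ['tʃ', 'eɪ', 'n', 'dʒ'] — "change"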
analyzer/ASR_en_us_v2.py
CHANGED
@@ -1,277 +1,256 @@
- … (277 lines removed; the file is rewritten in full below)
+import torch
+import soundfile as sf
+import librosa
+# 【Change #1: import AutoProcessor and AutoModelForCTC from transformers】
+from transformers import AutoProcessor, AutoModelForCTC
+import os
+from phonemizer import phonemize
+import numpy as np
+from datetime import datetime, timezone
+
+# --- Global settings (modified) ---
+# Removed the global processor and model variables.
+# Deleted the old load_model() function.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"INFO: ASR_en_us_v2.py is configured to use device: {DEVICE}")
+
+# 【Change #2: updated to the finally selected KoelLabs model name】
+MODEL_NAME = "KoelLabs/xlsr-english-01"
+
+# 【New code #1: an IPA normalizer designed for the KoelLabs model】
+# 【unchanged】
+def normalize_koel_ipa(raw_phonemes: list) -> list:
+    """
+    Normalizes the rich IPA sequence emitted by the KoelLabs model into a base IPA sequence comparable with eSpeak output.
+    """
+    normalized_phonemes = []
+    for phoneme in raw_phonemes:
+        if not phoneme:
+            continue
+
+        base_phoneme = phoneme.replace('ʰ', '').replace('̃', '').replace('̥', '')
+
+        if base_phoneme == 'β':
+            base_phoneme = 'v'
+        elif base_phoneme in ['x', 'ɣ', 'ɦ']:
+            base_phoneme = 'h'
+
+        normalized_phonemes.append(base_phoneme)
+
+    return normalized_phonemes
+
+# --- 2. Smart IPA tokenization function (identical to your original logic) ---
+# 【unchanged】
+MULTI_CHAR_PHONEMES = {
+    'tʃ', 'dʒ',
+    'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ',
+    'ɪə', 'eə', 'ʊə', 'ər'
+}
+
+def _tokenize_ipa(ipa_string: str) -> list:
+    """
+    Smartly splits an IPA string into a list of phonemes, handling multi-character phonemes correctly.
+    """
+    phonemes = []
+    i = 0
+    s = ipa_string.replace(' ', '')
+    while i < len(s):
+        if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
+            phonemes.append(s[i:i+2])
+            i += 2
+        else:
+            phonemes.append(s[i])
+            i += 1
+    return phonemes
+
+# --- 3. Core analysis function (main entry point) (modified to integrate the normalizer and cache logic) ---
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
+    """
+    Takes an audio file path and a target sentence and returns a detailed pronunciation-analysis dict.
+    The model is loaded and kept in this function's own 'cache', isolating its state.
+    """
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_en_us_v2). Loading model '{MODEL_NAME}'...")
+        try:
+            # 【Change #3: load the model with AutoProcessor and AutoModelForCTC】
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = AutoProcessor.from_pretrained(MODEL_NAME)
+            cache["model"] = AutoModelForCTC.from_pretrained(MODEL_NAME)
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while processing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's private cache
+    processor = cache["processor"]
+    model = cache["model"]
+
+    # --- The original analysis logic below is unchanged ---
+    target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
+
+    target_ipa_by_word = [
+        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
+        for word in target_ipa_by_word_str
+    ]
+    target_words_original = target_sentence.split()
+
+    try:
+        speech, sample_rate = sf.read(audio_file_path)
+        if sample_rate != 16000:
+            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
+    except Exception as e:
+        raise IOError(f"Error while reading or processing the audio: {e}")
+
+    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
+    input_values = input_values.to(DEVICE)
+    with torch.no_grad():
+        logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+
+    # 【Change #4: the normalization step is inserted here】
+    # 【unchanged】
+    raw_user_ipa_str = processor.decode(predicted_ids[0])
+    raw_user_phonemes = raw_user_ipa_str.split(' ')
+    normalized_user_phonemes = normalize_koel_ipa(raw_user_phonemes)
+    user_ipa_full = "".join(normalized_user_phonemes)
+
+    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
+
+    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
+
+
+# --- 4. Alignment function (identical to your original logic) ---
+# 【unchanged】
+def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
+    """
+    (Modified) Performs phoneme alignment using the new tokenization logic.
+    """
+    user_phonemes = _tokenize_ipa(user_phoneme_str)
+
+    target_phonemes_flat = []
+    word_boundaries_indices = []
+    current_idx = 0
+    for word_ipa_tokens in target_words_ipa_tokenized:
+        target_phonemes_flat.extend(word_ipa_tokens)
+        current_idx += len(word_ipa_tokens)
+        word_boundaries_indices.append(current_idx - 1)
+
+    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
+    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
+    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
+    for i in range(1, len(user_phonemes) + 1):
+        for j in range(1, len(target_phonemes_flat) + 1):
+            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
+            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
+
+    i, j = len(user_phonemes), len(target_phonemes_flat)
+    user_path, target_path = [], []
+    while i > 0 or j > 0:
+        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
+        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
+            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
+        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
+            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
+        else:
+            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
+
+    alignments_by_word = []
+    word_start_idx_in_path = 0
+    target_phoneme_counter_in_path = 0
+
+    for path_idx, p in enumerate(target_path):
+        if p != '-':
+            if target_phoneme_counter_in_path in word_boundaries_indices:
+                target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
+                user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
+
+                alignments_by_word.append({
+                    "target": target_alignment,
+                    "user": user_alignment
+                })
+
+                word_start_idx_in_path = path_idx + 1
+
+            target_phoneme_counter_in_path += 1
+
+    return alignments_by_word
+
+# --- 5. Formatting function (identical to your original logic) ---
+# 【unchanged】
+def _format_to_json_structure(alignments, sentence, original_words) -> dict:
+    total_phonemes = 0
+    total_errors = 0
+    correct_words_count = 0
+    words_data = []
+
+    num_words_to_process = min(len(alignments), len(original_words))
+
+    for i in range(num_words_to_process):
+        alignment = alignments[i]
+        word_is_correct = True
+        phonemes_data = []
+
+        for j in range(len(alignment['target'])):
+            target_phoneme = alignment['target'][j]
+            user_phoneme = alignment['user'][j]
+            is_match = (user_phoneme == target_phoneme)
+
+            phonemes_data.append({
+                "target": target_phoneme,
+                "user": user_phoneme,
+                "isMatch": is_match
+            })
+
+            if not is_match:
+                word_is_correct = False
+                if not (user_phoneme == '-' and target_phoneme == '-'):
+                    total_errors += 1
+
+        if word_is_correct:
+            correct_words_count += 1
+
+        words_data.append({
+            "word": original_words[i],
+            "isCorrect": word_is_correct,
+            "phonemes": phonemes_data
+        })
+
+        total_phonemes += sum(1 for p in alignment['target'] if p != '-')
+
+    total_words = len(original_words)
+    if len(alignments) < total_words:
+        for i in range(len(alignments), total_words):
+            missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
+            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
+            phonemes_data = []
+            for p_ipa in missed_word_ipa:
+                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
+                total_errors += 1
+                total_phonemes += 1
+
+            words_data.append({
+                "word": original_words[i],
+                "isCorrect": False,
+                "phonemes": phonemes_data
+            })
+
+    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
+    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
+
+    final_result = {
+        "sentence": sentence,
+        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
+        "summary": {
+            "overallScore": round(overall_score, 1),
+            "totalWords": total_words,
+            "correctWords": correct_words_count,
+            "phonemeErrorRate": round(phoneme_error_rate, 2),
+            "total_errors": total_errors,
+            "total_target_phonemes": total_phonemes
+        },
+        "words": words_data
+    }
+
+    return final_result
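A standalone check of the normalize_koel_ipa behaviour defined above, on an invented model output: aspiration, nasalization, and voicelessness diacritics are stripped, and a few non-English consonants map to near-English ones:

def normalize_koel_ipa(raw_phonemes):
    out = []
    for ph in raw_phonemes:
        if not ph:
            continue  # skip empty tokens from the space-split
        base = ph.replace('ʰ', '').replace('̃', '').replace('̥', '')
        if base == 'β':
            base = 'v'
        elif base in ['x', 'ɣ', 'ɦ']:
            base = 'h'
        out.append(base)
    return out

print(normalize_koel_ipa(['tʰ', 'ɛ̃', 'β', 'x', '', 's']))  # ['t', 'ɛ', 'v', 'h', 's']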
analyzer/ASR_en_us_v3.py
DELETED
@@ -1,320 +0,0 @@
-# ASR_en_us_v3.py
-
-import torch
-import soundfile as sf
-import librosa
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-import os
-from phonemizer import phonemize
-import numpy as np
-from datetime import datetime, timezone
-
-# --- Global settings ---
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"INFO: ASR_en_us_v3.py is configured to use device: {DEVICE}")
-
-# 【Key change #1: updated to the finally selected model name】
-MODEL_NAME = "facebook/wav2vec2-lv-60-espeak-cv-ft"
-
-processor = None
-model = None
-
-# 【New code #1: dictionaries for the IPA purifier】
-
-# Step 1a: define an authoritative, accepted set of "standard American English IPA symbols"
-# This set is our "whitelist"
-VALID_ENGLISH_IPA = {
-    # Vowels
-    'i', 'ɪ', 'e', 'ɛ', 'æ', 'a', 'ɑ', 'ɔ', 'o', 'ʊ', 'u', 'ʌ', 'ə', 'ɐ', 'ᵻ',
-    # R-colored vowels
-    'ɚ', 'ɝ',
-    # Diphthongs
-    'aɪ', 'aʊ', 'ɔɪ', 'eɪ', 'oʊ', 'iə', 'eə', 'ʊə', 'ɛɹ', 'ɪɹ', 'ʊɹ', 'aɪɚ', 'aɪə',
-    # Consonants
-    'p', 'b', 't', 'd', 'k', 'ɡ', 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h', 'm', 'n', 'ŋ', 'l', 'ɹ', 'w', 'j',
-    # Affricates
-    'tʃ', 'dʒ',
-    # other common variants
-    'ɾ', 'ʔ', 'ɫ', 'n̩', 'l̩', 'r̩'
-}
-
-# Step 1b: build the "foreign-to-English" mapping-rule dictionary
-# This is our "watch list", i.e. the "blacklist conversion rules"
-NON_ENGLISH_TO_ENGLISH_MAP = {
-    # common European-language variants
-    'ʁ': 'ɹ', 'r': 'ɹ', 'β': 'v', 'x': 'h', 'ɣ': 'ɡ', 'ç': 'h', 'y': 'i', 'ø': 'e', 'œ': 'ɛ', 'ɒ': 'ɑ', 'əʊ': 'oʊ',
-    # nasal vowels (nasalization removed)
-    'ɑ̃': 'ɑ', 'ɔ̃': 'ɔ', 'ɛ̃': 'ɛ', 'œ̃': 'ɛ', 'ɐ̃': 'ɐ', 'õ': 'o', 'ĩ': 'i', 'ũ': 'u',
-    # common Asian/Slavic sounds (mapped to the closest English sound)
-    'ɕ': 'ʃ', 'tɕ': 'tʃ', 'ʂ': 'ʃ', 'ʐ': 'ʒ', 'dʑ': 'dʒ',
-    # Hindi retroflexes (retroflexion removed)
-    'ʈ': 't', 'ɖ': 'd', 'ɳ': 'n', 'ɭ': 'l', 'ɽ': 'ɾ',
-    # Arabic sounds
-    'ʕ': 'ʔ', 'ħ': 'h', 'q': 'k',
-    # others...
-    'ʎ': 'j', 'ɲ': 'n', 'ʋ': 'v', 'c': 'k', 'ɟ': 'ɡ', 'ɸ': 'f', 'χ': 'h',
-}
-
-def load_model():
-    """
-    Loads Facebook's Wav2Vec2 espeak ASR model.
-    """
-    global processor, model
-    if processor and model:
-        print(f"Model '{MODEL_NAME}' already loaded; skipping.")
-        return True
-
-    print(f"Preparing ASR model '{MODEL_NAME}'...")
-    try:
-        processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-        model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-
-        model.to(DEVICE)
-        print(f"Model '{MODEL_NAME}' and its processor loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error while processing or loading model '{MODEL_NAME}': {e}")
-        raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
-
-# 【New code #2: the IPA purifier function】
-def purify_ipa_sequence(raw_phonemes: list) -> list:
-    """
-    Purifies a phoneme sequence that may contain foreign IPA into one that contains only standard English IPA.
-    """
-    purified_phonemes = []
-    for phoneme in raw_phonemes:
-        if not phoneme:  # skip empty strings
-            continue
-
-        # 1. If the phoneme is already valid English IPA, accept it as-is
-        if phoneme in VALID_ENGLISH_IPA:
-            purified_phonemes.append(phoneme)
-            continue
-
-        # 2. If the phoneme is in our mapping dictionary, substitute it
-        if phoneme in NON_ENGLISH_TO_ENGLISH_MAP:
-            replacement = NON_ENGLISH_TO_ENGLISH_MAP[phoneme]
-            purified_phonemes.append(replacement)
-            # print(f"INFO: Replaced non-English IPA '{phoneme}' with '{replacement}'.")  # optional logging
-            continue
-
-        # 3. Handle phonemes with diacritics (e.g. length 'ː', palatalization 'ʲ')
-        # Simplification: strip the diacritics and check whether what remains is valid
-        base_phoneme = phoneme.replace('ː', '').replace('ʲ', '').replace('ʰ', '')
-        if base_phoneme in VALID_ENGLISH_IPA:
-            purified_phonemes.append(base_phoneme)
-            # print(f"INFO: Stripped diacritics from '{phoneme}' to '{base_phoneme}'.")  # optional logging
-            continue
-
-        # 4. If it is still unrecognized after all the steps above, ignore the phoneme as a last resort
-        # print(f"WARNING: Unknown IPA phoneme '{phoneme}' encountered and was ignored.")  # optional logging
-
-    return purified_phonemes
-
-# --- 2. Smart IPA tokenization function (identical to your original logic) ---
-MULTI_CHAR_PHONEMES = {
-    'tʃ', 'dʒ', 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', 'ɪə', 'eə', 'ʊə', 'ər',
-    # combinations added for the Facebook model's output
-    'ɑː', 'iː', 'uː', 'ɔː', 'ɜː', 'oː', 'eː', 'yː', 'øː', 'œː', 'ɛː', 'æː',
-    'ɑːɹ', 'ɔːɹ', 'oːɹ', 'ɛɹ', 'ɪɹ', 'ʊɹ', 'aɪɚ', 'aɪə'
-}
-
-def _tokenize_ipa(ipa_string: str) -> list:
-    """
-    Smartly splits an IPA string into a list of phonemes, handling multi-character phonemes correctly.
-    """
-    phonemes = []
-    i = 0
-    s = ipa_string.replace(' ', '')
-    while i < len(s):
-        # check three-character combinations first (e.g. ɑːɹ)
-        if i + 2 < len(s) and s[i:i+3] in MULTI_CHAR_PHONEMES:
-            phonemes.append(s[i:i+3])
-            i += 3
-        # then check two-character combinations
-        elif i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
-            phonemes.append(s[i:i+2])
-            i += 2
-        else:
-            phonemes.append(s[i])
-            i += 1
-    return phonemes
-
-# --- 3. Core analysis function (main entry point) (modified to integrate the purifier) ---
-def analyze(audio_file_path: str, target_sentence: str) -> dict:
-    """
-    Takes an audio file path and a target sentence and returns a detailed pronunciation-analysis dict.
-    """
-    if not processor or not model:
-        raise RuntimeError("Model not loaded. Make sure load_model() ran successfully before calling analyze().")
-
-    # Step 1: get the target IPA (same as the original logic)
-    target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
-
-    # 【Key change #2: follow your cleanup logic for the target IPA exactly】
-    # Strip all stress and length marks before tokenizing
-    target_ipa_by_word = [
-        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
-        for word in target_ipa_by_word_str
-    ]
-    target_words_original = target_sentence.split()
-
-    # Step 2: read and resample the audio (same as the original logic)
-    try:
-        speech, sample_rate = sf.read(audio_file_path)
-        if sample_rate != 16000:
-            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
-    except Exception as e:
-        raise IOError(f"Error while reading or processing the audio: {e}")
-
-    # Step 3: run prediction with the Wav2Vec2 model
-    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
-    input_values = input_values.to(DEVICE)
-    with torch.no_grad():
-        logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-
-    # Step 4: decode into the raw, possibly mixed phoneme sequence
-    raw_user_ipa_str = processor.batch_decode(predicted_ids[0])[0]
-    raw_user_phonemes = raw_user_ipa_str.split(' ')
-
-    # 【Key change #3: insert the purification step here】
-    purified_user_phonemes = purify_ipa_sequence(raw_user_phonemes)
-    user_ipa_full = "".join(purified_user_phonemes)
-
-    # Step 5: align phonemes using the purified IPA (the logic from here on is identical to the original)
-    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
-
-    # Step 6: format into the final JSON structure (identical to the original logic)
-    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
-
-
-# --- 4. Alignment function (identical to your original logic) ---
-def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
-    user_phonemes = _tokenize_ipa(user_phoneme_str)
-
-    target_phonemes_flat = []
-    word_boundaries_indices = []
-    current_idx = 0
-    for word_ipa_tokens in target_words_ipa_tokenized:
-        target_phonemes_flat.extend(word_ipa_tokens)
-        current_idx += len(word_ipa_tokens)
-        word_boundaries_indices.append(current_idx - 1)
-
-    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
-    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
-    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
-    for i in range(1, len(user_phonemes) + 1):
-        for j in range(1, len(target_phonemes_flat) + 1):
-            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
-            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
-
-    i, j = len(user_phonemes), len(target_phonemes_flat)
-    user_path, target_path = [], []
-    while i > 0 or j > 0:
-        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
-        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
-            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
-        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
-            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
-        else:
-            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
-
-    alignments_by_word = []
-    word_start_idx_in_path = 0
-    target_phoneme_counter_in_path = 0
-
-    for path_idx, p in enumerate(target_path):
-        if p != '-':
-            if target_phoneme_counter_in_path in word_boundaries_indices:
-                target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
-                user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
-
-                alignments_by_word.append({
-                    "target": target_alignment,
-                    "user": user_alignment
-                })
-
-                word_start_idx_in_path = path_idx + 1
-
-            target_phoneme_counter_in_path += 1
-
-    return alignments_by_word
-
-# --- 5. Formatting function (identical to your original logic) ---
-def _format_to_json_structure(alignments, sentence, original_words) -> dict:
-    total_phonemes = 0
-    total_errors = 0
-    correct_words_count = 0
-    words_data = []
-
-    num_words_to_process = min(len(alignments), len(original_words))
-
-    for i in range(num_words_to_process):
-        alignment = alignments[i]
-        word_is_correct = True
-        phonemes_data = []
-
-        for j in range(len(alignment['target'])):
-            target_phoneme = alignment['target'][j]
-            user_phoneme = alignment['user'][j]
-            is_match = (user_phoneme == target_phoneme)
-
-            phonemes_data.append({
-                "target": target_phoneme,
-                "user": user_phoneme,
-                "isMatch": is_match
-            })
-
-            if not is_match:
-                word_is_correct = False
-                if not (user_phoneme == '-' and target_phoneme == '-'):
-                    total_errors += 1
-
-        if word_is_correct:
-            correct_words_count += 1
-
-        words_data.append({
-            "word": original_words[i],
-            "isCorrect": word_is_correct,
-            "phonemes": phonemes_data
-        })
-
-        total_phonemes += sum(1 for p in alignment['target'] if p != '-')
-
-    total_words = len(original_words)
-    if len(alignments) < total_words:
-        for i in range(len(alignments), total_words):
-            # 【Key change #4: follow your cleanup logic for missed words exactly】
-            missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
-            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
-            phonemes_data = []
-            for p_ipa in missed_word_ipa:
-                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
-                total_errors += 1
-                total_phonemes += 1
-
-            words_data.append({
-                "word": original_words[i],
-                "isCorrect": False,
-                "phonemes": phonemes_data
-            })
-
-    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
-    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
-
-    final_result = {
-        "sentence": sentence,
-        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
-        "summary": {
-            "overallScore": round(overall_score, 1),
-            "totalWords": total_words,
-            "correctWords": correct_words_count,
-            "phonemeErrorRate": round(phoneme_error_rate, 2),
-            "total_errors": total_errors,
-            "total_target_phonemes": total_phonemes
-        },
-        "words": words_data
-    }
-
-    return final_result
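For reference, the deleted file's purify_ipa_sequence applied a four-step policy: whitelist pass-through, blacklist mapping, diacritic stripping, and finally dropping anything unrecognized. A compact standalone sketch with toy whitelist/map sets (the real sets are the full VALID_ENGLISH_IPA and NON_ENGLISH_TO_ENGLISH_MAP above):

def purify(raw, valid, mapping):
    out = []
    for ph in raw:
        if not ph:
            continue
        if ph in valid:           # 1. already valid: keep
            out.append(ph); continue
        if ph in mapping:         # 2. known foreign sound: substitute
            out.append(mapping[ph]); continue
        base = ph.replace('ː', '').replace('ʲ', '').replace('ʰ', '')
        if base in valid:         # 3. valid once diacritics are stripped
            out.append(base)
        # 4. otherwise the phoneme is silently dropped
    return out

print(purify(['tʰ', 'r', 'iː', 'ʘ'], {'t', 'ɹ', 'i'}, {'r': 'ɹ'}))  # ['t', 'ɹ', 'i']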
analyzer/ASR_fr_fr.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
# ASR_fr_fr.py
|
| 2 |
-
|
| 3 |
import torch
|
| 4 |
import soundfile as sf
|
| 5 |
import librosa
|
|
@@ -17,86 +15,66 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 17 |
print(f"INFO: ASR_fr_fr.py is configured to use device: {DEVICE}")
|
| 18 |
|
| 19 |
# --- 1. 全域設定與模型載入函數 (已修改為法語模型) ---
|
|
|
|
|
|
|
| 20 |
MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"
|
| 21 |
|
| 22 |
-
processor = None
|
| 23 |
-
model = None
|
| 24 |
-
|
| 25 |
-
def load_model():
|
| 26 |
-
"""
|
| 27 |
-
(方案 A) 讓 transformers 自動處理模型的下載、快取和加載。
|
| 28 |
-
它會自動使用 Dockerfile 中設定的 HF_HOME 環境變數。
|
| 29 |
-
"""
|
| 30 |
-
global processor, model
|
| 31 |
-
if processor and model:
|
| 32 |
-
print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
|
| 33 |
-
return True
|
| 34 |
-
|
| 35 |
-
print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
|
| 36 |
-
print(f"Transformers 將自動在 HF_HOME 指定的快取中尋找或下載。")
|
| 37 |
-
try:
|
| 38 |
-
# 直接使用模型的線上名稱調用 from_pretrained
|
| 39 |
-
# 這就是魔法發生的地方!
|
| 40 |
-
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
|
| 41 |
-
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
|
| 42 |
-
|
| 43 |
-
model.to(DEVICE)
|
| 44 |
-
print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
|
| 45 |
-
return True
|
| 46 |
-
except Exception as e:
|
| 47 |
-
print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
|
| 48 |
-
raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
|
| 49 |
-
|
| 50 |
def _tokenize_unicode_ipa(ipa_string: str) -> list:
|
| 51 |
"""
|
| 52 |
智能地切分包含 Unicode 組合字元的 IPA 字串。
|
| 53 |
"""
|
| 54 |
phonemes = []
|
| 55 |
-
# 移除所有空格
|
| 56 |
s = ipa_string.replace(' ', '')
|
| 57 |
|
| 58 |
i = 0
|
| 59 |
while i < len(s):
|
| 60 |
-
# 獲取當前字元
|
| 61 |
current_char = s[i]
|
| 62 |
i += 1
|
| 63 |
-
|
| 64 |
-
while i < len(s) and unicodedata.category(s[i]) == 'Mn': # 'Mn' 代表非間距標記 (Non-Spacing Mark)
|
| 65 |
current_char += s[i]
|
| 66 |
i += 1
|
| 67 |
phonemes.append(current_char)
|
| 68 |
return phonemes
|
| 69 |
|
| 70 |
# --- 2. 核心分析函數 (主入口) (已修改為法語邏輯) ---
|
| 71 |
-
|
|
|
|
| 72 |
"""
|
| 73 |
接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
|
| 74 |
-
|
| 75 |
"""
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
target_words_original = re.findall(r"[\w'-]+", target_sentence)
|
| 82 |
-
# 將分割好的、乾淨的單詞重新組合,再傳給 phonemize
|
| 83 |
cleaned_sentence = " ".join(target_words_original)
|
| 84 |
|
| 85 |
-
# 使用 espeak 獲取法語目標音素
|
| 86 |
epi_fr = epitran.Epitran('fra-Latn')
|
| 87 |
target_ipa_full = epi_fr.transliterate(cleaned_sentence)
|
| 88 |
target_ipa_by_word_str = target_ipa_full.split()
|
| 89 |
|
| 90 |
-
# 【【【【【 確保兩個列表長度一致 】】】】】
|
| 91 |
if len(target_ipa_by_word_str) != len(target_words_original):
|
| 92 |
target_words_original = target_words_original[:len(target_ipa_by_word_str)]
|
| 93 |
|
| 94 |
-
# 對於法語,我們將特殊符號移除,並使用簡單的字元切分
|
| 95 |
target_ipa_by_word = [
|
| 96 |
_tokenize_unicode_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('‿', '').replace("'", ""))
|
| 97 |
for word in target_ipa_by_word_str
|
| 98 |
]
|
| 99 |
-
# target_words_original 已經在上面被正確賦值了
|
| 100 |
|
| 101 |
try:
|
| 102 |
speech, sample_rate = sf.read(audio_file_path)
|
|
@@ -122,7 +100,6 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
|
|
| 122 |
"""
|
| 123 |
執行音素對齊���對法語使用簡單的字元切分。
|
| 124 |
"""
|
| 125 |
-
# 對於 user 的音素字串,也使用簡單的字元切分
|
| 126 |
user_phonemes = _tokenize_unicode_ipa(user_phoneme_str)
|
| 127 |
|
| 128 |
target_phonemes_flat = []
|
|
@@ -217,7 +194,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
|
|
| 217 |
total_words = len(original_words)
|
| 218 |
if len(alignments) < total_words:
|
| 219 |
for i in range(len(alignments), total_words):
|
| 220 |
-
# 確保這裡也移除相關符號
|
| 221 |
missed_word_ipa_str = phonemize(original_words[i], language='fr-fr', backend='espeak', strip=True).replace('ˈ', '').replace('ˌ', '').replace('‿', '')
|
| 222 |
missed_word_ipa = _tokenize_unicode_ipa(missed_word_ipa_str)
|
| 223 |
phonemes_data = []
|
|
@@ -249,4 +225,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
|
|
| 249 |
"words": words_data
|
| 250 |
}
|
| 251 |
|
| 252 |
-
return final_result

(Right-hand side of the ASR_fr_fr.py diff: the updated file, shown in excerpts; added lines are marked with +.)

@@ +1,3 @@
 import torch
 import soundfile as sf
 import librosa

@@ +15,7 @@
 print(f"INFO: ASR_fr_fr.py is configured to use device: {DEVICE}")

 # --- 1. Global config and model-loading function (switched to the French model) ---
+# Removed the global processor and model variables; only the constants remain.
+# Deleted the old load_model() function.
 MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"

@@ +22,16 @@
 def _tokenize_unicode_ipa(ipa_string: str) -> list:
     """
     Intelligently tokenizes an IPA string that contains Unicode combining characters.
     """
     phonemes = []
     s = ipa_string.replace(' ', '')

     i = 0
     while i < len(s):
         current_char = s[i]
         i += 1
+        while i < len(s) and unicodedata.category(s[i]) == 'Mn':
             current_char += s[i]
             i += 1
         phonemes.append(current_char)
     return phonemes

@@ +39,42 @@
 # --- 2. Core analysis function (main entry point) (switched to French logic) ---
+# Model loading and caching logic has been merged in here.
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
     """
     Takes an audio file path and a target sentence and returns a detailed pronunciation-analysis dict.
+    The model is loaded into and kept in this function's own 'cache', isolating its state.
     """
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_fr_fr). Loading model '{MODEL_NAME}'...")
+        try:
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's own cache
+    processor = cache["processor"]
+    model = cache["model"]
+
+    # --- The original analysis logic below is unchanged ---
     target_words_original = re.findall(r"[\w'-]+", target_sentence)
     cleaned_sentence = " ".join(target_words_original)

     epi_fr = epitran.Epitran('fra-Latn')
     target_ipa_full = epi_fr.transliterate(cleaned_sentence)
     target_ipa_by_word_str = target_ipa_full.split()

     if len(target_ipa_by_word_str) != len(target_words_original):
         target_words_original = target_words_original[:len(target_ipa_by_word_str)]

     target_ipa_by_word = [
         _tokenize_unicode_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('‿', '').replace("'", ""))
         for word in target_ipa_by_word_str
     ]

     try:
         speech, sample_rate = sf.read(audio_file_path)

@@ +100,6 @@
     """
     Performs phoneme alignment; for French, simple character-level tokenization is used.
     """
     user_phonemes = _tokenize_unicode_ipa(user_phoneme_str)

     target_phonemes_flat = []
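The commit's core trick, used by every analyzer in this change: a mutable default argument is created once, when the def statement runs, and is shared by all later calls, so cache: dict = {} behaves as per-function static storage and keeps the loaded weights in RAM. A minimal, self-contained sketch of the pattern (load_weights is a stand-in for the real from_pretrained call):

def load_weights():
    print("expensive load happens exactly once")
    return lambda data: data  # stand-in for a real model

def analyze(data, cache: dict = {}):
    # The default dict persists across calls, acting as this function's private cache
    if "model" not in cache:
        cache["model"] = load_weights()
    return cache["model"](data)

analyze(1)  # triggers the load
analyze(2)  # cache hit: no reload

Because each language module defines its own analyze, each language gets an isolated cache; passing an explicit dict (analyze(x, cache={})) forces a fresh load, which can be useful in tests.
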
analyzer/ASR_jp_jp.py
CHANGED
@@ -1,5 +1,3 @@
-# ASR_jp_jp.py
-
 # =======================================================================
 # 1. Imports
 # - Added pyopenjtalk and MeCab

@@ -17,6 +15,7 @@ import re
 
 # =======================================================================
 # 2. Global variables & config
+# [Modified] Removed the global processor and model variables.
 # =======================================================================
 # Auto-detect the available device
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

@@ -25,9 +24,6 @@ print(f"INFO: ASR_jp_jp.py is configured to use device: {DEVICE}")
 # Set to the Japanese ASR model
 MODEL_NAME = "prj-beatrice/japanese-hubert-base-phoneme-ctc-v3"
 
-processor = None
-model = None
-
 # Initialize the MeCab tokenizer
 # The -Owakati option outputs space-separated words directly, which is very convenient
 try:

@@ -42,30 +38,12 @@ except RuntimeError:
 
 # -----------------------------------------------------------------------
 # 3.1. Model-loading function
-#
+# [Deleted] The old load_model() function has been removed.
 # -----------------------------------------------------------------------
-def load_model():
-    """
-    Load the Japanese ASR model (HubertForCTC) and its processor.
-    """
-    global processor, model
-    if processor and model:
-        print(f"Model '{MODEL_NAME}' already loaded; skipping.")
-        return True
-
-    print(f"Preparing ASR model '{MODEL_NAME}'...")
-    try:
-        processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-        model = HubertForCTC.from_pretrained(MODEL_NAME)  # <-- uses HubertForCTC
-        model.to(DEVICE)
-        print(f"Model '{MODEL_NAME}' and processor loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
-        raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
 
 # -----------------------------------------------------------------------
 # 3.2. Japanese G2P helper (the key change in this file)
+# [Unchanged]
 # -----------------------------------------------------------------------
 def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
     if not mecab_tagger:

@@ -82,8 +60,6 @@ def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
 
         phonemes_str = pyopenjtalk.g2p(word, kana=False)
 
-        # [Final fix] Do not clean any phonemes at all; use the raw output directly
-        # Only do basic whitespace normalization
         cleaned_phonemes = re.sub(r'\s+', ' ', phonemes_str).strip()
 
         phoneme_list = cleaned_phonemes.split()

@@ -96,6 +72,7 @@ def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
 
 # -----------------------------------------------------------------------
 # 3.3. Phoneme tokenization (for the ASR output)
+# [Unchanged]
 # -----------------------------------------------------------------------
 def _tokenize_asr_output(phoneme_string: str) -> list:
     """

@@ -106,26 +83,40 @@ def _tokenize_asr_output(phoneme_string: str) -> list:
 
 # -----------------------------------------------------------------------
 # 3.4. Core analysis function (main entry point)
+# [Modified] Model loading and caching logic has been merged in here.
 # -----------------------------------------------------------------------
-def analyze(audio_file_path: str, target_sentence: str) -> dict:
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
     """
     Takes an audio file path and a target Japanese sentence and returns a detailed pronunciation-analysis dict.
+    The model is loaded into and kept in this function's own 'cache', isolating its state.
     """
-
-
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_jp_jp). Loading model '{MODEL_NAME}'...")
+        try:
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            cache["model"] = HubertForCTC.from_pretrained(MODEL_NAME)  # <-- uses HubertForCTC
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's own cache
+    processor = cache["processor"]
+    model = cache["model"]
+
+    # --- The original analysis logic below is unchanged ---
 
     # [Key step 1: G2P]
-    # Use the new G2P function to get the target words and phonemes
     target_words_original, target_ipa_by_word = _get_target_phonemes_by_word(target_sentence)
 
-    # Handle the edge cases of an empty audio file or an empty sentence
     if not target_words_original:
         print("Warning: target sentence is empty after G2P.")
-        # Return an empty skeleton structure
         return _format_to_json_structure([], target_sentence, [])
 
     # [Key step 2: ASR]
-    # Load and process the audio
     try:
         speech, sample_rate = sf.read(audio_file_path)
         if len(speech) == 0:

@@ -135,7 +126,6 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
         if sample_rate != 16000:
             speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
 
-        # Run ASR inference
         input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
         input_values = input_values.to(DEVICE)
         with torch.no_grad():

@@ -147,50 +137,34 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
         raise IOError(f"Error while reading or processing the audio: {e}")
 
     # [Key step 3: Alignment]
-    # Run phoneme alignment
     word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
 
     # [Key step 4: Formatting]
-    # Format into the final JSON output
     return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
 
 # =======================================================================
 # 4. Alignment & formatting functions
-#
+# [Unchanged]
 # =======================================================================
 
 # -----------------------------------------------------------------------
 # 4.1. Alignment function (language-agnostic)
 # -----------------------------------------------------------------------
-# [[[ The final, decisive fix to the Japanese-version logic ]]]
 def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     """
     Performs phoneme alignment via dynamic programming. This function is language-agnostic.
     """
-    # [[[ Key change ]]]
-    # Old, incorrect approach: user_phonemes = user_phoneme_str.split()
-    # That only yields a list like ['a', 'sh', 'i', 't', 'a'].
-
-    # New, correct approach:
-    # 1. First split on spaces into "phoneme words".
-    # 2. Then fully expand each "phoneme word" into individual phoneme characters.
-    #    e.g. "a sh i t a" -> ['a', 'sh', 'i', 't', 'a'] -> ['a', 's', 'h', 'i', 't', 'a']
-    # This matches the effect of the English version's _tokenize_ipa(): tokenize down to the smallest units before aligning.
     user_phonemes = [char for word in user_phoneme_str.split() for char in word]
 
-    # --- The alignment logic below is completely unchanged ---
-
     target_phonemes_flat = []
     word_boundaries_indices = []
     current_idx = 0
     for word_ipa_tokens in target_words_ipa_tokenized:
-        # For the target, we also need to make sure it is in smallest units
         flat_tokens = [char for word in word_ipa_tokens for char in word]
         target_phonemes_flat.extend(flat_tokens)
         current_idx += len(flat_tokens)
         word_boundaries_indices.append(current_idx - 1)
 
-    # If the target phonemes are empty, return an empty alignment
     if not target_phonemes_flat:
         return []
 

@@ -261,7 +235,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
         word_is_correct = True
         phonemes_data = []
 
-        # Ensure alignment['target'] and alignment['user'] have the same length
         min_len = min(len(alignment['target']), len(alignment['user']))
         for j in range(min_len):
             target_phoneme = alignment['target'][j]

@@ -276,7 +249,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
 
             if not is_match:
                 word_is_correct = False
-                # Only count as an error when target and user are not both '-'
                 if not (user_phoneme == '-' and target_phoneme == '-'):
                     total_errors += 1
 

@@ -291,14 +263,12 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
 
         total_phonemes += sum(1 for p in alignment['target'] if p != '-')
 
-    # [Fuse logic] Handle the case where ASR yields fewer words than the target (the user skipped words)
     if len(alignments) < len(original_words):
         for i in range(len(alignments), len(original_words)):
-            # Re-derive the phonemes of the missed word
             _, missed_word_ipa_list = _get_target_phonemes_by_word(original_words[i])
 
             phonemes_data = []
-            if missed_word_ipa_list:
+            if missed_word_ipa_list:
                 for p_ipa in missed_word_ipa_list[0]:
                     phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
                     total_errors += 1
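
The removed comments above document the alignment fix worth keeping in mind: pyopenjtalk emits space-separated phonemes that may be multi-character ('sh', 'ts'), so both the user and target sequences are expanded to single characters before the DP alignment. A quick, dependency-free illustration of that expansion:

def flatten_to_chars(phoneme_str: str) -> list[str]:
    # "a sh i t a" -> ['a', 's', 'h', 'i', 't', 'a']
    return [char for word in phoneme_str.split() for char in word]

assert flatten_to_chars("a sh i t a") == ['a', 's', 'h', 'i', 't', 'a']

Splitting 'sh' into 's' plus 'h' sacrifices some phonetic fidelity, but it guarantees both sides of the alignment share the same smallest-unit alphabet.
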
analyzer/ASR_nl_nl.py
CHANGED
@@ -20,40 +20,17 @@ import unicodedata  # [Kept] The better approach for handling multilingual phonemes
 import re  # [Kept] For more accurate word splitting
 
 # --- 2. Global config & model loading ---
+# [Modified] Removed the global processor and model variables.
+# [Modified] Deleted the old load_model() function.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"INFO: ASR_nl_nl.py is configured to use device: {DEVICE}")
 
 # [Key change 1: set to the Dutch ASR model]
 MODEL_NAME = "Clementapa/wav2vec2-base-960h-phoneme-reco-dutch"
 
-processor = None
-model = None
-
-def load_model():
-    """
-    Load the Dutch ASR model and its processor.
-    (This function's logic is identical to en_us.py)
-    """
-    global processor, model
-    if processor and model:
-        print(f"Model '{MODEL_NAME}' already loaded; skipping.")
-        return True
-
-    print(f"Preparing ASR model '{MODEL_NAME}'...")
-    try:
-        processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-        model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-        model.to(DEVICE)
-        print(f"Model '{MODEL_NAME}' and processor loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
-        raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
-
 # --- 3. Smart IPA tokenization ---
 # [Key change 2: keep the superior generic tokenization logic]
-#
-# This optimization must be kept to fit with Dutch.
+# [Unchanged]
 def _tokenize_ipa(ipa_string: str) -> list:
     """
     Intelligently tokenizes an IPA string into a phoneme list, correctly handling combining characters with diacritics.

@@ -64,7 +41,6 @@ def _tokenize_ipa(ipa_string: str) -> list:
     while i < len(s):
         current_char = s[i]
         i += 1
-        # Check for and combine trailing non-spacing marks (e.g. diacritics)
         while i < len(s) and unicodedata.category(s[i]) == 'Mn':
             current_char += s[i]
             i += 1

@@ -72,16 +48,31 @@ def _tokenize_ipa(ipa_string: str) -> list:
     return phonemes
 
 # --- 4. Core analysis function (main entry point) ---
-def analyze(audio_file_path: str, target_sentence: str) -> dict:
+# [Modified] Model loading and caching logic has been merged in here.
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
     """
     Takes an audio file path and a target Dutch sentence and returns a detailed pronunciation-analysis dict.
-
+    The model is loaded into and kept in this function's own 'cache', isolating its state.
     """
-
-
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_nl_nl). Loading model '{MODEL_NAME}'...")
+        try:
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's own cache
+    processor = cache["processor"]
+    model = cache["model"]
 
+    # --- The original analysis logic below is unchanged ---
     # 1. Prepare the target phonemes (G2P)
-    # Use a regex to split words accurately; this is more robust than a plain .split()
     target_words_original = re.findall(r"[\w'-]+", target_sentence)
     cleaned_sentence = " ".join(target_words_original)
 

@@ -94,7 +85,6 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
         strip=True
     ).split()
 
-    # Robustness check: make sure the word and phoneme lists have the same length
     if len(target_words_original) != len(target_ipa_by_word_str):
         print(f"Warning: the word count after G2P ({len(target_ipa_by_word_str)}) does not match the original word count ({len(target_words_original)}). Truncating.")
         min_len = min(len(target_words_original), len(target_ipa_by_word_str))

@@ -122,7 +112,6 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
     predicted_ids = torch.argmax(logits, dim=-1)
 
     # [Key change 5: aligned with en_us.py; assume the model output is clean, or clean it when necessary]
-    # Remove the separator | the model may emit, and also remove length marks, to match the target-phoneme processing
     user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '').replace('ː', '')
 
     # 3. Run alignment and format the output

@@ -131,6 +120,7 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
 
 
 # --- 5. Alignment function (implementation fully aligned with en_us.py) ---
+# [Unchanged]
 def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     """
     Performs phoneme alignment via dynamic programming.

@@ -157,16 +147,12 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     i, j = len(user_phonemes), len(target_phonemes_flat)
     user_path, target_path = [], []
     while i > 0 or j > 0:
-        # Use the same, more concise backtracking logic as en_us.py
         cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
 
-        # Prefer a match/substitution
         if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
             user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
-        # Then a deletion (extra user phoneme)
         elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
             user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
-        # Finally an insertion (extra target phoneme)
         else:
             user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
 

@@ -192,6 +178,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     return alignments_by_word
 
 # --- 6. Formatting function (implementation fully aligned with en_us.py) ---
+# [Unchanged]
 def _format_to_json_structure(alignments, sentence, original_words) -> dict:
     """
     Formats the alignment results into the final JSON structure.

@@ -222,7 +209,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
 
             if not is_match:
                 word_is_correct = False
-                # Only count as an error when target and user are not both empty
                 if not (user_phoneme == '-' and target_phoneme == '-'):
                     total_errors += 1
 

@@ -237,7 +223,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
 
         total_phonemes += sum(1 for p in alignment['target'] if p != '-')
 
-    # Handle the case where the user skipped words
     if len(alignments) < len(original_words):
         for i in range(len(alignments), len(original_words)):
             # [Key change 6: make sure the G2P language and symbol cleanup here stay consistent too]
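The backtracking above is standard Levenshtein alignment: fill the edit-distance table, then walk back from the corner, preferring match/substitution, then deletion, then insertion, writing '-' for gaps. A self-contained sketch of the same idea (simplified; the real function also tracks word boundaries):

def align(user: list[str], target: list[str]) -> tuple[list[str], list[str]]:
    # Edit-distance table: dp[i][j] = cost of aligning user[:i] with target[:j]
    dp = [[0] * (len(target) + 1) for _ in range(len(user) + 1)]
    for i in range(len(user) + 1):
        dp[i][0] = i
    for j in range(len(target) + 1):
        dp[0][j] = j
    for i in range(1, len(user) + 1):
        for j in range(1, len(target) + 1):
            cost = 0 if user[i-1] == target[j-1] else 1
            dp[i][j] = min(dp[i-1][j-1] + cost, dp[i-1][j] + 1, dp[i][j-1] + 1)

    # Backtrack: prefer match/substitution, then deletion, then insertion
    user_path, target_path = [], []
    i, j = len(user), len(target)
    while i > 0 or j > 0:
        cost = float('inf') if i == 0 or j == 0 else (0 if user[i-1] == target[j-1] else 1)
        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
            user_path.insert(0, user[i-1]); target_path.insert(0, target[j-1]); i -= 1; j -= 1
        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
            user_path.insert(0, user[i-1]); target_path.insert(0, '-'); i -= 1
        else:
            user_path.insert(0, '-'); target_path.insert(0, target[j-1]); j -= 1
    return user_path, target_path

# align(list("kat"), list("kaat")) -> (['k', '-', 'a', 't'], ['k', 'a', 'a', 't'])
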
analyzer/ASR_pt_br.py
CHANGED
@@ -20,40 +20,17 @@ import unicodedata  # [Kept] The better approach for handling phonemes such as Portuguese nasals
 import re  # [Kept] For more accurate word splitting
 
 # --- 2. Global config & model loading ---
+# [Modified] Removed the global processor and model variables.
+# [Modified] Deleted the old load_model() function.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"INFO: ASR_pt_br.py is configured to use device: {DEVICE}")
 
 # [Key change 1: set to the Portuguese ASR model]
 MODEL_NAME = "caiocrocha/wav2vec2-large-xlsr-53-phoneme-portuguese"
 
-processor = None
-model = None
-
-def load_model():
-    """
-    Load the Portuguese ASR model and its processor.
-    (This function's logic is identical to en_us.py)
-    """
-    global processor, model
-    if processor and model:
-        print(f"Model '{MODEL_NAME}' already loaded; skipping.")
-        return True
-
-    print(f"Preparing ASR model '{MODEL_NAME}'...")
-    try:
-        processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-        model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-        model.to(DEVICE)
-        print(f"Model '{MODEL_NAME}' and processor loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
-        raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
-
 # --- 3. Smart IPA tokenization ---
 # [Key change 2: keep the superior generic tokenization logic]
-#
-# This tokenizer, more powerful than the English version's, must be kept.
+# [Unchanged]
 def _tokenize_ipa(ipa_string: str) -> list:
     """
     Intelligently tokenizes an IPA string into a phoneme list, correctly handling combining characters with diacritics.

@@ -62,13 +39,11 @@ def _tokenize_ipa(ipa_string: str) -> list:
     s = ipa_string.replace(' ', '')
     i = 0
     while i < len(s):
-        # Handle the two-character affricates common in Portuguese first
         if i + 1 < len(s) and s[i:i+2] in {'dʒ', 'tʃ'}:
             phonemes.append(s[i:i+2])
             i += 2
             continue
 
-        # Handle the base character and its trailing non-spacing marks (e.g. the nasalization mark ~)
         current_char = s[i]
         i += 1
         while i < len(s) and unicodedata.category(s[i]) == 'Mn':

@@ -78,14 +53,30 @@ def _tokenize_ipa(ipa_string: str) -> list:
     return phonemes
 
 # --- 4. Core analysis function (main entry point) ---
-def analyze(audio_file_path: str, target_sentence: str) -> dict:
+# [Modified] Model loading and caching logic has been merged in here.
+def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
     """
     Takes an audio file path and a target Portuguese sentence and returns a detailed pronunciation-analysis dict.
-
+    The model is loaded into and kept in this function's own 'cache', isolating its state.
     """
-
-
-
+    # Check whether the model is already cached; load it if not
+    if "model" not in cache:
+        print(f"Cache miss (ASR_pt_br). Loading model '{MODEL_NAME}'...")
+        try:
+            # Load the model and store it in this function's cache dict
+            cache["processor"] = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            cache["model"] = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+            cache["model"].to(DEVICE)
+            print(f"Model '{MODEL_NAME}' loaded and cached.")
+        except Exception as e:
+            print(f"Error while preparing or loading model '{MODEL_NAME}': {e}")
+            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
+
+    # Fetch the model and processor from this function's own cache
+    processor = cache["processor"]
+    model = cache["model"]
+
+    # --- The original analysis logic below is unchanged ---
     # 1. Prepare the target phonemes (G2P)
     target_words_original = re.findall(r"[\w'-]+", target_sentence)
     cleaned_sentence = " ".join(target_words_original)

@@ -134,6 +125,7 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
 
 
 # --- 5. Alignment function (implementation fully aligned with en_us.py) ---
+# [Unchanged]
 def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     """
     Performs phoneme alignment via dynamic programming.

@@ -185,6 +177,7 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
     return alignments_by_word
 
 # --- 6. Formatting function (implementation fully aligned with en_us.py) ---
+# [Unchanged]
 def _format_to_json_structure(alignments, sentence, original_words) -> dict:
     """
     Formats the alignment results into the final JSON structure.

@@ -242,4 +235,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
         "total_target_phonemes": total_phonemes
     },
     "words": words_data
-}
+}
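The Portuguese tokenizer differs from the generic one in a single branch: it greedily consumes the digraph affricates 'dʒ' and 'tʃ' before falling back to base-character-plus-combining-marks grouping. A small self-contained version showing the intended behavior (example input chosen for illustration):

import unicodedata

def tokenize_ipa(s: str) -> list[str]:
    s = s.replace(' ', '')
    phonemes, i = [], 0
    while i < len(s):
        # Greedily consume the Portuguese affricates first
        if s[i:i+2] in {'dʒ', 'tʃ'}:
            phonemes.append(s[i:i+2])
            i += 2
            continue
        # Otherwise take one base character plus any combining marks (Mn), e.g. nasalization
        current = s[i]
        i += 1
        while i < len(s) and unicodedata.category(s[i]) == 'Mn':
            current += s[i]
            i += 1
        phonemes.append(current)
    return phonemes

print(tokenize_ipa('tʃĩ'))  # ['tʃ', 'ĩ'] when 'ĩ' is encoded as i + U+0303 (combining tilde)

One caveat: the combining-mark branch only fires for decomposed input, so normalizing with unicodedata.normalize('NFD', s) first would arguably make the behavior encoding-independent.
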
main.py
CHANGED
@@ -77,7 +77,6 @@ async def lifespan(app: FastAPI):
         try:
             print(f"--- Loading model for language: {lang} ---")
             analyzer_module = importlib.import_module(f"analyzer.ASR_{lang}")
-            analyzer_module.load_model()
             ANALYZERS[lang] = analyzer_module
             print(f"--- Model for {lang} loaded successfully. ---")
         except Exception as e:

@@ -127,7 +126,6 @@ def get_analyzer_module(language: str):
     print(f"'{language}' not in cache. Loading on-demand (development mode)...")
     try:
         analyzer_module = importlib.import_module(f"analyzer.ASR_{language}")
-        analyzer_module.load_model()
         ANALYZERS[language] = analyzer_module
         print(f"'{language}' analyzer loaded and cached successfully.")
         return analyzer_module
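
With load_model() gone, importing an analyzer module at startup is cheap; the first analyze() call per language pays the one-time load, and every later call hits the function's cache. A hypothetical caller illustrating the new flow (not part of this diff):

import importlib

def run_analysis(language: str, audio_path: str, sentence: str) -> dict:
    # Importing the module no longer loads any weights
    analyzer = importlib.import_module(f"analyzer.ASR_{language}")
    # The first call per language loads and caches the model; later calls reuse it
    return analyzer.analyze(audio_path, sentence)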