Spaces:

HK0712
/

FYP_ASR_Service

Running

App Files Community

HK0712 committed on Sep 28

Commit

13d62bd

1 Parent(s): 05d5403

download git (docker)

Browse files

Files changed (5) hide show

.gitignore +21 -21
Dockerfile +26 -25
analyzer/ASR_en_us.py +239 -239
cmudict_ipa.json +0 -0
requirements.txt +9 -9

.gitignore CHANGED Viewed

@@ -1,21 +1,21 @@
-# Python
-__pycache__/
-*.pyc
-*.pyo
-*.pyd
-.env
-venv/
-env/
-# IDE / Editor
-.vscode/
-.idea/
-# ASR Models (非常重要，模型檔案通常很大)
-ASRs/
-# Temporary files
-temp_audio/
-# macOS
-.DS_Store

+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.env
+venv/
+env/
+# IDE / Editor
+.vscode/
+.idea/
+# ASR Models (非常重要，模型檔案通常很大)
+ASRs/
+# Temporary files
+temp_audio/
+# macOS
+.DS_Store

Dockerfile CHANGED Viewed

@@ -1,25 +1,26 @@
-# 1. 選擇一個包含 Python 的官方 Linux 映像
-FROM python:3.10-slim
-# 2. 設定容器內的工作目錄
-WORKDIR /app
-# 3. 安裝系統級依賴 (最關鍵的一步：安裝 espeak-ng 和其他工具)
-#    -y 自動回答 'yes'
-#    --no-install-recommends 避免安裝不必要的建議套件，保持映像檔小巧
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    espeak-ng \
-    libsndfile1 \
-    ffmpeg \
-    wget && \
-    rm -rf /var/lib/apt/lists/*
-# 4. 複製 requirements.txt 檔案到容器中並安裝 Python 套件
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-# 5. 將專案中的所有其他檔案複製到容器中
-COPY . .
-# 這行是可選的，它設定了當容器直接執行時的預設命令
-# CMD ["python", "your_script.py"]

+# 1. 選擇一個包含 Python 的官方 Linux 映像
+FROM python:3.10-slim
+# 2. 設定容器內的工作目錄
+WORKDIR /app
+# 3. 安裝系統級依賴 (最關鍵的一步：安裝 espeak-ng、git 和其他工具)
+#    -y 自動回答 'yes'
+#    --no-install-recommends 避免安裝不必要的建議套件，保持映像檔小巧
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    espeak-ng \
+    libsndfile1 \
+    ffmpeg \
+    wget \
+    git && \
+    rm -rf /var/lib/apt/lists/*
+# 4. 複製 requirements.txt 檔案到容器中並安裝 Python 套件
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# 5. 將專案中的所有其他檔案複製到容器中
+COPY . .
+# 這行是可選的，它設定了當容器直接執行時的預設命令
+# CMD ["python", "your_script.py"]

analyzer/ASR_en_us.py CHANGED Viewed

@@ -1,239 +1,239 @@
-import torch
-import soundfile as sf
-import librosa
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-import os
-from phonemizer import phonemize
-import numpy as np
-from datetime import datetime, timezone
-# --- 1. 全域設定與模型載入函數 (保持不變) ---
-MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
-MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
-processor = None
-model = None
-def load_model():
-    """
-    在應用程式啟動時載入模型和處理器。
-    如果模型已載入，則跳過。
-    """
-    global processor, model
-    if processor and model:
-        print("英文模型已載入，跳過。")
-        return True
-    print(f"正在準備英文 (en-us) ASR 模型 '{MODEL_NAME}'...")
-    try:
-        if not os.path.exists(MODEL_SAVE_PATH):
-            print(f"本地找不到模型，正在從 Hugging Face 下載並儲存...")
-            processor_to_save = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-            model_to_save = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-            processor_to_save.save_pretrained(MODEL_SAVE_PATH)
-            model_to_save.save_pretrained(MODEL_SAVE_PATH)
-            print("模型已成功下載並儲存。")
-        else:
-            print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")
-        processor = Wav2Vec2Processor.from_pretrained(MODEL_SAVE_PATH)
-        model = Wav2Vec2ForCTC.from_pretrained(MODEL_SAVE_PATH)
-        print("英文 (en-us) 模型和處理器載入成功！")
-        return True
-    except Exception as e:
-        print(f"處理或載入 en-us 模型時發生錯誤: {e}")
-        raise RuntimeError(f"Failed to load en-us model: {e}")
-# --- 2. 智能 IPA 切分函數 (已更新) ---
-# 移除了包含 'ː' 的組合，因為我們將在源頭移除它
-MULTI_CHAR_PHONEMES = {
-    'tʃ', 'dʒ', # 輔音 (Affricates)
-    'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
-    'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
-}
-def _tokenize_ipa(ipa_string: str) -> list:
-    """
-    將 IPA 字串智能地切分為音素列表，能正確處理多字元音素。
-    """
-    phonemes = []
-    i = 0
-    s = ipa_string.replace(' ', '')
-    while i < len(s):
-        if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
-            phonemes.append(s[i:i+2])
-            i += 2
-        else:
-            phonemes.append(s[i])
-            i += 1
-    return phonemes
-# --- 3. 核心分析函數 (主入口) (已修改) ---
-def analyze(audio_file_path: str, target_sentence: str) -> dict:
-    """
-    接收音訊檔案路徑和目標句子，回傳詳細的發音分析字典。
-    這是此模組的主要進入點。
-    """
-    if not processor or not model:
-        raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
-    target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
-    # 【【【【【 關 鍵 修 改 在 這 裡 】】】】】
-    # 在切分前，移除所有重音和長音符號，以匹配 ASR 的輸出特性
-    target_ipa_by_word = [
-        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
-        for word in target_ipa_by_word_str
-    ]
-    target_words_original = target_sentence.split()
-    try:
-        speech, sample_rate = sf.read(audio_file_path)
-        if sample_rate != 16000:
-            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
-    except Exception as e:
-        raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
-    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
-    with torch.no_grad():
-        logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    user_ipa_full = processor.decode(predicted_ids[0])
-    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
-    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
-# --- 4. 對齊函數 (與上一版相同) ---
-def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
-    """
-    (已修改) 使用新的切分邏輯執行音素對齊。
-    """
-    user_phonemes = _tokenize_ipa(user_phoneme_str)
-    target_phonemes_flat = []
-    word_boundaries_indices = []
-    current_idx = 0
-    for word_ipa_tokens in target_words_ipa_tokenized:
-        target_phonemes_flat.extend(word_ipa_tokens)
-        current_idx += len(word_ipa_tokens)
-        word_boundaries_indices.append(current_idx - 1)
-    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
-    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
-    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
-    for i in range(1, len(user_phonemes) + 1):
-        for j in range(1, len(target_phonemes_flat) + 1):
-            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
-            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
-    i, j = len(user_phonemes), len(target_phonemes_flat)
-    user_path, target_path = [], []
-    while i > 0 or j > 0:
-        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
-        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
-            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
-        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
-            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
-        else:
-            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
-    alignments_by_word = []
-    word_start_idx_in_path = 0
-    target_phoneme_counter_in_path = 0
-    for path_idx, p in enumerate(target_path):
-        if p != '-':
-            if target_phoneme_counter_in_path in word_boundaries_indices:
-                target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
-                user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
-                alignments_by_word.append({
-                    "target": target_alignment,
-                    "user": user_alignment
-                })
-                word_start_idx_in_path = path_idx + 1
-            target_phoneme_counter_in_path += 1
-    return alignments_by_word
-# --- 5. 格式化函數 (與上一版相同) ---
-def _format_to_json_structure(alignments, sentence, original_words) -> dict:
-    total_phonemes = 0
-    total_errors = 0
-    correct_words_count = 0
-    words_data = []
-    num_words_to_process = min(len(alignments), len(original_words))
-    for i in range(num_words_to_process):
-        alignment = alignments[i]
-        word_is_correct = True
-        phonemes_data = []
-        for j in range(len(alignment['target'])):
-            target_phoneme = alignment['target'][j]
-            user_phoneme = alignment['user'][j]
-            is_match = (user_phoneme == target_phoneme)
-            phonemes_data.append({
-                "target": target_phoneme,
-                "user": user_phoneme,
-                "isMatch": is_match
-            })
-            if not is_match:
-                word_is_correct = False
-                if not (user_phoneme == '-' and target_phoneme == '-'):
-                    total_errors += 1
-        if word_is_correct:
-            correct_words_count += 1
-        words_data.append({
-            "word": original_words[i],
-            "isCorrect": word_is_correct,
-            "phonemes": phonemes_data
-        })
-        total_phonemes += sum(1 for p in alignment['target'] if p != '-')
-    total_words = len(original_words)
-    if len(alignments) < total_words:
-        for i in range(len(alignments), total_words):
-            # 確保這裡也移除 'ː'
-            missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
-            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
-            phonemes_data = []
-            for p_ipa in missed_word_ipa:
-                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
-                total_errors += 1
-                total_phonemes += 1
-            words_data.append({
-                "word": original_words[i],
-                "isCorrect": False,
-                "phonemes": phonemes_data
-            })
-    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
-    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
-    final_result = {
-        "sentence": sentence,
-        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
-        "summary": {
-            "overallScore": round(overall_score, 1),
-            "totalWords": total_words,
-            "correctWords": correct_words_count,
-            "phonemeErrorRate": round(phoneme_error_rate, 2),
-            "total_errors": total_errors,
-            "total_target_phonemes": total_phonemes
-        },
-        "words": words_data
-    }
-    return final_result

+import torch
+import soundfile as sf
+import librosa
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import os
+from phonemizer import phonemize
+import numpy as np
+from datetime import datetime, timezone
+# --- 1. 全域設定與模型載入函數 (保持不變) ---
+MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
+MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
+processor = None
+model = None
+def load_model():
+    """
+    在應用程式啟動時載入模型和處理器。
+    如果模型已載入，則跳過。
+    """
+    global processor, model
+    if processor and model:
+        print("英文模型已載入，跳過。")
+        return True
+    print(f"正在準備英文 (en-us) ASR 模型 '{MODEL_NAME}'...")
+    try:
+        if not os.path.exists(MODEL_SAVE_PATH):
+            print(f"本地找不到模型，正在從 Hugging Face 下載並儲存...")
+            processor_to_save = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+            model_to_save = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+            processor_to_save.save_pretrained(MODEL_SAVE_PATH)
+            model_to_save.save_pretrained(MODEL_SAVE_PATH)
+            print("模型已成功下載並儲存。")
+        else:
+            print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")
+        processor = Wav2Vec2Processor.from_pretrained(MODEL_SAVE_PATH)
+        model = Wav2Vec2ForCTC.from_pretrained(MODEL_SAVE_PATH)
+        print("英文 (en-us) 模型和處理器載入成功！")
+        return True
+    except Exception as e:
+        print(f"處理或載入 en-us 模型時發生錯誤: {e}")
+        raise RuntimeError(f"Failed to load en-us model: {e}")
+# --- 2. 智能 IPA 切分函數 (已更新) ---
+# 移除了包含 'ː' 的組合，因為我們將在源頭移除它
+MULTI_CHAR_PHONEMES = {
+    'tʃ', 'dʒ', # 輔音 (Affricates)
+    'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
+    'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
+}
+def _tokenize_ipa(ipa_string: str) -> list:
+    """
+    將 IPA 字串智能地切分為音素列表，能正確處理多字元音素。
+    """
+    phonemes = []
+    i = 0
+    s = ipa_string.replace(' ', '')
+    while i < len(s):
+        if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
+            phonemes.append(s[i:i+2])
+            i += 2
+        else:
+            phonemes.append(s[i])
+            i += 1
+    return phonemes
+# --- 3. 核心分析函數 (主入口) (已修改) ---
+def analyze(audio_file_path: str, target_sentence: str) -> dict:
+    """
+    接收音訊檔案路徑和目標句子，回傳詳細的發音分析字典。
+    這是此模組的主要進入點。
+    """
+    if not processor or not model:
+        raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
+    target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
+    # 【【【【【 關 鍵 修 改 在 這 裡 】】】】】
+    # 在切分前，移除所有重音和長音符號，以匹配 ASR 的輸出特性
+    target_ipa_by_word = [
+        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
+        for word in target_ipa_by_word_str
+    ]
+    target_words_original = target_sentence.split()
+    try:
+        speech, sample_rate = sf.read(audio_file_path)
+        if sample_rate != 16000:
+            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
+    except Exception as e:
+        raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
+    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
+    with torch.no_grad():
+        logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    user_ipa_full = processor.decode(predicted_ids[0])
+    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
+    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
+# --- 4. 對齊函數 (與上一版相同) ---
+def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
+    """
+    (已修改) 使用新的切分邏輯執行音素對齊。
+    """
+    user_phonemes = _tokenize_ipa(user_phoneme_str)
+    target_phonemes_flat = []
+    word_boundaries_indices = []
+    current_idx = 0
+    for word_ipa_tokens in target_words_ipa_tokenized:
+        target_phonemes_flat.extend(word_ipa_tokens)
+        current_idx += len(word_ipa_tokens)
+        word_boundaries_indices.append(current_idx - 1)
+    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
+    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
+    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
+    for i in range(1, len(user_phonemes) + 1):
+        for j in range(1, len(target_phonemes_flat) + 1):
+            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
+            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
+    i, j = len(user_phonemes), len(target_phonemes_flat)
+    user_path, target_path = [], []
+    while i > 0 or j > 0:
+        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
+        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
+            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
+        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
+            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
+        else:
+            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
+    alignments_by_word = []
+    word_start_idx_in_path = 0
+    target_phoneme_counter_in_path = 0
+    for path_idx, p in enumerate(target_path):
+        if p != '-':
+            if target_phoneme_counter_in_path in word_boundaries_indices:
+                target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
+                user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
+                alignments_by_word.append({
+                    "target": target_alignment,
+                    "user": user_alignment
+                })
+                word_start_idx_in_path = path_idx + 1
+            target_phoneme_counter_in_path += 1
+    return alignments_by_word
+# --- 5. 格式化函數 (與上一版相同) ---
+def _format_to_json_structure(alignments, sentence, original_words) -> dict:
+    total_phonemes = 0
+    total_errors = 0
+    correct_words_count = 0
+    words_data = []
+    num_words_to_process = min(len(alignments), len(original_words))
+    for i in range(num_words_to_process):
+        alignment = alignments[i]
+        word_is_correct = True
+        phonemes_data = []
+        for j in range(len(alignment['target'])):
+            target_phoneme = alignment['target'][j]
+            user_phoneme = alignment['user'][j]
+            is_match = (user_phoneme == target_phoneme)
+            phonemes_data.append({
+                "target": target_phoneme,
+                "user": user_phoneme,
+                "isMatch": is_match
+            })
+            if not is_match:
+                word_is_correct = False
+                if not (user_phoneme == '-' and target_phoneme == '-'):
+                    total_errors += 1
+        if word_is_correct:
+            correct_words_count += 1
+        words_data.append({
+            "word": original_words[i],
+            "isCorrect": word_is_correct,
+            "phonemes": phonemes_data
+        })
+        total_phonemes += sum(1 for p in alignment['target'] if p != '-')
+    total_words = len(original_words)
+    if len(alignments) < total_words:
+        for i in range(len(alignments), total_words):
+            # 確保這裡也移除 'ː'
+            missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
+            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
+            phonemes_data = []
+            for p_ipa in missed_word_ipa:
+                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
+                total_errors += 1
+                total_phonemes += 1
+            words_data.append({
+                "word": original_words[i],
+                "isCorrect": False,
+                "phonemes": phonemes_data
+            })
+    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
+    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
+    final_result = {
+        "sentence": sentence,
+        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
+        "summary": {
+            "overallScore": round(overall_score, 1),
+            "totalWords": total_words,
+            "correctWords": correct_words_count,
+            "phonemeErrorRate": round(phoneme_error_rate, 2),
+            "total_errors": total_errors,
+            "total_target_phonemes": total_phonemes
+        },
+        "words": words_data
+    }
+    return final_result

cmudict_ipa.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -1,10 +1,10 @@
-fastapi
-uvicorn[standard]
-pyngrok
-python-multipart
-torch
-soundfile
-librosa
-transformers
-phonemizer[espeak]
 numpy

+fastapi
+uvicorn[standard]
+pyngrok
+python-multipart
+torch
+soundfile
+librosa
+transformers
+phonemizer[espeak]
 numpy