HK0712 committed on
Commit
13d62bd
·
1 Parent(s): 05d5403

download git (docker)

Browse files
Files changed (5) hide show
  1. .gitignore +21 -21
  2. Dockerfile +26 -25
  3. analyzer/ASR_en_us.py +239 -239
  4. cmudict_ipa.json +0 -0
  5. requirements.txt +9 -9
.gitignore CHANGED
@@ -1,21 +1,21 @@
1
- # Python
2
- __pycache__/
3
- *.pyc
4
- *.pyo
5
- *.pyd
6
- .env
7
- venv/
8
- env/
9
-
10
- # IDE / Editor
11
- .vscode/
12
- .idea/
13
-
14
- # ASR Models (非常重要,模型檔案通常很大)
15
- ASRs/
16
-
17
- # Temporary files
18
- temp_audio/
19
-
20
- # macOS
21
- .DS_Store
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .env
7
+ venv/
8
+ env/
9
+
10
+ # IDE / Editor
11
+ .vscode/
12
+ .idea/
13
+
14
+ # ASR Models (非常重要,模型檔案通常很大)
15
+ ASRs/
16
+
17
+ # Temporary files
18
+ temp_audio/
19
+
20
+ # macOS
21
+ .DS_Store
Dockerfile CHANGED
@@ -1,25 +1,26 @@
1
- # 1. 選擇一個包含 Python 的官方 Linux 映像
2
- FROM python:3.10-slim
3
-
4
- # 2. 設定容器內的工作目錄
5
- WORKDIR /app
6
-
7
- # 3. 安裝系統級依賴 (最關鍵的一步:安裝 espeak-ng 和其他工具)
8
- # -y 自動回答 'yes'
9
- # --no-install-recommends 避免安裝不必要的建議套件,保持映像檔小巧
10
- RUN apt-get update && apt-get install -y --no-install-recommends \
11
- espeak-ng \
12
- libsndfile1 \
13
- ffmpeg \
14
- wget && \
15
- rm -rf /var/lib/apt/lists/*
16
-
17
- # 4. 複製 requirements.txt 檔案到容器中並安裝 Python 套件
18
- COPY requirements.txt .
19
- RUN pip install --no-cache-dir -r requirements.txt
20
-
21
- # 5. 將專案中的所有其他檔案複製到容器中
22
- COPY . .
23
-
24
- # 這行是可選的,它設定了當容器直接執行時的預設命令
25
- # CMD ["python", "your_script.py"]
 
 
1
+ # 1. 選擇一個包含 Python 的官方 Linux 映像
2
+ FROM python:3.10-slim
3
+
4
+ # 2. 設定容器內的工作目錄
5
+ WORKDIR /app
6
+
7
+ # 3. 安裝系統級依賴 (最關鍵的一步:安裝 espeak-ng、git 和其他工具)
8
+ # -y 自動回答 'yes'
9
+ # --no-install-recommends 避免安裝不必要的建議套件,保持映像檔小巧
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ espeak-ng \
12
+ libsndfile1 \
13
+ ffmpeg \
14
+ wget \
15
+ git && \
16
+ rm -rf /var/lib/apt/lists/*
17
+
18
+ # 4. 複製 requirements.txt 檔案到容器中並安裝 Python 套件
19
+ COPY requirements.txt .
20
+ RUN pip install --no-cache-dir -r requirements.txt
21
+
22
+ # 5. 將專案中的所有其他檔案複製到容器中
23
+ COPY . .
24
+
25
+ # 這行是可選的,它設定了當容器直接執行時的預設命令
26
+ # CMD ["python", "your_script.py"]
analyzer/ASR_en_us.py CHANGED
@@ -1,239 +1,239 @@
1
- import torch
2
- import soundfile as sf
3
- import librosa
4
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
- import os
6
- from phonemizer import phonemize
7
- import numpy as np
8
- from datetime import datetime, timezone
9
-
10
- # --- 1. 全域設定與模型載入函數 (保持不變) ---
11
- MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
12
- MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
13
-
14
- processor = None
15
- model = None
16
-
17
- def load_model():
18
- """
19
- 在應用程式啟動時載入模型和處理器。
20
- 如果模型已載入,則跳過。
21
- """
22
- global processor, model
23
- if processor and model:
24
- print("英文模型已載入,跳過。")
25
- return True
26
-
27
- print(f"正在準備英文 (en-us) ASR 模型 '{MODEL_NAME}'...")
28
- try:
29
- if not os.path.exists(MODEL_SAVE_PATH):
30
- print(f"本地找不到模型,正在從 Hugging Face 下載並儲存...")
31
- processor_to_save = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
32
- model_to_save = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
33
- processor_to_save.save_pretrained(MODEL_SAVE_PATH)
34
- model_to_save.save_pretrained(MODEL_SAVE_PATH)
35
- print("模型已成功下載並儲存。")
36
- else:
37
- print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")
38
-
39
- processor = Wav2Vec2Processor.from_pretrained(MODEL_SAVE_PATH)
40
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_SAVE_PATH)
41
- print("英文 (en-us) 模型和處理器載入成功!")
42
- return True
43
- except Exception as e:
44
- print(f"處理或載入 en-us 模型時發生錯誤: {e}")
45
- raise RuntimeError(f"Failed to load en-us model: {e}")
46
-
47
- # --- 2. 智能 IPA 切分函數 (已更新) ---
48
- # 移除了包含 'ː' 的組合,因為我們將在源頭移除它
49
- MULTI_CHAR_PHONEMES = {
50
- 'tʃ', 'dʒ', # 輔音 (Affricates)
51
- 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
52
- 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
53
- }
54
-
55
- def _tokenize_ipa(ipa_string: str) -> list:
56
- """
57
- 將 IPA 字串智能地切分為音素列表,能正確處理多字元音素。
58
- """
59
- phonemes = []
60
- i = 0
61
- s = ipa_string.replace(' ', '')
62
- while i < len(s):
63
- if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
64
- phonemes.append(s[i:i+2])
65
- i += 2
66
- else:
67
- phonemes.append(s[i])
68
- i += 1
69
- return phonemes
70
-
71
- # --- 3. 核心分析函數 (主入口) (已修改) ---
72
- def analyze(audio_file_path: str, target_sentence: str) -> dict:
73
- """
74
- 接收音訊檔案路徑和目標句子,回傳詳細的發音分析字典。
75
- 這是此模組的主要進入點。
76
- """
77
- if not processor or not model:
78
- raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
79
-
80
- target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()
81
-
82
- # 【【【【【 關 鍵 修 改 在 這 裡 】】】】】
83
- # 在切分前,移除所有重音和長音符號,以匹配 ASR 的輸出特性
84
- target_ipa_by_word = [
85
- _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
86
- for word in target_ipa_by_word_str
87
- ]
88
- target_words_original = target_sentence.split()
89
-
90
- try:
91
- speech, sample_rate = sf.read(audio_file_path)
92
- if sample_rate != 16000:
93
- speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
94
- except Exception as e:
95
- raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
96
-
97
- input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
98
- with torch.no_grad():
99
- logits = model(input_values).logits
100
- predicted_ids = torch.argmax(logits, dim=-1)
101
- user_ipa_full = processor.decode(predicted_ids[0])
102
-
103
- word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
104
-
105
- return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
106
-
107
-
108
- # --- 4. 對齊函數 (與上一版相同) ---
109
- def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
110
- """
111
- (已修改) 使用新的切分邏輯執行音素對齊。
112
- """
113
- user_phonemes = _tokenize_ipa(user_phoneme_str)
114
-
115
- target_phonemes_flat = []
116
- word_boundaries_indices = []
117
- current_idx = 0
118
- for word_ipa_tokens in target_words_ipa_tokenized:
119
- target_phonemes_flat.extend(word_ipa_tokens)
120
- current_idx += len(word_ipa_tokens)
121
- word_boundaries_indices.append(current_idx - 1)
122
-
123
- dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
124
- for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
125
- for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
126
- for i in range(1, len(user_phonemes) + 1):
127
- for j in range(1, len(target_phonemes_flat) + 1):
128
- cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
129
- dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
130
-
131
- i, j = len(user_phonemes), len(target_phonemes_flat)
132
- user_path, target_path = [], []
133
- while i > 0 or j > 0:
134
- cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
135
- if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
136
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
137
- elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
138
- user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
139
- else:
140
- user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
141
-
142
- alignments_by_word = []
143
- word_start_idx_in_path = 0
144
- target_phoneme_counter_in_path = 0
145
-
146
- for path_idx, p in enumerate(target_path):
147
- if p != '-':
148
- if target_phoneme_counter_in_path in word_boundaries_indices:
149
- target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
150
- user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
151
-
152
- alignments_by_word.append({
153
- "target": target_alignment,
154
- "user": user_alignment
155
- })
156
-
157
- word_start_idx_in_path = path_idx + 1
158
-
159
- target_phoneme_counter_in_path += 1
160
-
161
- return alignments_by_word
162
-
163
- # --- 5. 格式化函數 (與上一版相同) ---
164
- def _format_to_json_structure(alignments, sentence, original_words) -> dict:
165
- total_phonemes = 0
166
- total_errors = 0
167
- correct_words_count = 0
168
- words_data = []
169
-
170
- num_words_to_process = min(len(alignments), len(original_words))
171
-
172
- for i in range(num_words_to_process):
173
- alignment = alignments[i]
174
- word_is_correct = True
175
- phonemes_data = []
176
-
177
- for j in range(len(alignment['target'])):
178
- target_phoneme = alignment['target'][j]
179
- user_phoneme = alignment['user'][j]
180
- is_match = (user_phoneme == target_phoneme)
181
-
182
- phonemes_data.append({
183
- "target": target_phoneme,
184
- "user": user_phoneme,
185
- "isMatch": is_match
186
- })
187
-
188
- if not is_match:
189
- word_is_correct = False
190
- if not (user_phoneme == '-' and target_phoneme == '-'):
191
- total_errors += 1
192
-
193
- if word_is_correct:
194
- correct_words_count += 1
195
-
196
- words_data.append({
197
- "word": original_words[i],
198
- "isCorrect": word_is_correct,
199
- "phonemes": phonemes_data
200
- })
201
-
202
- total_phonemes += sum(1 for p in alignment['target'] if p != '-')
203
-
204
- total_words = len(original_words)
205
- if len(alignments) < total_words:
206
- for i in range(len(alignments), total_words):
207
- # 確保這裡也移除 'ː'
208
- missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
209
- missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
210
- phonemes_data = []
211
- for p_ipa in missed_word_ipa:
212
- phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
213
- total_errors += 1
214
- total_phonemes += 1
215
-
216
- words_data.append({
217
- "word": original_words[i],
218
- "isCorrect": False,
219
- "phonemes": phonemes_data
220
- })
221
-
222
- overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
223
- phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
224
-
225
- final_result = {
226
- "sentence": sentence,
227
- "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
228
- "summary": {
229
- "overallScore": round(overall_score, 1),
230
- "totalWords": total_words,
231
- "correctWords": correct_words_count,
232
- "phonemeErrorRate": round(phoneme_error_rate, 2),
233
- "total_errors": total_errors,
234
- "total_target_phonemes": total_phonemes
235
- },
236
- "words": words_data
237
- }
238
-
239
- return final_result
 
1
+ import torch
2
+ import soundfile as sf
3
+ import librosa
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ import os
6
+ from phonemizer import phonemize
7
+ import numpy as np
8
+ from datetime import datetime, timezone
9
+
10
+ # --- 1. 全域設定與模型載入函數 (保持不變) ---
11
+ MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
12
+ MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
13
+
14
+ processor = None
15
+ model = None
16
+
17
def load_model():
    """Load the English (en-us) ASR model and processor at application startup.

    On first use, downloads the model from Hugging Face and caches it under
    MODEL_SAVE_PATH; subsequent calls load from the local copy. Idempotent:
    if the module-level globals are already populated, this is a no-op.

    Returns:
        bool: True once the processor and model are loaded.

    Raises:
        RuntimeError: if downloading or loading the model fails.
    """
    global processor, model
    if processor and model:
        print("英文模型已載入,跳過。")
        return True

    print(f"正在準備英文 (en-us) ASR 模型 '{MODEL_NAME}'...")
    try:
        if not os.path.exists(MODEL_SAVE_PATH):
            # First run: fetch from the Hub and persist locally so later
            # starts (e.g. inside the container) need no network access.
            print("本地找不到模型,正在從 Hugging Face 下載並儲存...")
            processor_to_save = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
            model_to_save = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
            processor_to_save.save_pretrained(MODEL_SAVE_PATH)
            model_to_save.save_pretrained(MODEL_SAVE_PATH)
            print("模型已成功下載並儲存。")
        else:
            print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")

        processor = Wav2Vec2Processor.from_pretrained(MODEL_SAVE_PATH)
        model = Wav2Vec2ForCTC.from_pretrained(MODEL_SAVE_PATH)
        model.eval()  # inference only; make the no-dropout mode explicit
        print("英文 (en-us) 模型和處理器載入成功!")
        return True
    except Exception as e:
        print(f"處理或載入 en-us 模型時發生錯誤: {e}")
        # Chain the original exception so the real cause is preserved.
        raise RuntimeError(f"Failed to load en-us model: {e}") from e
46
+
47
+ # --- 2. 智能 IPA 切分函數 (已更新) ---
48
+ # 移除了包含 'ː' 的組合,因為我們將在源頭移除它
49
+ MULTI_CHAR_PHONEMES = {
50
+ 'tʃ', 'dʒ', # 輔音 (Affricates)
51
+ 'eɪ', 'aɪ', 'oʊ', 'aʊ', 'ɔɪ', # 雙元音 (Diphthongs)
52
+ 'ɪə', 'eə', 'ʊə', 'ər' # R-controlled 和其他組合
53
+ }
54
+
55
+ def _tokenize_ipa(ipa_string: str) -> list:
56
+ """
57
+ 將 IPA 字串智能地切分為音素列表,能正確處理多字元音素。
58
+ """
59
+ phonemes = []
60
+ i = 0
61
+ s = ipa_string.replace(' ', '')
62
+ while i < len(s):
63
+ if i + 1 < len(s) and s[i:i+2] in MULTI_CHAR_PHONEMES:
64
+ phonemes.append(s[i:i+2])
65
+ i += 2
66
+ else:
67
+ phonemes.append(s[i])
68
+ i += 1
69
+ return phonemes
70
+
71
+ # --- 3. 核心分析函數 (主入口) (已修改) ---
72
def analyze(audio_file_path: str, target_sentence: str) -> dict:
    """Analyze pronunciation of an English utterance against a target sentence.

    Main entry point of this module. Transcribes the audio to IPA with the
    wav2vec2 model, aligns it phoneme-by-phoneme against the espeak
    phonemization of the target sentence, and returns a detailed report.

    Args:
        audio_file_path: path to an audio file readable by soundfile.
        target_sentence: the sentence the speaker attempted to read.

    Returns:
        dict produced by _format_to_json_structure (summary + per-word data).

    Raises:
        RuntimeError: if load_model() has not been run successfully.
        IOError: if the audio file cannot be read or processed.
    """
    if not processor or not model:
        raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")

    target_ipa_by_word_str = phonemize(target_sentence, language='en-us', backend='espeak', with_stress=True, strip=True).split()

    # Strip stress and length marks before tokenizing so the reference
    # matches the ASR output, which emits neither.
    target_ipa_by_word = [
        _tokenize_ipa(word.replace('ˌ', '').replace('ˈ', '').replace('ː', ''))
        for word in target_ipa_by_word_str
    ]
    target_words_original = target_sentence.split()

    try:
        speech, sample_rate = sf.read(audio_file_path)
        # Fix: downmix multi-channel audio to mono. sf.read returns a
        # (frames, channels) 2-D array for stereo files, which the
        # resampler and the 1-D feature extractor do not expect.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)
        if sample_rate != 16000:
            # The model was trained on 16 kHz audio; resample anything else.
            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
    except Exception as e:
        raise IOError(f"讀取或處理音訊時發生錯誤: {e}") from e

    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decode: most likely token per frame, then collapse via the
    # processor's decoder.
    predicted_ids = torch.argmax(logits, dim=-1)
    user_ipa_full = processor.decode(predicted_ids[0])

    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)

    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
106
+
107
+
108
+ # --- 4. 對齊函數 (與上一版相同) ---
109
def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
    """Align the user's recognized phonemes against the target, word by word.

    Runs a Levenshtein-style edit-distance alignment (unit cost for insert,
    delete, substitute) between the flat target phoneme sequence and the
    tokenized user phonemes, then cuts the aligned path back into per-word
    segments using the target's word boundaries.

    Args:
        user_phoneme_str: raw IPA string decoded from the ASR model.
        target_words_ipa_tokenized: list of words, each a list of target
            phoneme tokens (per-word output of _tokenize_ipa).

    Returns:
        list of dicts, one per target word, each with "target" and "user"
        keys holding equal-length aligned phoneme lists; '-' marks a gap
        (insertion or deletion) on that side.
    """
    user_phonemes = _tokenize_ipa(user_phoneme_str)

    # Flatten the per-word target phonemes and record, for each word, the
    # flat index of its last phoneme so the aligned path can be re-split.
    target_phonemes_flat = []
    word_boundaries_indices = []
    current_idx = 0
    for word_ipa_tokens in target_words_ipa_tokenized:
        target_phonemes_flat.extend(word_ipa_tokens)
        current_idx += len(word_ipa_tokens)
        word_boundaries_indices.append(current_idx - 1)

    # Classic edit-distance DP table: dp[i][j] is the cost of aligning the
    # first i user phonemes with the first j target phonemes.
    dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
    for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
    for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
    for i in range(1, len(user_phonemes) + 1):
        for j in range(1, len(target_phonemes_flat) + 1):
            cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)

    # Backtrace from the bottom-right corner, emitting '-' on whichever side
    # has a gap; prefers match/substitute, then deletion of a user phoneme.
    i, j = len(user_phonemes), len(target_phonemes_flat)
    user_path, target_path = [], []
    while i > 0 or j > 0:
        cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
        if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
        elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
            user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
        else:
            user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1

    # Walk the aligned path and close out a word segment each time the count
    # of consumed target phonemes reaches a word-boundary index.
    # NOTE(review): user phonemes inserted *after* the last target phoneme
    # never appear in any segment — confirm this truncation is intended.
    alignments_by_word = []
    word_start_idx_in_path = 0
    target_phoneme_counter_in_path = 0

    for path_idx, p in enumerate(target_path):
        if p != '-':
            if target_phoneme_counter_in_path in word_boundaries_indices:
                target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
                user_alignment = user_path[word_start_idx_in_path : path_idx + 1]

                alignments_by_word.append({
                    "target": target_alignment,
                    "user": user_alignment
                })

                word_start_idx_in_path = path_idx + 1

            target_phoneme_counter_in_path += 1

    return alignments_by_word
162
+
163
+ # --- 5. 格式化函數 (與上一版相同) ---
164
def _format_to_json_structure(alignments, sentence, original_words) -> dict:
    """Convert per-word phoneme alignments into the final report dict.

    Scores each word (correct only if every aligned phoneme matches),
    accumulates phoneme-level error counts, and appends all-missed entries
    for trailing words that received no alignment at all.

    Args:
        alignments: per-word alignment dicts from
            _get_phoneme_alignments_by_word.
        sentence: the original target sentence (echoed into the result).
        original_words: the target sentence split into words.

    Returns:
        dict with the sentence, a UTC timestamp, summary statistics, and
        per-word phoneme detail.
    """
    total_phonemes = 0        # non-gap target phonemes scored (PER denominator)
    total_errors = 0          # phoneme-level mismatches (subs/ins/dels)
    correct_words_count = 0
    words_data = []

    # Guard against a length mismatch between alignments and the word list.
    num_words_to_process = min(len(alignments), len(original_words))

    for i in range(num_words_to_process):
        alignment = alignments[i]
        word_is_correct = True
        phonemes_data = []

        for j in range(len(alignment['target'])):
            target_phoneme = alignment['target'][j]
            user_phoneme = alignment['user'][j]
            is_match = (user_phoneme == target_phoneme)

            phonemes_data.append({
                "target": target_phoneme,
                "user": user_phoneme,
                "isMatch": is_match
            })

            if not is_match:
                word_is_correct = False
                # A '-' on both sides would be a pure gap; don't count it
                # as an error.
                if not (user_phoneme == '-' and target_phoneme == '-'):
                    total_errors += 1

        if word_is_correct:
            correct_words_count += 1

        words_data.append({
            "word": original_words[i],
            "isCorrect": word_is_correct,
            "phonemes": phonemes_data
        })

        # Only real target phonemes (not insertion gaps) count toward the
        # error-rate denominator.
        total_phonemes += sum(1 for p in alignment['target'] if p != '-')

    total_words = len(original_words)
    if len(alignments) < total_words:
        # Words the user never reached: every target phoneme is an error.
        for i in range(len(alignments), total_words):
            # Make sure the length mark 'ː' is stripped here as well.
            missed_word_ipa_str = phonemize(original_words[i], language='en-us', backend='espeak', strip=True).replace('ː', '')
            missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
            phonemes_data = []
            for p_ipa in missed_word_ipa:
                phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
                total_errors += 1
                total_phonemes += 1

            words_data.append({
                "word": original_words[i],
                "isCorrect": False,
                "phonemes": phonemes_data
            })

    # Word-level accuracy and phoneme error rate, both as percentages.
    overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0

    final_result = {
        "sentence": sentence,
        "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
        "summary": {
            "overallScore": round(overall_score, 1),
            "totalWords": total_words,
            "correctWords": correct_words_count,
            "phonemeErrorRate": round(phoneme_error_rate, 2),
            "total_errors": total_errors,
            "total_target_phonemes": total_phonemes
        },
        "words": words_data
    }

    return final_result
cmudict_ipa.json CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,10 +1,10 @@
1
- fastapi
2
- uvicorn[standard]
3
- pyngrok
4
- python-multipart
5
- torch
6
- soundfile
7
- librosa
8
- transformers
9
- phonemizer[espeak]
10
  numpy
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pyngrok
4
+ python-multipart
5
+ torch
6
+ soundfile
7
+ librosa
8
+ transformers
9
+ phonemizer[espeak]
10
  numpy