CAPT-ReadAloud / logic.py
seba3y's picture
Update logic.py
32b5423
raw
history blame
2.56 kB
from phonemizer.separator import Separator
from phonemizer import phonemize, backend
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
from Levenshtein import distance as levenshtein_distance
import whisper
import torch
# text2phoneme() below uses phonemizer's 'espeak' backend, which needs the
# eSpeak NG system package. Try to install it when phonemizer cannot find it.
if not backend.EspeakBackend.is_available():
    import os

    # '-y' pre-answers apt's "Do you want to continue?" prompt. Without it the
    # install stops at the prompt whenever this module is imported with no
    # interactive terminal attached. The exit status is deliberately ignored
    # (best effort), as before.
    os.system('apt-get install -y espeak-ng')

# Run Whisper on the first CUDA device when one is available, else on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = whisper.load_model("base.en", device=device)

# Words in the phonemized output are delimited by ' | '; rate_pronunciation()
# splits on exactly this string. phone=None presumably means "no delimiter
# between the phonemes of a word" - confirm against the phonemizer docs.
separator = Separator(phone=None, word=' | ',)

# Windows alternative (also needs the commented-out EspeakWrapper import at the
# top of the file):
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
def transcribe(audio):
    """Transcribe ``audio`` with the module-level Whisper ``model``.

    Args:
        audio: Forwarded unchanged to ``model.transcribe``. The script section
            of this file passes the path of a WAV file.

    Returns:
        dict: The ``'language'`` and ``'text'`` entries of the Whisper result,
        in that order.
    """
    decode_options = {
        'word_timestamps': False,
        'no_speech_threshold': 0.4,
        'compression_ratio_threshold': 2,
        'temperature': 0,
    }
    output = model.transcribe(audio, **decode_options)
    # Only these two fields of the Whisper result are exposed to callers.
    return {key: output[key] for key in ('language', 'text')}
def text2phoneme(text):
    """Phonemize ``text`` (lower-cased first) with the espeak ``en-us`` voice.

    Args:
        text: The string to convert.

    Returns:
        Whatever ``phonemize`` returns for the lower-cased text. Callers in
        this file treat it as a single string whose words are joined by the
        module-level ``separator`` (``' | '``).
    """
    lowered = text.lower()
    return phonemize(
        lowered,
        language='en-us',
        backend='espeak',
        separator=separator,
        strip=True,
        with_stress=False,
        tie=False,
    )
def rate_pronunciation(expected_phonemes, actual_phonemes):
    """Score every word of ``actual_phonemes`` against nearby words of ``expected_phonemes``.

    Both arguments are phoneme strings whose words are separated by ``" | "``.
    For the word at position ``i`` of ``actual_phonemes`` the smallest
    Levenshtein distance to the words of ``expected_phonemes`` at positions
    ``i - 2`` through ``i + 5`` (clipped to the sequence) decides its score:

    * ``3`` - a word in the window matches exactly (distance 0);
    * ``2`` - the best distance is at most 45% of the word's length;
    * ``1`` - anything else, including an empty window.

    NOTE: the callers in this file pass the *transcription's* phonemes as
    ``expected_phonemes`` and the *reference text's* phonemes as
    ``actual_phonemes``, so one score is produced per reference-text word.

    Args:
        expected_phonemes: ``" | "``-delimited phoneme string to search in.
        actual_phonemes: ``" | "``-delimited phoneme string whose words are scored.

    Returns:
        list[int]: One score (1, 2 or 3) per word of ``actual_phonemes``.
    """
    candidates = expected_phonemes.split(" | ")
    targets = actual_phonemes.split(" | ")
    total = len(candidates)

    scores = []
    for pos, target in enumerate(targets):
        # Search window: two words back, up to five words ahead, never past
        # the end of the candidate list.
        window = candidates[max(0, pos - 2):pos + min(6, total - pos)]

        closest = float('inf')
        for candidate in window:
            closest = min(closest, levenshtein_distance(candidate, target,))
            if closest == 0:
                # A perfect match cannot be improved on; stop searching.
                break

        tolerance = len(target) * 0.45
        scores.append(3 if closest == 0 else 2 if closest <= tolerance else 1)

    return scores
def compare_audio_with_text(audio, text):
    """Rate how closely the speech in ``audio`` follows the reference ``text``.

    The audio is transcribed, the transcription is printed, and both the
    transcription and ``text`` are phonemized before being scored with
    ``rate_pronunciation``.

    Args:
        audio: Input forwarded to ``transcribe``.
        text: The reference sentence the speaker was supposed to read.

    Returns:
        list[tuple[str, int]]: ``(word, score)`` pairs, pairing the
        whitespace-separated words of ``text`` with the scores in order.
        ``zip`` stops at the shorter of the two sequences.
    """
    heard = transcribe(audio)['text']
    print(heard)

    heard_phonemes = text2phoneme(heard)
    reference_phonemes = text2phoneme(text)

    # The reference phonemes go in second, so scores come back one per
    # reference word.
    word_scores = rate_pronunciation(heard_phonemes, reference_phonemes)
    return list(zip(text.split(), word_scores))
if __name__ == '__main__':
    # Manual check: phonemize a fixed sentence, transcribe a local recording,
    # then print both phoneme strings and the per-word scores.
    reference_phonemes = text2phoneme('i have ADHD ')

    recording = r'user_recording.wav'
    heard = transcribe(recording)['text']
    print(heard)

    heard_phonemes = text2phoneme(heard)
    print('base:', reference_phonemes)
    print('predicted:', heard_phonemes)
    print(rate_pronunciation(heard_phonemes, reference_phonemes))