Spaces:
Running
Running
Delete logic.py
Browse files
logic.py
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
from phonemizer.separator import Separator
|
2 |
-
from phonemizer import phonemize
|
3 |
-
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
-
from Levenshtein import distance as levenshtein_distance
|
5 |
-
from scoring import calculate_fluency_and_pronunciation
|
6 |
-
|
7 |
-
import whisper
|
8 |
-
import torch
|
9 |
-
|
10 |
-
# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# The "base.en" Whisper checkpoint is loaded once, at import time, and shared
# by every call to transcribe().
model = whisper.load_model("base.en", device=device)

# Separator handed to phonemize() by text2phoneme(): no phone separator and an
# empty string between words.
separator = Separator(word='', phone=None)
|
14 |
-
|
15 |
-
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
16 |
-
|
17 |
-
def transcribe(audio):
    """Transcribe ``audio`` with the module-level Whisper ``model``.

    Args:
        audio: Forwarded unchanged to ``model.transcribe``. The callers in
            this file pass a path to an audio file.

    Returns:
        A dict with exactly two keys, ``'language'`` and ``'text'``, copied
        from the Whisper result.
    """
    # Fixed decoding options used for every transcription in this module.
    decode_options = {
        'word_timestamps': False,
        'no_speech_threshold': 0.4,
        'compression_ratio_threshold': 2,
        'temperature': 0,
    }
    output = model.transcribe(audio, **decode_options)
    # Only these two fields are exposed to callers.
    return {key: output[key] for key in ('language', 'text')}
|
20 |
-
|
21 |
-
def text2phoneme(text):
    """Convert ``text`` to phonemes, one whitespace-separated word at a time.

    The text is lower-cased and split on whitespace before being handed to
    ``phonemize`` with the espeak backend (``en-us``), the module-level
    ``separator``, stripping enabled, and stress marks and ties disabled.

    Args:
        text: The string to phonemize.

    Returns:
        Whatever ``phonemize`` returns for a list input -- presumably one
        phoneme string per word (confirm against the phonemizer docs).
    """
    words = text.lower().split()
    return phonemize(
        words,
        backend='espeak',
        separator=separator,
        strip=True,
        with_stress=False,
        tie=False,
        language='en-us',
    )
|
23 |
-
|
24 |
-
def rate_pronunciation(expected_phonemes, actual_phonemes):
    """Score every entry of ``actual_phonemes`` against nearby ``expected_phonemes``.

    For the entry at index ``i`` the candidates are
    ``expected_phonemes[i - 1 : i + 3]`` (clipped to valid indices), which
    tolerates the two sequences drifting out of alignment by a word or two.
    The smallest Levenshtein distance to any candidate decides the score:

    * ``3`` -- a candidate matches exactly (distance 0);
    * ``2`` -- the best distance is at most 40% of the entry's length;
    * ``1`` -- anything worse, including the case with no candidate at all.

    NOTE(review): ``Speaker_speech_analysis`` passes the *spoken*
    transcription as ``expected_phonemes`` and the *reference* text as
    ``actual_phonemes``, so the parameter names look swapped relative to that
    caller. The names are kept as they are because they are part of the
    interface.

    Args:
        expected_phonemes: Sequence of phoneme strings to search for matches.
        actual_phonemes: Sequence of phoneme strings to score; the result has
            one score per entry, in the same order.

    Returns:
        A list of ints (each 1, 2 or 3), the same length as
        ``actual_phonemes``.
    """
    results = []
    expected_count = len(expected_phonemes)

    for i, base_word in enumerate(actual_phonemes):
        best_dist = float('inf')

        # Clamping the upper bound to expected_count makes the range empty
        # once i runs more than one past the end, so no separate bounds check
        # is needed and best_dist simply stays infinite (score 1).
        for j in range(max(0, i - 1), min(i + 3, expected_count)):
            dist = levenshtein_distance(expected_phonemes[j], base_word)
            if dist < best_dist:
                best_dist = dist
                if best_dist == 0:
                    # A perfect match cannot be improved on.
                    break

        # Tolerated edit distance scales with the length of the scored entry.
        error_threshold = len(base_word) * 0.40
        if best_dist == 0:
            results.append(3)
        elif best_dist <= error_threshold:
            results.append(2)
        else:
            results.append(1)

    return results
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
def Speaker_speech_analysis(audio_path, text):
    """Analyse a recording of someone reading ``text`` aloud.

    The recording is transcribed, both the transcription and ``text`` are
    converted to phonemes, the phoneme sequences are rated with
    ``rate_pronunciation``, and the scores are passed on to
    ``calculate_fluency_and_pronunciation``. The transcription is also
    printed to stdout.

    Args:
        audio_path: Audio input, forwarded to ``transcribe`` and to
            ``calculate_fluency_and_pronunciation``.
        text: The reference sentence the speaker was supposed to say.

    Returns:
        The object returned by ``calculate_fluency_and_pronunciation`` with an
        extra ``'word_scores'`` entry: a list of ``(word, score)`` tuples that
        pairs the whitespace-split words of ``text`` with their scores.
    """
    spoken_text = transcribe(audio_path)['text']
    print(spoken_text)

    spoken_phonemes = text2phoneme(spoken_text)
    reference_phonemes = text2phoneme(text)
    scores = rate_pronunciation(spoken_phonemes, reference_phonemes)

    spoken_word_count = len(spoken_text.split())
    reference_words = text.split()
    report = calculate_fluency_and_pronunciation(
        audio_path, spoken_word_count, scores, len(reference_words)
    )

    # zip() stops at the shorter sequence, so surplus words or scores are
    # dropped here.
    report['word_scores'] = list(zip(reference_words, scores))
    return report
|
61 |
-
|
62 |
-
if __name__ == '__main__':
    # Manual smoke check: phonemize a fixed sentence, transcribe a local
    # recording, and print the per-word ratings.
    reference_phonemes = text2phoneme('i have ADHD ')

    recording = r'user_recording.wav'
    heard = transcribe(recording)['text']
    print(heard)

    heard_phonemes = text2phoneme(heard)
    print('base:', reference_phonemes)
    print('predicted:', heard_phonemes)

    print(rate_pronunciation(heard_phonemes, reference_phonemes))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|