|
import librosa |
|
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel |
|
import soundfile as sf |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
|
|
|
|
|
|
# Pretrained Wav2Vec2-BERT 2.0 checkpoint (downloads weights on first run).
# NOTE(review): `feature_extractor` and `model` are not referenced anywhere in
# this file's visible code — presumably consumed by importers of this module;
# confirm before removing, since loading them is expensive import-time work.
model_id = "facebook/w2v-bert-2.0"

feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

model = Wav2Vec2BertModel.from_pretrained(model_id)
|
|
|
def load_and_resample_audio(file_path, target_sample_rate=16000):
    """Load an audio file as a mono signal at ``target_sample_rate``.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by ``soundfile``.
    target_sample_rate : int, optional
        Desired sample rate in Hz (default 16 kHz).

    Returns
    -------
    tuple
        ``(samples, target_sample_rate)`` where ``samples`` is a 1-D
        float array.
    """
    audio_input, sample_rate = sf.read(file_path)
    # soundfile returns shape (frames, channels) for multi-channel files.
    # Downmix to mono BEFORE resampling: librosa.resample works along the
    # last axis by default, so feeding it a (frames, channels) array would
    # "resample" across the channel axis instead of across time.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)
    if sample_rate != target_sample_rate:
        audio_input = librosa.resample(
            audio_input, orig_sr=sample_rate, target_sr=target_sample_rate
        )
    return audio_input, target_sample_rate
|
|
|
def calculate_mfcc(audio_data, sample_rate):
    """Summarize a signal as the per-coefficient time-average of 13 MFCCs.

    Returns a 1-D vector with one averaged value per MFCC coefficient.
    """
    coefficients = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
    # Collapse the time axis, leaving a single mean value per coefficient.
    return np.mean(coefficients, axis=1)
|
|
|
def calculate_similarity(mfccs1, mfccs2):
    """Cosine similarity between two 1-D feature vectors.

    Replaces sklearn's pairwise ``cosine_similarity`` (which required
    reshaping both vectors into 1xN matrices and building a 1x1 result
    matrix) with the direct numpy formula — equivalent for two vectors and
    avoids the heavyweight pairwise machinery.

    Parameters
    ----------
    mfccs1, mfccs2 : np.ndarray
        Feature vectors of equal length (flattened if not already 1-D).

    Returns
    -------
    float
        Cosine similarity in [-1, 1]; 0.0 when either vector has zero norm
        (matching sklearn's zero-norm handling).
    """
    v1 = np.ravel(mfccs1)
    v2 = np.ravel(mfccs2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0.0:
        # sklearn treats zero-norm rows as unit norm, yielding 0 similarity.
        return 0.0
    return float(np.dot(v1, v2) / denom)
|
|
|
def mfcc_similarty_check(original: str, recorded: str):
    """Compare two recordings via their time-averaged MFCC vectors.

    NOTE: the misspelling "similarty" in the name is kept for caller
    compatibility.

    Parameters
    ----------
    original : str
        Path to the reference pronunciation audio file.
    recorded : str
        Path to the user's recorded pronunciation audio file.

    Returns
    -------
    tuple
        ``(distance, accuracy_percentage)`` — the Euclidean distance between
        the two averaged MFCC vectors, and their cosine similarity scaled
        to a percentage.
    """
    reference_audio, _ = load_and_resample_audio(original)
    candidate_audio, sample_rate = load_and_resample_audio(recorded)

    reference_features = calculate_mfcc(reference_audio.flatten(), sample_rate)
    candidate_features = calculate_mfcc(candidate_audio.flatten(), sample_rate)

    # Euclidean distance between the averaged coefficient vectors.
    distance = np.linalg.norm(reference_features.flatten() - candidate_features.flatten())

    # Cosine similarity rescaled to a 0-100 "accuracy" figure.
    accuracy_percentage = calculate_similarity(reference_features, candidate_features) * 100

    return distance, accuracy_percentage
|
|