# Spaces status: Runtime error
import librosa | |
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel | |
import soundfile as sf | |
from sklearn.metrics.pairwise import cosine_similarity | |
import numpy as np | |
# Checkpoint identifier shared by the feature extractor and the model, so both
# are always built from the same pretrained weights.
model_id = "facebook/w2v-bert-2.0"

# Both objects are instantiated once, at import time, as module-level globals.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path=model_id
)
model = Wav2Vec2BertModel.from_pretrained(
    pretrained_model_name_or_path=model_id
)
def load_and_resample_audio(file_path, target_sample_rate=16000):
    """Read an audio file and bring it to ``target_sample_rate``.

    Args:
        file_path: Location of the audio file, passed straight to
            ``soundfile.read``.
        target_sample_rate: Sampling rate, in Hz, the returned samples
            should have.

    Returns:
        A ``(audio, sample_rate)`` tuple. ``sample_rate`` always describes
        the returned samples: it equals ``target_sample_rate`` whenever
        resampling took place, and the file's native rate otherwise (the
        two are equal in that case).
    """
    audio_input, sample_rate = sf.read(file_path)
    if sample_rate != target_sample_rate:
        # soundfile lays multi-channel audio out as (frames, channels) while
        # librosa resamples along the last axis, so put time last for the
        # call and restore the layout afterwards.  ``.T`` is a no-op for
        # mono (1-D) input.
        audio_input = librosa.resample(
            audio_input.T, orig_sr=sample_rate, target_sr=target_sample_rate
        ).T
        # Report the rate of the data actually returned, not the file's.
        sample_rate = target_sample_rate
    return audio_input, sample_rate
def calculate_mfcc(audio_data, sample_rate):
    """Summarise a signal as the time-average of its MFCCs.

    Args:
        audio_data: Audio samples handed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sampling rate of ``audio_data``, passed as ``sr``.

    Returns:
        The MFCC matrix averaged over its time dimension.
    """
    coefficients = librosa.feature.mfcc(y=audio_data, sr=sample_rate)
    # After the transpose, axis 0 is the time dimension being averaged out.
    return coefficients.T.mean(axis=0)
def calculate_similarity(mfccs1, mfccs2):
    """Return the cosine similarity between two feature arrays.

    Each input is flattened into a single row so that the pairwise
    similarity matrix holds exactly one entry, which is returned.
    """
    row_a = mfccs1.reshape(1, -1)
    row_b = mfccs2.reshape(1, -1)
    score_matrix = cosine_similarity(row_a, row_b)
    return score_matrix[0, 0]
def mfcc_similarty_check(original: str, recorded: str):
    """Compare two recordings through their time-averaged MFCC vectors.

    Args:
        original: Path of the reference audio file.
        recorded: Path of the audio file to compare with the reference.

    Returns:
        A ``(distance, accuracy_percentage)`` tuple: the Euclidean distance
        between the two averaged MFCC vectors, and their cosine similarity
        multiplied by 100.
    """
    # Both signals are resampled to this rate, and the same rate is used for
    # MFCC extraction.  The rate reported for a file can be its native one,
    # which would not match the resampled samples being analysed.
    analysis_rate = 16000

    correct_pronunciation_audio, _ = load_and_resample_audio(
        original, analysis_rate
    )
    user_pronunciation_audio, _ = load_and_resample_audio(
        recorded, analysis_rate
    )

    # Extract MFCCs from audio data
    correct_mfccs = calculate_mfcc(correct_pronunciation_audio, analysis_rate)
    user_mfccs = calculate_mfcc(user_pronunciation_audio, analysis_rate)

    distance = np.linalg.norm(correct_mfccs.flatten() - user_mfccs.flatten())

    # Calculate cosine similarity using MFCCs
    similarity_score = calculate_similarity(correct_mfccs, user_mfccs)
    accuracy_percentage = similarity_score * 100
    return distance, accuracy_percentage