import librosa
import numpy as np
import soundfile as sf
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel

# Pretrained W2V-BERT model and its feature extractor, loaded once at import
# time. NOTE(review): neither object is used by the MFCC helpers below;
# presumably other code consumes them -- confirm before removing.
model_id = "facebook/w2v-bert-2.0"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2BertModel.from_pretrained(model_id)


def load_and_resample_audio(file_path, target_sample_rate=16000):
    """Read an audio file and return it as mono audio at the target rate.

    Args:
        file_path: Path (or file-like object) accepted by ``soundfile.read``.
        target_sample_rate: Sample rate, in Hz, the returned audio should have.

    Returns:
        A ``(audio, sample_rate)`` tuple where ``audio`` is a 1-D array and
        ``sample_rate`` is the rate that ``audio`` actually has, i.e.
        ``target_sample_rate`` whenever resampling took place.
    """
    audio_input, sample_rate = sf.read(file_path)

    # soundfile returns (frames, channels) for multi-channel files, while
    # librosa operates along the last axis. Downmix to mono so that both
    # resampling and MFCC extraction run along the time axis.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)

    if sample_rate != target_sample_rate:
        audio_input = librosa.resample(
            audio_input, orig_sr=sample_rate, target_sr=target_sample_rate
        )
        # Report the rate of the data we return, not the rate of the file.
        sample_rate = target_sample_rate

    return audio_input, sample_rate


def calculate_mfcc(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
    """Return the time-averaged MFCC vector of an audio signal.

    Args:
        audio_data: Audio samples passed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sample rate of ``audio_data`` in Hz.

    Returns:
        A 1-D array with one mean value per MFCC coefficient (librosa's
        default number of coefficients is used).
    """
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate)
    # Transpose to (frames, coefficients) and average over the frames.
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled


def calculate_similarity(mfccs1: np.ndarray, mfccs2: np.ndarray):
    """Return the cosine similarity between two feature vectors.

    Args:
        mfccs1: First feature vector; it is reshaped to a single row.
        mfccs2: Second feature vector; it is reshaped to a single row.

    Returns:
        The scalar cosine similarity of the two vectors.
    """
    # sklearn expects 2-D (n_samples, n_features) inputs, hence the reshape.
    similarity = cosine_similarity(mfccs1.reshape(1, -1), mfccs2.reshape(1, -1))
    return similarity[0][0]


def mfcc_similarty_check(original: str, recorded: str):
    """Compare a recorded pronunciation against a reference recording.

    Both files are loaded, brought to a common sample rate, and reduced to
    time-averaged MFCC vectors, which are then compared.

    Args:
        original: Path to the reference (correct) pronunciation audio.
        recorded: Path to the user's recorded pronunciation audio.

    Returns:
        A ``(distance, accuracy_percentage)`` tuple: the Euclidean distance
        between the two MFCC vectors, and their cosine similarity multiplied
        by 100. Cosine similarity lies in [-1, 1], so the percentage is not
        clamped and can be negative.
    """
    correct_audio, correct_rate = load_and_resample_audio(original)
    user_audio, user_rate = load_and_resample_audio(recorded)

    # Extract MFCCs, giving each clip the sample rate it actually has.
    correct_mfccs = calculate_mfcc(correct_audio, correct_rate)
    user_mfccs = calculate_mfcc(user_audio, user_rate)

    distance = np.linalg.norm(correct_mfccs - user_mfccs)

    # Calculate cosine similarity using MFCCs.
    similarity_score = calculate_similarity(correct_mfccs, user_mfccs)
    accuracy_percentage = similarity_score * 100

    return distance, accuracy_percentage