mother-tongue / app /mfcc.py
r3Vibe's picture
update
24f6bf4
raw
history blame contribute delete
No virus
1.75 kB
import librosa
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel
import soundfile as sf
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Pretrained W2v-BERT 2.0 checkpoint and its matching feature extractor,
# both loaded via Hugging Face `from_pretrained` when this module is imported.
# NOTE(review): neither `feature_extractor` nor `model` is referenced by the
# MFCC functions visible in this file -- presumably other modules import them
# from here; confirm before moving or removing, since the load runs on import.
model_id = "facebook/w2v-bert-2.0"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2BertModel.from_pretrained(model_id)
def load_and_resample_audio(file_path, target_sample_rate=16000):
    """Read an audio file and resample it to ``target_sample_rate`` if needed.

    Args:
        file_path: Anything ``soundfile.read`` accepts as its first argument.
        target_sample_rate: Desired sample rate of the returned audio, in Hz.

    Returns:
        A ``(audio, sample_rate)`` tuple. ``sample_rate`` is the rate of the
        *returned* samples: the file's own rate when no resampling was
        needed, otherwise ``target_sample_rate``.
    """
    audio_input, sample_rate = sf.read(file_path)
    if sample_rate == target_sample_rate:
        return audio_input, sample_rate

    # soundfile lays multi-channel audio out as (frames, channels), so time
    # is axis 0. librosa resamples the last axis by default, which would
    # operate across channels for stereo input; axis=0 is correct for both
    # the 1-D (mono) and 2-D (multi-channel) cases.
    audio_input = librosa.resample(
        audio_input,
        orig_sr=sample_rate,
        target_sr=target_sample_rate,
        axis=0,
    )
    # Report the rate the samples now have, not the rate stored in the file,
    # so callers can pass it straight to rate-dependent analysis.
    return audio_input, target_sample_rate
def calculate_mfcc(audio_data, sample_rate):
    """Summarise an audio signal as a single time-averaged MFCC vector.

    Args:
        audio_data: Audio samples, passed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sample rate of ``audio_data``, passed as ``sr``.

    Returns:
        The MFCC matrix transposed and averaged over its leading axis
        (i.e. averaged across the time dimension).
    """
    coefficients = librosa.feature.mfcc(y=audio_data, sr=sample_rate)
    # After the transpose, time is the leading axis; collapse it.
    return np.transpose(coefficients).mean(axis=0)
def calculate_similarity(mfccs1, mfccs2):
    """Return the cosine similarity between two MFCC arrays as a scalar.

    Each input is flattened into a single row so that ``cosine_similarity``
    compares them as one sample each; the only entry of the resulting 1x1
    matrix is returned.
    """
    first_row = mfccs1.reshape(1, -1)
    second_row = mfccs2.reshape(1, -1)
    pairwise = cosine_similarity(first_row, second_row)
    return pairwise[0, 0]
def mfcc_similarty_check(original: str, recorded: str):
    """Compare a recorded pronunciation against a reference using MFCCs.

    Args:
        original: Path to the reference (correct) pronunciation audio.
        recorded: Path to the user's recorded pronunciation audio.

    Returns:
        A ``(distance, accuracy_percentage)`` tuple: the Euclidean distance
        between the two time-averaged MFCC vectors, and their cosine
        similarity multiplied by 100. Cosine similarity lies in [-1, 1], so
        the percentage lies in [-100, 100].
    """
    # Both clips are resampled to this rate on load, so it is the rate the
    # MFCCs must be computed with. Use it explicitly for both clips rather
    # than a rate reported for just one of the files, which need not match
    # the other file or the resampled audio.
    analysis_rate = 16000

    correct_pronunciation_audio, _ = load_and_resample_audio(
        original, target_sample_rate=analysis_rate
    )
    user_pronunciation_audio, _ = load_and_resample_audio(
        recorded, target_sample_rate=analysis_rate
    )

    # Extract time-averaged MFCC vectors from both clips.
    correct_mfccs = calculate_mfcc(correct_pronunciation_audio, analysis_rate)
    user_mfccs = calculate_mfcc(user_pronunciation_audio, analysis_rate)

    # Euclidean distance between the two feature vectors.
    distance = np.linalg.norm(correct_mfccs.flatten() - user_mfccs.flatten())

    # Cosine similarity of the same vectors, expressed as a percentage.
    similarity_score = calculate_similarity(correct_mfccs, user_mfccs)
    accuracy_percentage = similarity_score * 100

    return distance, accuracy_percentage