pronunciation_assessment / audio_analysis.py
Shago's picture
Update audio_analysis.py
0de5606 verified
import librosa
import matplotlib.pyplot as plt
import pandas as pd
from utils import word_by_word_table
import shutil
import os
def compare_audio(tts_path, user_audio_path):
if isinstance(tts_path, str):
y_tts, sr_tts = librosa.load(tts_path, sr=None)
elif isinstance(tts_path, tuple):
sr_tts, y_tts = tts_path
else:
raise ValueError("Invalid gTTS input type")
if user_audio_path is None:
return None
if isinstance(user_audio_path, str):
y_user, sr_user = librosa.load(user_audio_path, sr=None)
elif isinstance(user_audio_path, tuple):
sr_user, y_user = user_audio_path
else:
raise ValueError("Invalid user audio input type")
min_len = min(len(y_tts), len(y_user))
y_tts, y_user = y_tts[:min_len], y_user[:min_len]
fig, ax = plt.subplots(3, 1, figsize=(10, 7))
ax[0].plot(y_tts)
ax[0].set_title("Reference (gTTS) Audio")
ax[1].plot(y_user)
ax[1].set_title("Your Recorded Audio")
ax[2].plot(y_tts - y_user, color="red")
ax[2].set_title("Difference (Reference - Recorded)")
plt.tight_layout()
return fig
def compare_both(tts_path, user_audio_path):
fig = compare_audio(tts_path, user_audio_path)
table = word_by_word_table(tts_path, user_audio_path)
return fig, table
def reset_all():
return ("", None, None, None, None, None)