import gradio as gr
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import librosa
import numpy as np
import plotly.graph_objects as go
import warnings

warnings.filterwarnings('ignore')


def extract_prosodic_features(waveform, sr):
    """Extract prosodic features from audio."""
    try:
        features = {}
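
        # Pitch contour: librosa.piptrack returns per-frame pitch candidates and
        # their magnitudes (shape: n_freq_bins x n_frames). Keep the strongest
        # candidate in each frame, then drop unvoiced (zero-pitch) frames.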
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        f0_contour = []
        for t in range(pitches.shape[1]):
            pitches_at_t = pitches[:, t]
            mags = magnitudes[:, t]
            pitch_index = mags.argmax()
            f0_contour.append(pitches_at_t[pitch_index])
        f0_contour = np.array(f0_contour)
        f0_contour = f0_contour[f0_contour > 0]

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0
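
        # Energy: frame-wise RMS amplitude and its spread.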
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)
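
        # Tempo estimated from the onset-strength envelope. Note:
        # librosa.beat.tempo is deprecated in librosa 0.10+ (moved to
        # librosa.feature.rhythm.tempo); adjust the call on newer releases.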
        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]
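
        # Spectral shape: centroid ("brightness") and roll-off frequency.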
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)

        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
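
        # Timbre: mean and standard deviation of the first 13 MFCCs.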
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i}_std'] = np.std(mfccs[i])

        return features

    except Exception as e:
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None


def create_feature_plots(features):
    """Create visualizations for audio features."""
    try:
        fig = go.Figure()
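
        # Pitch statistics as one grouped bar trace.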
        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range']
        }

        fig.add_trace(go.Bar(
            name='Pitch Features',
            x=list(pitch_data.keys()),
            y=list(pitch_data.values()),
            marker_color='blue'
        ))
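
        # Energy statistics as a second bar trace; the keys are prefixed so
        # they do not collide with the pitch bars on the shared x-axis.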
        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range']
        }

        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data.keys()],
            y=list(energy_data.values()),
            marker_color='red'
        ))
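
        # Mean MFCC values as a line trace over the coefficient index.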
        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
        fig.add_trace(go.Scatter(
            name='MFCC Coefficients',
            y=mfcc_means,
            mode='lines+markers',
            marker_color='green'
        ))

        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group'
        )

        return fig.to_html(include_plotlyjs=True)

    except Exception as e:
        print(f"Error in create_feature_plots: {str(e)}")
        return None


def load_models():
    """Initialize and load all required models."""
    global processor, whisper_model, emotion_tokenizer, emotion_model

    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
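
        # The emotion classifier is text-based: it scores the Whisper
        # transcription, not the raw audio signal.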
        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

        # Keep everything on CPU for portability.
        whisper_model.to("cpu")
        emotion_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False


def create_emotion_plot(emotions):
    """Create emotion analysis visualization."""
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                marker_color='rgb(55, 83, 109)'
            )
        ])
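
        # Scores are softmax probabilities, so pin the y-axis to [0, 1].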
        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating emotion plot: {str(e)}")
        return None


def analyze_audio(audio_input):
    """Main function to analyze audio input."""
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None

        print(f"Processing audio input: {type(audio_input)}")

        # gr.Audio(type="filepath") passes a file path string; the tuple branch
        # is only a defensive fallback for other component configurations.
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input
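
        # Load and resample to 16 kHz mono, the rate Whisper expects.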
        print(f"Loading audio from path: {audio_path}")

        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {waveform.shape}, SR: {sr}")

        print("Extracting voice features...")
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Error extracting voice features", None, None

        print("Creating feature visualizations...")
        feature_viz = create_feature_plots(features)
        if feature_viz is None:
            return "Error creating feature visualizations", None, None
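
        # Transcribe with Whisper: the processor converts the waveform into
        # log-mel input features, generate() produces token ids, and
        # batch_decode() turns them back into text.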
        print("Transcribing audio...")
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features

        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
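
        # Classify emotions from the transcription text; long inputs are
        # truncated to the tokenizer's 512-token limit.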
        print("Analyzing emotions...")
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

        # Read the label order from the model config rather than hard-coding it:
        # this checkpoint has seven classes (anger, disgust, fear, joy, neutral,
        # sadness, surprise), and a hand-written list that omits one silently
        # misaligns every score after it.
        emotion_labels = [
            emotion_model.config.id2label[i]
            for i in range(emotion_model.config.num_labels)
        ]
        emotion_scores = {
            label: float(score)
            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        emotion_viz = create_emotion_plot(emotion_scores)
        if emotion_viz is None:
            return "Error creating emotion visualization", None, None
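
        # Assemble a plain-text summary of the results.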
        summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
"""

        return summary, emotion_viz, feature_viz

    except Exception as e:
        error_msg = f"Error in audio analysis: {str(e)}"
        print(error_msg)
        return error_msg, None, None


print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")
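
# Gradio interface: audio in (microphone or file upload), three outputs
# (a text summary plus two Plotly figures rendered as HTML).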
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis")
    ],
    title="Voice Analysis System",
    description="""
This application analyzes voice recordings to extract various characteristics:

1. Voice Features:
   - Pitch analysis
   - Energy patterns
   - Speech rate
   - Voice quality

2. Emotional Content:
   - Emotion detection
   - Emotional intensity

3. Speech Content:
   - Text transcription

Upload an audio file or record directly through your microphone.
""",
    examples=None,
    cache_examples=False
)


if __name__ == "__main__":
    demo.launch(share=True)