import html
import os
import warnings

import gradio as gr
import librosa
import numpy as np
import plotly.graph_objects as go
import torch
from scipy.stats import kurtosis, skew
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer

warnings.filterwarnings('ignore')
def _estimate_tempo(onset_env, sr):
    """Return the tempo estimate (BPM) for an onset-strength envelope as a float."""
    # librosa 0.10 moved tempo() under librosa.feature and deprecated the
    # librosa.beat.tempo alias; prefer the new location and only fall back to
    # the old one on releases that predate the move.
    tempo_fn = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo
    tempo = tempo_fn(onset_envelope=onset_env, sr=sr)
    return float(np.atleast_1d(tempo)[0])


def extract_prosodic_features(waveform, sr):
    """Extract prosodic and spectral features from an audio signal.

    Args:
        waveform: Audio samples, passed unchanged to the librosa extractors.
        sr: Sampling rate of ``waveform`` in Hz.

    Returns:
        A dict of scalar features, or ``None`` if any extraction step raises
        (the error is printed). Keys:

        * ``pitch_mean`` / ``pitch_std`` / ``pitch_range`` - statistics of the
          per-frame dominant pitch; all 0 when no frame has a positive pitch.
        * ``energy_mean`` / ``energy_std`` / ``energy_range`` - RMS statistics.
        * ``tempo`` - tempo estimate derived from the onset envelope.
        * ``spectral_centroid_mean`` / ``spectral_rolloff_mean``.
        * ``mfcc_{i}_mean`` / ``mfcc_{i}_std`` for ``i`` in ``0..12``.
    """
    try:
        features = {}

        # piptrack returns pitch candidates and their magnitudes per
        # (frequency bin, frame); keep the candidate with the largest
        # magnitude in every frame, then drop frames without a pitch.
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        strongest_bin = magnitudes.argmax(axis=0)
        f0_contour = pitches[strongest_bin, np.arange(pitches.shape[1])]
        f0_contour = f0_contour[f0_contour > 0]

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0

        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)

        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        features['tempo'] = _estimate_tempo(onset_env, sr)

        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)

        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i, coefficients in enumerate(mfccs):
            features[f'mfcc_{i}_mean'] = np.mean(coefficients)
            features[f'mfcc_{i}_std'] = np.std(coefficients)

        return features
    except Exception as e:
        # Best-effort contract: callers test for None rather than catching.
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None
def create_feature_plots(features):
    """Render pitch, energy and MFCC features as an embeddable Plotly chart.

    Args:
        features: Mapping that provides ``pitch_mean``, ``pitch_std``,
            ``pitch_range``, ``energy_mean``, ``energy_std``,
            ``energy_range`` and ``mfcc_{i}_mean`` for ``i`` in ``0..12``.

    Returns:
        An HTML string containing an ``<iframe>`` whose ``srcdoc`` is the
        standalone Plotly page, or ``None`` if building the figure fails
        (the error is printed).
    """
    try:
        fig = go.Figure()

        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range'],
        }
        fig.add_trace(go.Bar(
            name='Pitch Features',
            x=list(pitch_data),
            y=list(pitch_data.values()),
            marker_color='blue',
        ))

        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range'],
        }
        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data],
            y=list(energy_data.values()),
            marker_color='red',
        ))

        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
        fig.add_trace(go.Scatter(
            name='MFCC Coefficients',
            # Explicit category labels: without x the trace falls back to the
            # indices 0..12, which on the bars' categorical axis are drawn on
            # top of the pitch/energy categories.
            x=[f"MFCC {i}" for i in range(len(mfcc_means))],
            y=mfcc_means,
            mode='lines+markers',
            marker_color='green',
        ))

        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group',
        )

        # The Plotly page relies on <script> tags, which a static-HTML
        # container does not execute. Wrapping the page in an iframe gives it
        # its own document in which the scripts run.
        page = fig.to_html(include_plotlyjs=True)
        return (
            f'<iframe srcdoc="{html.escape(page, quote=True)}" '
            'style="width: 100%; height: 630px; border: none;"></iframe>'
        )
    except Exception as e:
        # Best-effort contract: callers test for None rather than catching.
        print(f"Error in create_feature_plots: {str(e)}")
        return None
def load_models():
    """Load the Whisper and emotion-classification models into module globals.

    Populates ``processor``, ``whisper_model``, ``emotion_tokenizer`` and
    ``emotion_model`` and places both models on the CPU.

    Returns:
        ``True`` when everything loaded, ``False`` if any step raised (the
        error is printed).
    """
    global processor, whisper_model, emotion_tokenizer, emotion_model

    whisper_checkpoint = "openai/whisper-tiny"
    emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"

    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
        whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
        emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)

        # Device placement happens only after both checkpoints have loaded.
        for model in (whisper_model, emotion_model):
            model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as exc:
        print(f"Error loading models: {str(exc)}")
        return False
def create_emotion_plot(emotions):
    """Render emotion scores as an embeddable Plotly bar chart.

    Args:
        emotions: Mapping of emotion label to score. The y-axis is fixed to
            the range 0..1.

    Returns:
        An HTML string containing an ``<iframe>`` whose ``srcdoc`` is the
        standalone Plotly page, or ``None`` if building the figure fails
        (the error is printed).
    """
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions),
                y=list(emotions.values()),
                marker_color='rgb(55, 83, 109)',
            )
        ])
        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400,
        )

        # The Plotly page relies on <script> tags, which a static-HTML
        # container does not execute. Wrapping the page in an iframe gives it
        # its own document in which the scripts run.
        page = fig.to_html(include_plotlyjs=True)
        return (
            f'<iframe srcdoc="{html.escape(page, quote=True)}" '
            'style="width: 100%; height: 430px; border: none;"></iframe>'
        )
    except Exception as e:
        # Best-effort contract: callers test for None rather than catching.
        print(f"Error creating emotion plot: {str(e)}")
        return None
def _transcribe(waveform, sr):
    """Run the Whisper model on ``waveform`` and return the decoded text."""
    input_features = processor(
        waveform, sampling_rate=sr, return_tensors="pt"
    ).input_features
    with torch.no_grad():
        predicted_ids = whisper_model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]


def _score_emotions(text):
    """Return ``{label: probability}`` from the emotion classifier for ``text``."""
    encoded = emotion_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        logits = emotion_model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()

    # Take the label names from the model's own config so each probability is
    # paired with the class it belongs to; a hand-written list can drift out
    # of step with the classifier's output order and class count.
    id2label = emotion_model.config.id2label
    return {
        id2label[index]: float(probability)
        for index, probability in enumerate(probabilities)
    }


def analyze_audio(audio_input):
    """Analyze an audio input: voice features, transcription and emotions.

    Args:
        audio_input: Path of the audio file to analyze, or ``None`` when no
            audio was supplied.

    Returns:
        A 3-tuple ``(summary, emotion_html, feature_html)``. On any failure
        the first element is an error message and the other two are ``None``.
    """
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None

        print(f"Processing audio input: {type(audio_input)}")

        # NOTE(review): Gradio's numpy mode is believed to deliver
        # (sample_rate, samples), in which case element 0 is not a path;
        # this branch looks unreachable with type="filepath" - confirm.
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input

        print(f"Loading audio from path: {audio_path}")
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {waveform.shape}, SR: {sr}")

        print("Extracting voice features...")
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Error extracting voice features", None, None

        print("Creating feature visualizations...")
        feature_viz = create_feature_plots(features)
        if feature_viz is None:
            return "Error creating feature visualizations", None, None

        print("Transcribing audio...")
        transcription = _transcribe(waveform, sr)

        print("Analyzing emotions...")
        emotion_scores = _score_emotions(transcription)

        emotion_viz = create_emotion_plot(emotion_scores)
        if emotion_viz is None:
            return "Error creating emotion visualization", None, None

        dominant_emotion = max(emotion_scores, key=emotion_scores.get)
        summary = "\n".join([
            "Voice Analysis Summary:",
            "Speech Content:",
            f"{transcription}",
            "Voice Characteristics:",
            f"- Average Pitch: {features['pitch_mean']:.2f} Hz",
            f"- Pitch Variation: {features['pitch_std']:.2f} Hz",
            f"- Speech Rate (Tempo): {features['tempo']:.2f} BPM",
            f"- Voice Energy: {features['energy_mean']:.4f}",
            f"Dominant Emotion: {dominant_emotion}",
            "",
        ])
        return summary, emotion_viz, feature_viz
    except Exception as e:
        # Top-level boundary for the UI callback: report instead of raising.
        error_msg = f"Error in audio analysis: {str(e)}"
        print(error_msg)
        return error_msg, None, None
# Application start-up: the models are loaded at import time and the module
# refuses to continue without them.
print("Initializing application...")
_models_ready = load_models()
if not _models_ready:
    raise RuntimeError("Failed to load required models")

# Text shown under the title of the interface.
_APP_DESCRIPTION = """
This application analyzes voice recordings to extract various characteristics:
1. Voice Features:
- Pitch analysis
- Energy patterns
- Speech rate
- Voice quality
2. Emotional Content:
- Emotion detection
- Emotional intensity
3. Speech Content:
- Text transcription
Upload an audio file or record directly through your microphone.
"""

# Input: a recording or an upload, handed to analyze_audio as a file path.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Audio Input",
)

# Outputs, in the order analyze_audio returns them.
_result_outputs = [
    gr.Textbox(label="Analysis Summary", lines=10),
    gr.HTML(label="Emotion Analysis"),
    gr.HTML(label="Voice Feature Analysis"),
]

demo = gr.Interface(
    fn=analyze_audio,
    inputs=_audio_input,
    outputs=_result_outputs,
    title="Voice Analysis System",
    description=_APP_DESCRIPTION,
    examples=None,
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch(share=True)