import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go
import warnings
import os
from scipy.stats import kurtosis, skew
warnings.filterwarnings('ignore')
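# Pipeline overview:
#   1. librosa extracts prosodic features (pitch, energy, tempo, spectral shape, MFCCs).
#   2. openai/whisper-tiny transcribes the recording to text.
#   3. j-hartmann/emotion-english-distilroberta-base classifies the emotion of the transcript.
#   4. Plotly figures are rendered to HTML and displayed through Gradio HTML components.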
def extract_prosodic_features(waveform, sr):
    """Extract prosodic features from audio"""
    try:
        features = {}

        # 1. Pitch (F0) Features
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        f0_contour = []
        for t in range(pitches.shape[1]):
            pitches_at_t = pitches[:, t]
            mags = magnitudes[:, t]
            pitch_index = mags.argmax()
            f0_contour.append(pitches_at_t[pitch_index])
        f0_contour = np.array(f0_contour)
        f0_contour = f0_contour[f0_contour > 0]  # Remove zero pitches

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0

        # 2. Energy/Intensity Features
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)

        # 3. Rhythm Features
        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]

        # 4. Voice Quality Features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # 5. MFCC Features
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i}_std'] = np.std(mfccs[i])

        return features
    except Exception as e:
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None
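# Minimal standalone usage sketch for the feature extractor (illustrative only;
# "sample.wav" is a hypothetical path, not a file shipped with this Space):
#   y, sr = librosa.load("sample.wav", sr=16000)
#   feats = extract_prosodic_features(y, sr)
#   print(feats["pitch_mean"], feats["tempo"], feats["mfcc_0_mean"])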
def create_feature_plots(features):
    """Create visualizations for audio features"""
    try:
        # Build a single figure with grouped bar traces and a line trace
        fig = go.Figure()

        # 1. Pitch Features
        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range']
        }
        fig.add_trace(go.Bar(
            name='Pitch Features',
            x=list(pitch_data.keys()),
            y=list(pitch_data.values()),
            marker_color='blue'
        ))

        # 2. Energy Features
        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range']
        }
        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data.keys()],
            y=list(energy_data.values()),
            marker_color='red'
        ))

        # 3. MFCC Plot
        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
        fig.add_trace(go.Scatter(
            name='MFCC Coefficients',
            y=mfcc_means,
            mode='lines+markers',
            marker_color='green'
        ))

        # Update layout
        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group'
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error in create_feature_plots: {str(e)}")
        return None
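# Note: to_html(include_plotlyjs=True) embeds the plotly.js library in the returned
# string, so each gr.HTML output below renders as a self-contained document.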
def load_models():
    """Initialize and load all required models"""
    global processor, whisper_model, emotion_tokenizer, emotion_model
    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

        whisper_model.to("cpu")
        emotion_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False
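# The models are stored as module-level globals, loaded once at startup, and kept
# on CPU; this Space does not assume a GPU is available.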
def create_emotion_plot(emotions):
    """Create emotion analysis visualization"""
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                marker_color='rgb(55, 83, 109)'
            )
        ])
        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )
        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating emotion plot: {str(e)}")
        return None
def analyze_audio(audio_input):
    """Main function to analyze audio input"""
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None
        print(f"Processing audio input: {type(audio_input)}")

        # Handle audio input (gr.Audio with type="filepath" passes a path string)
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input
        print(f"Loading audio from path: {audio_path}")

        # Load audio, resampled to 16 kHz for Whisper
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {waveform.shape}, SR: {sr}")

        # Extract voice features
        print("Extracting voice features...")
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Error extracting voice features", None, None

        # Create feature plots
        print("Creating feature visualizations...")
        feature_viz = create_feature_plots(features)
        if feature_viz is None:
            return "Error creating feature visualizations", None, None

        # Transcribe audio
        print("Transcribing audio...")
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Analyze emotions
        print("Analyzing emotions...")
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

        # Use the model's own label order so scores are not misaligned
        # (j-hartmann/emotion-english-distilroberta-base has 7 classes, including 'disgust')
        emotion_labels = [emotion_model.config.id2label[i] for i in range(emotions.shape[-1])]
        emotion_scores = {
            label: float(score)
            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        # Create emotion visualization
        emotion_viz = create_emotion_plot(emotion_scores)
        if emotion_viz is None:
            return "Error creating emotion visualization", None, None

        # Create analysis summary
        summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
"""
        return summary, emotion_viz, feature_viz
    except Exception as e:
        error_msg = f"Error in audio analysis: {str(e)}"
        print(error_msg)
        return error_msg, None, None
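# analyze_audio returns (summary_text, emotion_html, feature_html), matching the
# order of the three Gradio output components defined below.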
# Load models at startup
print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")

# Create Gradio interface
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis")
    ],
    title="Voice Analysis System",
    description="""
This application analyzes voice recordings to extract various characteristics:

1. Voice Features:
   - Pitch analysis
   - Energy patterns
   - Speech rate
   - Voice quality
2. Emotional Content:
   - Emotion detection
   - Emotional intensity
3. Speech Content:
   - Text transcription

Upload an audio file or record directly through your microphone.
""",
    examples=None,
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch(share=True)