invincible-jha committed: Update app.py

app.py CHANGED
@@ -1,4 +1,7 @@
-#
 import gradio as gr
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -10,6 +13,10 @@ import warnings
 import os
 from scipy.stats import kurtosis, skew
 from anthropic import Anthropic

 # Suppress warnings for cleaner output
 warnings.filterwarnings('ignore')
@@ -21,9 +28,15 @@ emotion_tokenizer = None
 emotion_model = None
 clinical_analyzer = None

-# Part 2: Model Loading and Initialization
 def load_models():
-    """Load and initialize speech and emotion analysis models.
     global processor, whisper_model, emotion_tokenizer, emotion_model

     try:
@@ -48,9 +61,16 @@ def load_models():
         print(f"Error loading models: {str(e)}")
         return False

-# Part 3: Voice Feature Extraction
 def extract_prosodic_features(waveform, sr):
-    """Extract voice features including pitch, energy, and rhythm patterns.
     try:
         # Input validation
         if waveform is None or len(waveform) == 0:
@@ -91,7 +111,7 @@ def extract_prosodic_features(waveform, sr):
         print(f"Pitch extraction error: {e}")
         features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})

-        # Energy analysis
         try:
             rms = librosa.feature.rms(
                 y=waveform,
@@ -109,7 +129,7 @@ def extract_prosodic_features(waveform, sr):
         print(f"Energy extraction error: {e}")
         features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})

-        # Rhythm analysis
         try:
             onset_env = librosa.onset.onset_strength(
                 y=waveform,
@@ -125,6 +145,7 @@ def extract_prosodic_features(waveform, sr):
                 aggregate=None
             )[0]

             features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0

         except Exception as e:
@@ -136,7 +157,6 @@ def extract_prosodic_features(waveform, sr):
         print(f"Feature extraction failed: {e}")
         return None

-# Part 4: Clinical Analysis Integration
 class ClinicalVoiceAnalyzer:
     """Analyze voice characteristics for psychological indicators."""

@@ -144,15 +164,25 @@ class ClinicalVoiceAnalyzer:
         """Initialize the clinical analyzer with API and reference ranges."""
         self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
         self.model = "claude-3-opus-20240229"
         self.reference_ranges = {
-            'pitch': {'min': 150, 'max': 400},
-            'tempo': {'min': 90, 'max': 130},
             'energy': {'min': 0.01, 'max': 0.05}
         }
         print("Clinical analyzer ready")

     def analyze_voice_metrics(self, features, emotions, transcription):
-        """Generate clinical insights from voice and emotion data.
         try:
             prompt = self._create_clinical_prompt(features, emotions, transcription)
             response = self.anthropic.messages.create(
@@ -208,9 +238,15 @@ Basic Voice Analysis (API Unavailable):
 - Voice Energy Level: {features['energy_mean']:.4f}
 - Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""

-# Part 5: Visualization Functions
 def create_feature_plots(features):
-    """Create interactive visualizations of voice features.
     try:
         fig = go.Figure()

@@ -266,7 +302,14 @@ def create_feature_plots(features):
         return None

 def create_emotion_plot(emotions):
-    """Create visualization of emotional analysis.
     try:
         fig = go.Figure(data=[
             go.Bar(
@@ -291,9 +334,18 @@ def create_emotion_plot(emotions):
         print(f"Emotion plot error: {e}")
         return None

-# Part 6: Main Analysis Function
 def analyze_audio(audio_input):
-    """Process audio input and generate comprehensive analysis.
     try:
         # Validate input
         if audio_input is None:
@@ -365,6 +417,19 @@ Voice Characteristics:
 - Speech Rate (Tempo): {features['tempo']:.2f} BPM
 - Voice Energy: {features['energy_mean']:.4f}

 Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
 Emotion Confidence: {max(emotion_scores.values()):.2%}

@@ -379,7 +444,7 @@ Recording Duration: {duration:.2f} seconds
         print(error_msg)
         return error_msg, None, None

-#
 try:
     print("===== Application Startup =====")

@@ -391,6 +456,33 @@ try:
     clinical_analyzer = ClinicalVoiceAnalyzer()
     print("Clinical analyzer initialized")

     # Create Gradio interface
     demo = gr.Interface(
         fn=analyze_audio,
@@ -405,59 +497,26 @@ try:
             gr.HTML(label="Voice Feature Analysis")
         ],
         title="Voice Analysis System with Clinical Interpretation",
-    description=
-    This
-
-
-
-
-
-    - Voice quality metrics
-
-    2. Clinical Analysis:
-    - Mental health indicators
-    - Emotional state evaluation
-    - Risk assessment
-    - Clinical recommendations
-
-    3. Emotional Content:
-    - Emotion detection (6 basic emotions)
-    - Emotional intensity analysis
-
-    For optimal description="""
-    This application provides comprehensive voice analysis with clinical insights:
-
-    1. Voice Features:
-    - Pitch analysis (fundamental frequency and variation)
-    - Energy patterns (volume and intensity)
-    - Speech rate (words per minute)
-    - Voice quality metrics
-
-    2. Clinical Analysis:
-    - Mental health indicators
-    - Emotional state evaluation
-    - Risk assessment
-    - Clinical recommendations
-
-    3. Emotional Content:
-    - Emotion detection (6 basic emotions)
-    - Emotional intensity analysis
-
-    For optimal results:
-    - Record in a quiet environment
-    - Speak clearly and naturally
-    - Keep recordings between 1-5 seconds
-    - Maintain consistent volume
-
-    Upload an audio file or record directly through your microphone.
-    """,
         examples=None,
-        cache_examples=False
     )

-    # Launch the interface
     if __name__ == "__main__":
-        demo.launch(

 except Exception as e:
     print(f"Error during application startup: {str(e)}")
+# app.py - Voice Analysis System with Clinical Interpretation
+# This application provides comprehensive voice analysis with mental health insights
+# using voice biomarkers, emotion detection, and clinical interpretation.
+
 import gradio as gr
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import os
 from scipy.stats import kurtosis, skew
 from anthropic import Anthropic
+from dotenv import load_dotenv
+
+# Load environment variables for API keys
+load_dotenv()

 # Suppress warnings for cleaner output
 warnings.filterwarnings('ignore')
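The new load_dotenv() call pulls the Anthropic key from a local .env file into the process environment before the analyzer reads it with os.getenv. A minimal, self-contained illustration; the .env layout in the comment is an assumption, only the ANTHROPIC_API_KEY name comes from the code:

# Assumed .env file placed next to app.py:
#   ANTHROPIC_API_KEY=<your key>
import os
from dotenv import load_dotenv

load_dotenv()                                  # copy .env entries into os.environ
api_key = os.getenv("ANTHROPIC_API_KEY")       # None if the variable is missing
print("key configured:", api_key is not None)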
 emotion_model = None
 clinical_analyzer = None

 def load_models():
+    """Load and initialize speech recognition and emotion analysis models.
+
+    This function handles the initialization of both Whisper (for speech recognition)
+    and the emotion detection model, setting them up for CPU-based inference.
+
+    Returns:
+        bool: True if all models loaded successfully, False otherwise
+    """
     global processor, whisper_model, emotion_tokenizer, emotion_model

     try:

         print(f"Error loading models: {str(e)}")
         return False

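The body of load_models between try: and the error handler sits outside the hunk context. A minimal sketch of what it might contain, assuming CPU inference as the new docstring states; the checkpoint names (openai/whisper-tiny, j-hartmann/emotion-english-distilroberta-base) are placeholders, not read from this commit:

from transformers import (WhisperProcessor, WhisperForConditionalGeneration,
                          AutoTokenizer, AutoModelForSequenceClassification)

def load_models_sketch():
    try:
        # Whisper handles speech-to-text; everything stays on CPU
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
        whisper_model.to("cpu").eval()

        # A text emotion classifier applied to the transcription (placeholder checkpoint)
        emotion_name = "j-hartmann/emotion-english-distilroberta-base"
        emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_name)
        emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_name)
        emotion_model.to("cpu").eval()
        return processor, whisper_model, emotion_tokenizer, emotion_model
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return None

Unlike the app's function, this sketch returns the loaded objects instead of setting module-level globals.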
|
64 |
def extract_prosodic_features(waveform, sr):
|
65 |
+
"""Extract voice features including pitch, energy, and rhythm patterns.
|
66 |
+
|
67 |
+
Args:
|
68 |
+
waveform (numpy.ndarray): Audio signal data
|
69 |
+
sr (int): Sampling rate of the audio
|
70 |
+
|
71 |
+
Returns:
|
72 |
+
dict: Dictionary containing extracted features or None if extraction fails
|
73 |
+
"""
|
74 |
try:
|
75 |
# Input validation
|
76 |
if waveform is None or len(waveform) == 0:
|
|
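The pitch block itself is elided between this context and the error handler below; only the fallback values (pitch_mean 160.0 Hz) are visible. A hedged sketch of one way to fill it in with librosa.piptrack; the frequency bounds and the per-frame selection are assumptions:

import numpy as np
import librosa

def pitch_features_sketch(waveform, sr):
    pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr, fmin=50, fmax=500)
    # strongest pitch candidate per frame; unvoiced frames come out as 0
    best = pitches[magnitudes.argmax(axis=0), np.arange(pitches.shape[1])]
    voiced = best[best > 0]
    if voiced.size == 0:
        return {'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0}
    return {'pitch_mean': float(np.mean(voiced)),
            'pitch_std': float(np.std(voiced)),
            'pitch_range': float(np.ptp(voiced))}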
         print(f"Pitch extraction error: {e}")
         features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})

+        # Energy analysis with noise handling
         try:
             rms = librosa.feature.rms(
                 y=waveform,
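The rms call is cut off after y=waveform, so its remaining arguments and the statistics derived from it are not shown. A short sketch of the likely shape of this step; the frame and hop sizes are assumptions, and the feature names mirror the fallback dict below:

import numpy as np
import librosa

def energy_features_sketch(waveform, frame_length=2048, hop_length=512):
    rms = librosa.feature.rms(y=waveform, frame_length=frame_length, hop_length=hop_length)[0]
    return {'energy_mean': float(np.mean(rms)),
            'energy_std': float(np.std(rms)),
            'energy_range': float(np.ptp(rms))}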
         print(f"Energy extraction error: {e}")
         features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})

+        # Rhythm analysis with tempo validation
         try:
             onset_env = librosa.onset.onset_strength(
                 y=waveform,
                 aggregate=None
             )[0]

+            # Validate tempo within normal speech range (40-240 BPM)
             features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0

         except Exception as e:

         print(f"Feature extraction failed: {e}")
         return None

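Between onset_strength and the tempo validation, the call that actually produces tempo is elided; only aggregate=None and )[0] survive in the context. A sketch consistent with those fragments; note that librosa.beat.tempo is deprecated in recent librosa releases in favour of librosa.feature.rhythm.tempo, so the exact call used here is an assumption:

import librosa

def tempo_feature_sketch(waveform, sr):
    onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
    tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr, aggregate=None)[0]
    # same clamp as the diff: keep plausible speech tempos, otherwise fall back to 120 BPM
    return float(tempo) if 40 <= tempo <= 240 else 120.0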
 class ClinicalVoiceAnalyzer:
     """Analyze voice characteristics for psychological indicators."""

         """Initialize the clinical analyzer with API and reference ranges."""
         self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
         self.model = "claude-3-opus-20240229"
+        # Define normal ranges for voice metrics based on clinical research
         self.reference_ranges = {
+            'pitch': {'min': 150, 'max': 400},   # Hz
+            'tempo': {'min': 90, 'max': 130},    # BPM
             'energy': {'min': 0.01, 'max': 0.05}
         }
         print("Clinical analyzer ready")

     def analyze_voice_metrics(self, features, emotions, transcription):
+        """Generate clinical insights from voice and emotion data.
+
+        Args:
+            features (dict): Extracted voice features
+            emotions (dict): Detected emotion scores
+            transcription (str): Speech content
+
+        Returns:
+            str: Formatted clinical analysis or backup analysis if API fails
+        """
         try:
             prompt = self._create_clinical_prompt(features, emotions, transcription)
             response = self.anthropic.messages.create(
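The arguments of the messages.create call are outside the hunk. A minimal sketch of the Anthropic Messages API usage implied by self.model and the prompt built above; the max_tokens value is an assumption:

from anthropic import Anthropic
import os

def clinical_analysis_sketch(prompt: str, model: str = "claude-3-opus-20240229") -> str:
    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    response = client.messages.create(
        model=model,
        max_tokens=1000,                                 # assumed; not visible in the diff
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text                      # first content block of the reply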
 - Voice Energy Level: {features['energy_mean']:.4f}
 - Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""

 def create_feature_plots(features):
+    """Create interactive visualizations of voice features.
+
+    Args:
+        features (dict): Dictionary of extracted voice features
+
+    Returns:
+        str: HTML representation of the interactive plots
+    """
     try:
         fig = go.Figure()

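The traces added to fig are not part of the hunk. A hedged sketch of a feature plot that would satisfy the docstring's contract of returning HTML; the feature selection and layout are assumptions:

import plotly.graph_objects as go

def feature_plot_sketch(features: dict) -> str:
    names = ['pitch_mean', 'pitch_std', 'tempo', 'energy_mean']
    fig = go.Figure()
    fig.add_trace(go.Bar(x=names, y=[features.get(n, 0.0) for n in names]))
    fig.update_layout(title='Voice Feature Analysis', template='plotly_white')
    return fig.to_html(include_plotlyjs='cdn', full_html=False)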
         return None

 def create_emotion_plot(emotions):
+    """Create visualization of emotional analysis.
+
+    Args:
+        emotions (dict): Dictionary of emotion scores
+
+    Returns:
+        str: HTML representation of the emotion plot
+    """
     try:
         fig = go.Figure(data=[
             go.Bar(
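Only the opening of the go.Bar trace is visible. A compact sketch of the rest, again returning HTML as the docstring promises; the axis titles and range are assumptions:

import plotly.graph_objects as go

def emotion_plot_sketch(emotions: dict) -> str:
    fig = go.Figure(data=[go.Bar(x=list(emotions.keys()), y=list(emotions.values()))])
    fig.update_layout(title='Emotion Analysis', yaxis_title='Confidence',
                      yaxis_range=[0, 1], template='plotly_white')
    return fig.to_html(include_plotlyjs='cdn', full_html=False)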
         print(f"Emotion plot error: {e}")
         return None

 def analyze_audio(audio_input):
+    """Process audio input and generate comprehensive analysis.
+
+    This is the main function that coordinates the entire analysis pipeline,
+    including feature extraction, emotion detection, and clinical interpretation.
+
+    Args:
+        audio_input: Audio file path or tuple containing audio data
+
+    Returns:
+        tuple: (analysis_summary, emotion_visualization, feature_visualization)
+    """
     try:
         # Validate input
         if audio_input is None:
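The pipeline steps inside analyze_audio (resampling, transcription, emotion scoring) are elided. A sketch of the Whisper transcription step the docstring describes, using the processor and whisper_model loaded earlier; the 16 kHz resample is an assumption based on Whisper's expected input:

import librosa
import torch

def transcribe_sketch(audio_path, processor, whisper_model) -> str:
    waveform, sr = librosa.load(audio_path, sr=16000, mono=True)   # Whisper expects 16 kHz mono
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = whisper_model.generate(inputs.input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]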
 - Speech Rate (Tempo): {features['tempo']:.2f} BPM
 - Voice Energy: {features['energy_mean']:.4f}

+Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
+Emotion Confidence: {max(emotion_scores.values()):.2%}
+
+        summary = f"""Voice Analysis Summary:
+
+Speech Content:
+{transcription}
+
+Voice Characteristics:
+- Average Pitch: {features['pitch_mean']:.2f} Hz
+- Pitch Variation: {features['pitch_std']:.2f} Hz
+- Speech Rate (Tempo): {features['tempo']:.2f} BPM
+- Voice Energy: {features['energy_mean']:.4f}
+
 Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
 Emotion Confidence: {max(emotion_scores.values()):.2%}

         print(error_msg)
         return error_msg, None, None

+# Application initialization and Gradio interface setup
 try:
     print("===== Application Startup =====")

     clinical_analyzer = ClinicalVoiceAnalyzer()
     print("Clinical analyzer initialized")

+    # Define the interface description
+    description = """This application provides comprehensive voice analysis with clinical insights:
+
+1. Voice Features:
+- Pitch analysis (fundamental frequency and variation)
+- Energy patterns (volume and intensity)
+- Speech rate (words per minute)
+- Voice quality metrics
+
+2. Clinical Analysis:
+- Mental health indicators
+- Emotional state evaluation
+- Risk assessment
+- Clinical recommendations
+
+3. Emotional Content:
+- Emotion detection (6 basic emotions)
+- Emotional intensity analysis
+
+For optimal results:
+- Record in a quiet environment
+- Speak clearly and naturally
+- Keep recordings between 1-5 seconds
+- Maintain consistent volume
+
+Upload an audio file or record directly through your microphone."""
+
     # Create Gradio interface
     demo = gr.Interface(
         fn=analyze_audio,
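The inputs and the first two outputs of gr.Interface fall outside the hunk. A sketch of a matching wiring; the component labels other than "Voice Feature Analysis" are assumptions, and the audio recording argument is source= in Gradio 3.x but sources= in 4.x:

import gradio as gr

demo_sketch = gr.Interface(
    fn=analyze_audio,                          # defined earlier in app.py
    inputs=gr.Audio(type="filepath"),          # add source/sources kwargs per Gradio version
    outputs=[
        gr.Textbox(label="Analysis Summary"),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis"),
    ],
    title="Voice Analysis System with Clinical Interpretation",
)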
             gr.HTML(label="Voice Feature Analysis")
         ],
         title="Voice Analysis System with Clinical Interpretation",
+        description=description,
+        article="""This system uses advanced AI models to analyze voice patterns and provide mental health insights.
+The analysis combines speech recognition, emotion detection, and clinical interpretation to offer
+a comprehensive understanding of psychological indicators present in voice characteristics.
+
+Note: This tool is for informational purposes only and should not be used as a substitute for
+professional medical advice, diagnosis, or treatment.""",
         examples=None,
+        cache_examples=False,
+        theme="default"
     )

+    # Launch the interface with additional configuration
     if __name__ == "__main__":
+        demo.launch(
+            server_name="0.0.0.0",    # Allow external access
+            server_port=7860,         # Default Gradio port
+            share=False,              # Disable public URL generation
+            debug=False               # Disable debug mode in production
+        )

 except Exception as e:
     print(f"Error during application startup: {str(e)}")