invincible-jha committed on
Commit
b3d1df8
·
verified ·
1 Parent(s): f420a80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -66
app.py CHANGED
@@ -1,4 +1,7 @@
1
- # Part 1: Essential Imports and Setup
 
 
 
2
  import gradio as gr
3
  import torch
4
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -10,6 +13,10 @@ import warnings
10
  import os
11
  from scipy.stats import kurtosis, skew
12
  from anthropic import Anthropic
 
 
 
 
13
 
14
  # Suppress warnings for cleaner output
15
  warnings.filterwarnings('ignore')
@@ -21,9 +28,15 @@ emotion_tokenizer = None
21
  emotion_model = None
22
  clinical_analyzer = None
23
 
24
- # Part 2: Model Loading and Initialization
25
  def load_models():
26
- """Load and initialize speech and emotion analysis models."""
 
 
 
 
 
 
 
27
  global processor, whisper_model, emotion_tokenizer, emotion_model
28
 
29
  try:
@@ -48,9 +61,16 @@ def load_models():
48
  print(f"Error loading models: {str(e)}")
49
  return False
50
 
51
- # Part 3: Voice Feature Extraction
52
  def extract_prosodic_features(waveform, sr):
53
- """Extract voice features including pitch, energy, and rhythm patterns."""
 
 
 
 
 
 
 
 
54
  try:
55
  # Input validation
56
  if waveform is None or len(waveform) == 0:
@@ -91,7 +111,7 @@ def extract_prosodic_features(waveform, sr):
91
  print(f"Pitch extraction error: {e}")
92
  features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})
93
 
94
- # Energy analysis
95
  try:
96
  rms = librosa.feature.rms(
97
  y=waveform,
@@ -109,7 +129,7 @@ def extract_prosodic_features(waveform, sr):
109
  print(f"Energy extraction error: {e}")
110
  features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})
111
 
112
- # Rhythm analysis
113
  try:
114
  onset_env = librosa.onset.onset_strength(
115
  y=waveform,
@@ -125,6 +145,7 @@ def extract_prosodic_features(waveform, sr):
125
  aggregate=None
126
  )[0]
127
 
 
128
  features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0
129
 
130
  except Exception as e:
@@ -136,7 +157,6 @@ def extract_prosodic_features(waveform, sr):
136
  print(f"Feature extraction failed: {e}")
137
  return None
138
 
139
- # Part 4: Clinical Analysis Integration
140
  class ClinicalVoiceAnalyzer:
141
  """Analyze voice characteristics for psychological indicators."""
142
 
@@ -144,15 +164,25 @@ class ClinicalVoiceAnalyzer:
144
  """Initialize the clinical analyzer with API and reference ranges."""
145
  self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
146
  self.model = "claude-3-opus-20240229"
 
147
  self.reference_ranges = {
148
- 'pitch': {'min': 150, 'max': 400},
149
- 'tempo': {'min': 90, 'max': 130},
150
  'energy': {'min': 0.01, 'max': 0.05}
151
  }
152
  print("Clinical analyzer ready")
153
 
154
  def analyze_voice_metrics(self, features, emotions, transcription):
155
- """Generate clinical insights from voice and emotion data."""
 
 
 
 
 
 
 
 
 
156
  try:
157
  prompt = self._create_clinical_prompt(features, emotions, transcription)
158
  response = self.anthropic.messages.create(
@@ -208,9 +238,15 @@ Basic Voice Analysis (API Unavailable):
208
  - Voice Energy Level: {features['energy_mean']:.4f}
209
  - Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""
210
 
211
- # Part 5: Visualization Functions
212
  def create_feature_plots(features):
213
- """Create interactive visualizations of voice features."""
 
 
 
 
 
 
 
214
  try:
215
  fig = go.Figure()
216
 
@@ -266,7 +302,14 @@ def create_feature_plots(features):
266
  return None
267
 
268
  def create_emotion_plot(emotions):
269
- """Create visualization of emotional analysis."""
 
 
 
 
 
 
 
270
  try:
271
  fig = go.Figure(data=[
272
  go.Bar(
@@ -291,9 +334,18 @@ def create_emotion_plot(emotions):
291
  print(f"Emotion plot error: {e}")
292
  return None
293
 
294
- # Part 6: Main Analysis Function
295
  def analyze_audio(audio_input):
296
- """Process audio input and generate comprehensive analysis."""
 
 
 
 
 
 
 
 
 
 
297
  try:
298
  # Validate input
299
  if audio_input is None:
@@ -365,6 +417,19 @@ Voice Characteristics:
365
  - Speech Rate (Tempo): {features['tempo']:.2f} BPM
366
  - Voice Energy: {features['energy_mean']:.4f}
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
369
  Emotion Confidence: {max(emotion_scores.values()):.2%}
370
 
@@ -379,7 +444,7 @@ Recording Duration: {duration:.2f} seconds
379
  print(error_msg)
380
  return error_msg, None, None
381
 
382
- # Part 7: Application Initialization
383
  try:
384
  print("===== Application Startup =====")
385
 
@@ -391,6 +456,33 @@ try:
391
  clinical_analyzer = ClinicalVoiceAnalyzer()
392
  print("Clinical analyzer initialized")
393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  # Create Gradio interface
395
  demo = gr.Interface(
396
  fn=analyze_audio,
@@ -405,59 +497,26 @@ try:
405
  gr.HTML(label="Voice Feature Analysis")
406
  ],
407
  title="Voice Analysis System with Clinical Interpretation",
408
- description="""
409
- This application provides comprehensive voice analysis with clinical insights:
410
-
411
- 1. Voice Features:
412
- - Pitch analysis (fundamental frequency and variation)
413
- - Energy patterns (volume and intensity)
414
- - Speech rate (words per minute)
415
- - Voice quality metrics
416
-
417
- 2. Clinical Analysis:
418
- - Mental health indicators
419
- - Emotional state evaluation
420
- - Risk assessment
421
- - Clinical recommendations
422
-
423
- 3. Emotional Content:
424
- - Emotion detection (6 basic emotions)
425
- - Emotional intensity analysis
426
-
427
- For optimal description="""
428
- This application provides comprehensive voice analysis with clinical insights:
429
-
430
- 1. Voice Features:
431
- - Pitch analysis (fundamental frequency and variation)
432
- - Energy patterns (volume and intensity)
433
- - Speech rate (words per minute)
434
- - Voice quality metrics
435
-
436
- 2. Clinical Analysis:
437
- - Mental health indicators
438
- - Emotional state evaluation
439
- - Risk assessment
440
- - Clinical recommendations
441
-
442
- 3. Emotional Content:
443
- - Emotion detection (6 basic emotions)
444
- - Emotional intensity analysis
445
-
446
- For optimal results:
447
- - Record in a quiet environment
448
- - Speak clearly and naturally
449
- - Keep recordings between 1-5 seconds
450
- - Maintain consistent volume
451
-
452
- Upload an audio file or record directly through your microphone.
453
- """,
454
  examples=None,
455
- cache_examples=False
 
456
  )
457
 
458
- # Launch the interface
459
  if __name__ == "__main__":
460
- demo.launch()
 
 
 
 
 
461
 
462
  except Exception as e:
463
  print(f"Error during application startup: {str(e)}")
 
1
+ # app.py - Voice Analysis System with Clinical Interpretation
2
+ # This application provides comprehensive voice analysis with mental health insights
3
+ # using voice biomarkers, emotion detection, and clinical interpretation.
4
+
5
  import gradio as gr
6
  import torch
7
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
13
  import os
14
  from scipy.stats import kurtosis, skew
15
  from anthropic import Anthropic
16
+ from dotenv import load_dotenv
17
+
18
+ # Load environment variables for API keys
19
+ load_dotenv()
20
 
21
  # Suppress warnings for cleaner output
22
  warnings.filterwarnings('ignore')
 
28
  emotion_model = None
29
  clinical_analyzer = None
30
 
 
31
  def load_models():
32
+ """Load and initialize speech recognition and emotion analysis models.
33
+
34
+ This function handles the initialization of both Whisper (for speech recognition)
35
+ and the emotion detection model, setting them up for CPU-based inference.
36
+
37
+ Returns:
38
+ bool: True if all models loaded successfully, False otherwise
39
+ """
40
  global processor, whisper_model, emotion_tokenizer, emotion_model
41
 
42
  try:
 
61
  print(f"Error loading models: {str(e)}")
62
  return False
63
 
 
64
  def extract_prosodic_features(waveform, sr):
65
+ """Extract voice features including pitch, energy, and rhythm patterns.
66
+
67
+ Args:
68
+ waveform (numpy.ndarray): Audio signal data
69
+ sr (int): Sampling rate of the audio
70
+
71
+ Returns:
72
+ dict: Dictionary containing extracted features or None if extraction fails
73
+ """
74
  try:
75
  # Input validation
76
  if waveform is None or len(waveform) == 0:
 
111
  print(f"Pitch extraction error: {e}")
112
  features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})
113
 
114
+ # Energy analysis with noise handling
115
  try:
116
  rms = librosa.feature.rms(
117
  y=waveform,
 
129
  print(f"Energy extraction error: {e}")
130
  features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})
131
 
132
+ # Rhythm analysis with tempo validation
133
  try:
134
  onset_env = librosa.onset.onset_strength(
135
  y=waveform,
 
145
  aggregate=None
146
  )[0]
147
 
148
+ # Validate tempo within normal speech range (40-240 BPM)
149
  features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0
150
 
151
  except Exception as e:
 
157
  print(f"Feature extraction failed: {e}")
158
  return None
159
 
 
160
  class ClinicalVoiceAnalyzer:
161
  """Analyze voice characteristics for psychological indicators."""
162
 
 
164
  """Initialize the clinical analyzer with API and reference ranges."""
165
  self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
166
  self.model = "claude-3-opus-20240229"
167
+ # Define normal ranges for voice metrics based on clinical research
168
  self.reference_ranges = {
169
+ 'pitch': {'min': 150, 'max': 400}, # Hz
170
+ 'tempo': {'min': 90, 'max': 130}, # BPM
171
  'energy': {'min': 0.01, 'max': 0.05}
172
  }
173
  print("Clinical analyzer ready")
174
 
175
  def analyze_voice_metrics(self, features, emotions, transcription):
176
+ """Generate clinical insights from voice and emotion data.
177
+
178
+ Args:
179
+ features (dict): Extracted voice features
180
+ emotions (dict): Detected emotion scores
181
+ transcription (str): Speech content
182
+
183
+ Returns:
184
+ str: Formatted clinical analysis or backup analysis if API fails
185
+ """
186
  try:
187
  prompt = self._create_clinical_prompt(features, emotions, transcription)
188
  response = self.anthropic.messages.create(
 
238
  - Voice Energy Level: {features['energy_mean']:.4f}
239
  - Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""
240
 
 
241
  def create_feature_plots(features):
242
+ """Create interactive visualizations of voice features.
243
+
244
+ Args:
245
+ features (dict): Dictionary of extracted voice features
246
+
247
+ Returns:
248
+ str: HTML representation of the interactive plots
249
+ """
250
  try:
251
  fig = go.Figure()
252
 
 
302
  return None
303
 
304
  def create_emotion_plot(emotions):
305
+ """Create visualization of emotional analysis.
306
+
307
+ Args:
308
+ emotions (dict): Dictionary of emotion scores
309
+
310
+ Returns:
311
+ str: HTML representation of the emotion plot
312
+ """
313
  try:
314
  fig = go.Figure(data=[
315
  go.Bar(
 
334
  print(f"Emotion plot error: {e}")
335
  return None
336
 
 
337
  def analyze_audio(audio_input):
338
+ """Process audio input and generate comprehensive analysis.
339
+
340
+ This is the main function that coordinates the entire analysis pipeline,
341
+ including feature extraction, emotion detection, and clinical interpretation.
342
+
343
+ Args:
344
+ audio_input: Audio file path or tuple containing audio data
345
+
346
+ Returns:
347
+ tuple: (analysis_summary, emotion_visualization, feature_visualization)
348
+ """
349
  try:
350
  # Validate input
351
  if audio_input is None:
 
417
  - Speech Rate (Tempo): {features['tempo']:.2f} BPM
418
  - Voice Energy: {features['energy_mean']:.4f}
419
 
420
+ Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
421
+ Emotion# Continue from previous summary string
422
+ summary = f"""Voice Analysis Summary:
423
+
424
+ Speech Content:
425
+ {transcription}
426
+
427
+ Voice Characteristics:
428
+ - Average Pitch: {features['pitch_mean']:.2f} Hz
429
+ - Pitch Variation: {features['pitch_std']:.2f} Hz
430
+ - Speech Rate (Tempo): {features['tempo']:.2f} BPM
431
+ - Voice Energy: {features['energy_mean']:.4f}
432
+
433
  Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
434
  Emotion Confidence: {max(emotion_scores.values()):.2%}
435
 
 
444
  print(error_msg)
445
  return error_msg, None, None
446
 
447
+ # Application initialization and Gradio interface setup
448
  try:
449
  print("===== Application Startup =====")
450
 
 
456
  clinical_analyzer = ClinicalVoiceAnalyzer()
457
  print("Clinical analyzer initialized")
458
 
459
+ # Define the interface description
460
+ description = """This application provides comprehensive voice analysis with clinical insights:
461
+
462
+ 1. Voice Features:
463
+ - Pitch analysis (fundamental frequency and variation)
464
+ - Energy patterns (volume and intensity)
465
+ - Speech rate (words per minute)
466
+ - Voice quality metrics
467
+
468
+ 2. Clinical Analysis:
469
+ - Mental health indicators
470
+ - Emotional state evaluation
471
+ - Risk assessment
472
+ - Clinical recommendations
473
+
474
+ 3. Emotional Content:
475
+ - Emotion detection (6 basic emotions)
476
+ - Emotional intensity analysis
477
+
478
+ For optimal results:
479
+ - Record in a quiet environment
480
+ - Speak clearly and naturally
481
+ - Keep recordings between 1-5 seconds
482
+ - Maintain consistent volume
483
+
484
+ Upload an audio file or record directly through your microphone."""
485
+
486
  # Create Gradio interface
487
  demo = gr.Interface(
488
  fn=analyze_audio,
 
497
  gr.HTML(label="Voice Feature Analysis")
498
  ],
499
  title="Voice Analysis System with Clinical Interpretation",
500
+ description=description,
501
+ article="""This system uses advanced AI models to analyze voice patterns and provide mental health insights.
502
+ The analysis combines speech recognition, emotion detection, and clinical interpretation to offer
503
+ a comprehensive understanding of psychological indicators present in voice characteristics.
504
+
505
+ Note: This tool is for informational purposes only and should not be used as a substitute for
506
+ professional medical advice, diagnosis, or treatment.""",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  examples=None,
508
+ cache_examples=False,
509
+ theme="default"
510
  )
511
 
512
+ # Launch the interface with additional configuration
513
  if __name__ == "__main__":
514
+ demo.launch(
515
+ server_name="0.0.0.0", # Allow external access
516
+ server_port=7860, # Default Gradio port
517
+ share=False, # Disable public URL generation
518
+ debug=False # Disable debug mode in production
519
+ )
520
 
521
  except Exception as e:
522
  print(f"Error during application startup: {str(e)}")