Spaces:

invincible-jha
/

MentalHealthVocalBiomarkers

Sleeping

App Files Files Community

invincible-jha commited on Nov 27, 2024

Commit

f420a80

verified ·

1 Parent(s): beebdff

Update app.py

Browse files

Files changed (1) hide show

app.py +333 -72

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Import necessary libraries for the voice analysis system
 import gradio as gr
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -10,58 +10,138 @@ import warnings
 import os
 from scipy.stats import kurtosis, skew
 from anthropic import Anthropic
-from typing import Dict, Optional, Tuple
-# Suppress unnecessary warnings for cleaner output
 warnings.filterwarnings('ignore')
-# Initialize global variables for model storage
 processor = None
 whisper_model = None
 emotion_tokenizer = None
 emotion_model = None
 clinical_analyzer = None
 def load_models():
-    """Initialize and load all required machine learning models.
-    This function handles the loading of both the Whisper speech recognition model
-    and the emotion detection model. It includes proper error handling and
-    device management for optimal performance.
-    Returns:
-        bool: True if all models loaded successfully, False otherwise
-    """
     global processor, whisper_model, emotion_tokenizer, emotion_model
     try:
-        # Load the Whisper model for speech recognition
         print("Loading Whisper model...")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
         whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-        # Load the emotion detection model
         print("Loading emotion model...")
         emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
         emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
-        # Move models to CPU for consistent performance
         device = "cpu"
         whisper_model.to(device)
         emotion_model.to(device)
         print("Models loaded successfully!")
         return True
     except Exception as e:
         print(f"Error loading models: {str(e)}")
         return False
 class ClinicalVoiceAnalyzer:
-    """Clinical voice analysis system using Anthropic's Claude for interpretation."""
     def __init__(self):
-        """Initialize the clinical analyzer with reference ranges and API client."""
         self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
         self.model = "claude-3-opus-20240229"
         self.reference_ranges = {
@@ -69,86 +149,243 @@ class ClinicalVoiceAnalyzer:
             'tempo': {'min': 90, 'max': 130},
             'energy': {'min': 0.01, 'max': 0.05}
         }
-        print("Clinical analyzer initialized successfully")
-    def analyze_voice_metrics(self, features: Dict, emotions: Dict, transcription: str) -> str:
-        """Perform comprehensive clinical analysis of voice characteristics."""
         try:
-            prompt = self._create_analysis_prompt(features, emotions, transcription)
             response = self.anthropic.messages.create(
                 model=self.model,
                 max_tokens=1000,
-                messages=[{
-                    "role": "user",
-                    "content": prompt
-                }]
             )
-            return self._format_clinical_response(response.content)
         except Exception as e:
-            print(f"Error in clinical analysis: {str(e)}")
-            return self._generate_fallback_analysis(features, emotions)
-    def _create_analysis_prompt(self, features: Dict, emotions: Dict, transcription: str) -> str:
-        """Create a detailed prompt for clinical analysis."""
-        return f"""As a clinical voice analysis expert specializing in mental health assessment,
-        provide a detailed psychological evaluation based on the following data:
-Voice Characteristics:
-- Pitch: {features['pitch_mean']:.2f} Hz (Normal range: {self.reference_ranges['pitch']['min']}-{self.reference_ranges['pitch']['max']} Hz)
 - Pitch Variation: {features['pitch_std']:.2f} Hz
-- Speech Rate: {features['tempo']:.2f} BPM (Normal range: {self.reference_ranges['tempo']['min']}-{self.reference_ranges['tempo']['max']} BPM)
-- Voice Energy: {features['energy_mean']:.4f} (Normal range: {self.reference_ranges['energy']['min']}-{self.reference_ranges['energy']['max']})
-Emotional Analysis:
 {', '.join(f'{emotion}: {score:.1%}' for emotion, score in emotions.items())}
 Speech Content:
 "{transcription}"
-Please provide a comprehensive clinical assessment including:
-1. Analysis of voice characteristics and their psychological significance
-2. Emotional state evaluation and potential underlying patterns
-3. Assessment of anxiety and depression indicators
 4. Stress level evaluation
-5. Clinical recommendations and considerations
-Present the analysis in clear sections with specific observations and clinical insights."""
-    def _format_clinical_response(self, analysis: str) -> str:
-        """Format the clinical analysis for clear presentation."""
-        return f"""
-Clinical Analysis:
-{analysis}
-"""
-    def _generate_fallback_analysis(self, features: Dict, emotions: Dict) -> str:
         """Generate basic analysis when API is unavailable."""
-        dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0]
-        pitch_status = "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max'] else \
-                      "reduced" if features['pitch_mean'] < self.reference_ranges['pitch']['min'] else "normal"
         return f"""
-Basic Clinical Analysis (API Unavailable):
-Voice Pattern Analysis:
-- Pitch is {pitch_status} ({features['pitch_mean']:.2f} Hz)
-- Speech rate shows {features['tempo']:.2f} BPM
-- Voice energy indicates {features['energy_mean']:.4f} level
-Emotional Indication:
-- Primary emotional tone: {dominant_emotion}
-- Confidence: {max(emotions.values()):.1%}
-Note: This is a basic analysis. For detailed clinical interpretation, please ensure API connectivity.
-"""
-[Rest of your existing code for extract_prosodic_features, create_feature_plots, create_emotion_plot, and analyze_audio functions...]
-# Initialize the application with clinical analysis capability
 try:
     print("===== Application Startup =====")
     if not load_models():
-        raise RuntimeError("Failed to load required models")
     # Initialize clinical analyzer
     clinical_analyzer = ClinicalVoiceAnalyzer()
@@ -167,9 +404,28 @@ try:
             gr.HTML(label="Emotion Analysis"),
             gr.HTML(label="Voice Feature Analysis")
         ],
-        title="Advanced Voice Analysis System with Clinical Interpretation",
         description="""
-        This application provides comprehensive voice analysis with clinical interpretation:
         1. Voice Features:
            - Pitch analysis (fundamental frequency and variation)
@@ -192,9 +448,14 @@ try:
         - Speak clearly and naturally
         - Keep recordings between 1-5 seconds
         - Maintain consistent volume
-        """
     )
     if __name__ == "__main__":
         demo.launch()

+# Part 1: Essential Imports and Setup
 import gradio as gr
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import os
 from scipy.stats import kurtosis, skew
 from anthropic import Anthropic
+# Suppress warnings for cleaner output
 warnings.filterwarnings('ignore')
+# Initialize global model variables
 processor = None
 whisper_model = None
 emotion_tokenizer = None
 emotion_model = None
 clinical_analyzer = None
+# Part 2: Model Loading and Initialization
 def load_models():
+    """Load and initialize speech and emotion analysis models."""
     global processor, whisper_model, emotion_tokenizer, emotion_model
     try:
+        # Initialize speech recognition (Whisper) model
         print("Loading Whisper model...")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
         whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+        # Initialize emotion detection model
         print("Loading emotion model...")
         emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
         emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
+        # Set models to CPU for consistent performance
         device = "cpu"
         whisper_model.to(device)
         emotion_model.to(device)
         print("Models loaded successfully!")
         return True
     except Exception as e:
         print(f"Error loading models: {str(e)}")
         return False
+# Part 3: Voice Feature Extraction
+def extract_prosodic_features(waveform, sr):
+    """Extract voice features including pitch, energy, and rhythm patterns."""
+    try:
+        # Input validation
+        if waveform is None or len(waveform) == 0:
+            return None
+        features = {}
+        # Pitch analysis with enhanced accuracy
+        try:
+            pitches, magnitudes = librosa.piptrack(
+                y=waveform,
+                sr=sr,
+                fmin=50,      # Minimum human voice frequency
+                fmax=2000,    # Maximum human voice frequency
+                n_mels=128,   # Frequency resolution
+                hop_length=512,
+                win_length=2048
+            )
+            # Extract valid pitch contour
+            f0_contour = [
+                pitches[magnitudes[:, t].argmax(), t]
+                for t in range(pitches.shape[1])
+                if 50 <= pitches[magnitudes[:, t].argmax(), t] <= 2000
+            ]
+            # Calculate pitch statistics
+            if f0_contour:
+                features['pitch_mean'] = float(np.mean(f0_contour))
+                features['pitch_std'] = float(np.std(f0_contour))
+                features['pitch_range'] = float(np.ptp(f0_contour))
+            else:
+                features['pitch_mean'] = 160.0  # Default adult pitch
+                features['pitch_std'] = 0.0
+                features['pitch_range'] = 0.0
+        except Exception as e:
+            print(f"Pitch extraction error: {e}")
+            features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})
+        # Energy analysis
+        try:
+            rms = librosa.feature.rms(
+                y=waveform,
+                frame_length=2048,
+                hop_length=512,
+                center=True
+            )[0]
+            features.update({
+                'energy_mean': float(np.mean(rms)),
+                'energy_std': float(np.std(rms)),
+                'energy_range': float(np.ptp(rms))
+            })
+        except Exception as e:
+            print(f"Energy extraction error: {e}")
+            features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})
+        # Rhythm analysis
+        try:
+            onset_env = librosa.onset.onset_strength(
+                y=waveform,
+                sr=sr,
+                hop_length=512,
+                aggregate=np.median
+            )
+            tempo = librosa.beat.tempo(
+                onset_envelope=onset_env,
+                sr=sr,
+                hop_length=512,
+                aggregate=None
+            )[0]
+            features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0
+        except Exception as e:
+            print(f"Rhythm extraction error: {e}")
+            features['tempo'] = 120.0
+        return features
+    except Exception as e:
+        print(f"Feature extraction failed: {e}")
+        return None
+# Part 4: Clinical Analysis Integration
 class ClinicalVoiceAnalyzer:
+    """Analyze voice characteristics for psychological indicators."""
     def __init__(self):
+        """Initialize the clinical analyzer with API and reference ranges."""
         self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
         self.model = "claude-3-opus-20240229"
         self.reference_ranges = {
             'tempo': {'min': 90, 'max': 130},
             'energy': {'min': 0.01, 'max': 0.05}
         }
+        print("Clinical analyzer ready")
+    def analyze_voice_metrics(self, features, emotions, transcription):
+        """Generate clinical insights from voice and emotion data."""
         try:
+            prompt = self._create_clinical_prompt(features, emotions, transcription)
             response = self.anthropic.messages.create(
                 model=self.model,
                 max_tokens=1000,
+                messages=[{"role": "user", "content": prompt}]
             )
+            return self._format_analysis(response.content)
         except Exception as e:
+            print(f"Clinical analysis error: {e}")
+            return self._generate_backup_analysis(features, emotions)
+    def _create_clinical_prompt(self, features, emotions, transcription):
+        """Create detailed prompt for clinical analysis."""
+        return f"""As a clinical voice analysis expert, provide a psychological assessment of:
+Voice Metrics:
+- Pitch: {features['pitch_mean']:.2f} Hz (Normal: {self.reference_ranges['pitch']['min']}-{self.reference_ranges['pitch']['max']} Hz)
 - Pitch Variation: {features['pitch_std']:.2f} Hz
+- Speech Rate: {features['tempo']:.2f} BPM (Normal: {self.reference_ranges['tempo']['min']}-{self.reference_ranges['tempo']['max']} BPM)
+- Voice Energy: {features['energy_mean']:.4f}
+Emotions Detected:
 {', '.join(f'{emotion}: {score:.1%}' for emotion, score in emotions.items())}
 Speech Content:
 "{transcription}"
+Provide:
+1. Voice characteristic analysis
+2. Emotional state assessment
+3. Anxiety/depression indicators
 4. Stress level evaluation
+5. Clinical recommendations"""
+    def _format_analysis(self, analysis):
+        """Format the clinical analysis output."""
+        return f"\nClinical Assessment:\n{analysis}"
+    def _generate_backup_analysis(self, features, emotions):
         """Generate basic analysis when API is unavailable."""
+        dominant_emotion = max(emotions.items(), key=lambda x: x[1])
+        pitch_status = (
+            "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max']
+            else "reduced" if features['pitch_mean'] < self.reference_ranges['pitch']['min']
+            else "normal"
+        )
         return f"""
+Basic Voice Analysis (API Unavailable):
+- Pitch Status: {pitch_status} ({features['pitch_mean']:.2f} Hz)
+- Speech Rate: {features['tempo']:.2f} BPM
+- Voice Energy Level: {features['energy_mean']:.4f}
+- Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""
+# Part 5: Visualization Functions
+def create_feature_plots(features):
+    """Create interactive visualizations of voice features."""
+    try:
+        fig = go.Figure()
+        # Pitch visualization
+        pitch_data = {
+            'Mean': features['pitch_mean'],
+            'Std Dev': features['pitch_std'],
+            'Range': features['pitch_range']
+        }
+        fig.add_trace(go.Bar(
+            name='Pitch Features (Hz)',
+            x=list(pitch_data.keys()),
+            y=list(pitch_data.values()),
+            marker_color='blue'
+        ))
+        # Energy visualization
+        energy_data = {
+            'Mean': features['energy_mean'],
+            'Std Dev': features['energy_std'],
+            'Range': features['energy_range']
+        }
+        fig.add_trace(go.Bar(
+            name='Energy Features',
+            x=[f"Energy {k}" for k in energy_data.keys()],
+            y=list(energy_data.values()),
+            marker_color='red'
+        ))
+        # Tempo visualization
+        fig.add_trace(go.Scatter(
+            name='Speech Rate (BPM)',
+            x=['Tempo'],
+            y=[features['tempo']],
+            mode='markers',
+            marker=dict(size=15, color='green')
+        ))
+        # Layout configuration
+        fig.update_layout(
+            title='Voice Feature Analysis',
+            showlegend=True,
+            height=600,
+            barmode='group',
+            xaxis_title='Feature Type',
+            yaxis_title='Value',
+            template='plotly_white'
+        )
+        return fig.to_html(include_plotlyjs=True)
+    except Exception as e:
+        print(f"Plot creation error: {e}")
+        return None
+def create_emotion_plot(emotions):
+    """Create visualization of emotional analysis."""
+    try:
+        fig = go.Figure(data=[
+            go.Bar(
+                x=list(emotions.keys()),
+                y=list(emotions.values()),
+                marker_color=['#FF9999', '#66B2FF', '#99FF99',
+                            '#FFCC99', '#FF99CC', '#99FFFF']
+            )
+        ])
+        fig.update_layout(
+            title='Emotion Analysis',
+            xaxis_title='Emotion',
+            yaxis_title='Confidence Score',
+            yaxis_range=[0, 1],
+            template='plotly_white',
+            height=400
+        )
+        return fig.to_html(include_plotlyjs=True)
+    except Exception as e:
+        print(f"Emotion plot error: {e}")
+        return None
+# Part 6: Main Analysis Function
+def analyze_audio(audio_input):
+    """Process audio input and generate comprehensive analysis."""
+    try:
+        # Validate input
+        if audio_input is None:
+            return "Please provide an audio input", None, None
+        # Load audio
+        audio_path = audio_input[0] if isinstance(audio_input, tuple) else audio_input
+        waveform, sr = librosa.load(audio_path, sr=16000, duration=30)
+        # Validate duration
+        duration = len(waveform) / sr
+        if duration < 0.5:
+            return "Audio too short (minimum 0.5 seconds needed)", None, None
+        # Extract features
+        features = extract_prosodic_features(waveform, sr)
+        if features is None:
+            return "Feature extraction failed", None, None
+        # Generate visualizations
+        feature_viz = create_feature_plots(features)
+        # Perform speech recognition
+        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
+        with torch.no_grad():
+            predicted_ids = whisper_model.generate(inputs)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        # Analyze emotions
+        emotion_inputs = emotion_tokenizer(
+            transcription,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512
+        )
+        with torch.no_grad():
+            emotion_outputs = emotion_model(**emotion_inputs)
+        emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)
+        # Process emotion scores
+        emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
+        emotion_scores = {
+            label: float(score)
+            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
+        }
+        emotion_viz = create_emotion_plot(emotion_scores)
+        # Generate clinical analysis
+        global clinical_analyzer
+        if clinical_analyzer is None:
+            clinical_analyzer = ClinicalVoiceAnalyzer()
+        clinical_analysis = clinical_analyzer.analyze_voice_metrics(
+            features, emotion_scores, transcription
+        )
+        # Create comprehensive summary
+        summary = f"""Voice Analysis Summary:
+Speech Content:
+{transcription}
+Voice Characteristics:
+- Average Pitch: {features['pitch_mean']:.2f} Hz
+- Pitch Variation: {features['pitch_std']:.2f} Hz
+- Speech Rate (Tempo): {features['tempo']:.2f} BPM
+- Voice Energy: {features['energy_mean']:.4f}
+Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
+Emotion Confidence: {max(emotion_scores.values()):.2%}
+Recording Duration: {duration:.2f} seconds
+{clinical_analysis}"""
+        return summary, emotion_viz, feature_viz
+    except Exception as e:
+        error_msg = f"Analysis failed: {str(e)}"
+        print(error_msg)
+        return error_msg, None, None
+# Part 7: Application Initialization
 try:
     print("===== Application Startup =====")
+    # Load required models
     if not load_models():
+        raise RuntimeError("Model loading failed")
     # Initialize clinical analyzer
     clinical_analyzer = ClinicalVoiceAnalyzer()
             gr.HTML(label="Emotion Analysis"),
             gr.HTML(label="Voice Feature Analysis")
         ],
+        title="Voice Analysis System with Clinical Interpretation",
         description="""
+        This application provides comprehensive voice analysis with clinical insights:
+        1. Voice Features:
+           - Pitch analysis (fundamental frequency and variation)
+           - Energy patterns (volume and intensity)
+           - Speech rate (words per minute)
+           - Voice quality metrics
+        2. Clinical Analysis:
+           - Mental health indicators
+           - Emotional state evaluation
+           - Risk assessment
+           - Clinical recommendations
+        3. Emotional Content:
+           - Emotion detection (6 basic emotions)
+           - Emotional intensity analysis
+        For optimal description="""
+        This application provides comprehensive voice analysis with clinical insights:
         1. Voice Features:
            - Pitch analysis (fundamental frequency and variation)
         - Speak clearly and naturally
         - Keep recordings between 1-5 seconds
         - Maintain consistent volume
+        Upload an audio file or record directly through your microphone.
+        """,
+        examples=None,
+        cache_examples=False
     )
+    # Launch the interface
     if __name__ == "__main__":
         demo.launch()