Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| from transformers import pipeline, AutoModel | |
| import librosa | |
| import numpy as np | |
| import re | |
| import warnings | |
| import os | |
| from huggingface_hub import login | |
| # If you use the token as an environment variable (recommended for Spaces secrets): | |
| HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") | |
| login(token=HUGGINGFACE_TOKEN) | |
| warnings.filterwarnings('ignore') | |
| print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...") | |
| # ============================================ | |
| # 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP) | |
| # ============================================ | |
| SENTIMENT_PIPELINE = None | |
| EMOTION_PIPELINE = None | |
| ASR_MODEL = None | |
| def load_models(): | |
| """Load all models once at startup and cache them globally""" | |
| global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_MODEL | |
| if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None and EMOTION_PIPELINE is not None: | |
| print("✅ Models already loaded, skipping...") | |
| return | |
| print("📚 Loading Hindi sentiment analysis model...") | |
| try: | |
| sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment" | |
| SENTIMENT_PIPELINE = pipeline( | |
| "text-classification", | |
| model=sentiment_model_name, | |
| top_k=None | |
| ) | |
| print("✅ Hindi sentiment model loaded successfully") | |
| except Exception as e: | |
| print(f"❌ Error loading sentiment model: {e}") | |
| raise | |
| print("🎭 Loading Zero-Shot Emotion Classification model...") | |
| try: | |
| EMOTION_PIPELINE = pipeline( | |
| "zero-shot-classification", | |
| model="joeddav/xlm-roberta-large-xnli" | |
| ) | |
| print("✅ Zero-Shot emotion model loaded successfully") | |
| except Exception as e: | |
| print(f"❌ Error loading emotion model: {e}") | |
| raise | |
| print("🎤 Loading Indic Conformer 600M ASR model...") | |
| try: | |
| ASR_MODEL = AutoModel.from_pretrained( | |
| "ai4bharat/indic-conformer-600m-multilingual", | |
| trust_remote_code=True | |
| ) | |
| print("✅ Indic Conformer ASR model loaded successfully") | |
| except Exception as e: | |
| print(f"❌ Error loading ASR model: {e}") | |
| raise | |
| print("✅ All models loaded and cached in memory") | |
| load_models() | |
| # ============================================ | |
| # 2. EMOTION LABELS FOR ZERO-SHOT (OPTIMIZED) | |
| # ============================================ | |
| # Using only English labels - XLM-RoBERTa is multilingual and understands | |
| # Hindi/Devanagari text with English labels. This reduces inference time by ~50% | |
| EMOTION_LABELS = [ | |
| "joy", | |
| "happiness", | |
| "sadness", | |
| "anger", | |
| "fear", | |
| "distress", # Added for better crisis detection | |
| "panic", # Added for emergency situations | |
| "love", | |
| "surprise", | |
| "calm", | |
| "neutral", | |
| "excitement", | |
| "frustration" | |
| ] | |
| # ============================================ | |
| # 3. CACHED RESAMPLER & AUDIO PREPROCESSING | |
| # ============================================ | |
| # Cache resampler to avoid recreating it every time | |
| CACHED_RESAMPLERS = {} | |
| def get_resampler(orig_freq, new_freq): | |
| """Get or create a cached resampler""" | |
| key = (orig_freq, new_freq) | |
| if key not in CACHED_RESAMPLERS: | |
| CACHED_RESAMPLERS[key] = torchaudio.transforms.Resample( | |
| orig_freq=orig_freq, | |
| new_freq=new_freq | |
| ) | |
| return CACHED_RESAMPLERS[key] | |
| def advanced_preprocess_audio(audio_path, target_sr=16000): | |
| """Advanced audio preprocessing pipeline""" | |
| try: | |
| wav, sr = torchaudio.load(audio_path) | |
| if wav.shape[0] > 1: | |
| wav = torch.mean(wav, dim=0, keepdim=True) | |
| print(f"📊 Converted stereo to mono") | |
| if sr != target_sr: | |
| resampler = get_resampler(sr, target_sr) | |
| wav = resampler(wav) | |
| print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz") | |
| audio_np = wav.squeeze().numpy() | |
| audio_np = audio_np - np.mean(audio_np) | |
| audio_trimmed, _ = librosa.effects.trim( | |
| audio_np, | |
| top_db=25, | |
| frame_length=2048, | |
| hop_length=512 | |
| ) | |
| print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples") | |
| audio_normalized = librosa.util.normalize(audio_trimmed) | |
| pre_emphasis = 0.97 | |
| audio_emphasized = np.append( | |
| audio_normalized[0], | |
| audio_normalized[1:] - pre_emphasis * audio_normalized[:-1] | |
| ) | |
| audio_denoised = spectral_noise_gate(audio_emphasized, target_sr) | |
| audio_compressed = dynamic_range_compression(audio_denoised) | |
| audio_final = librosa.util.normalize(audio_compressed) | |
| audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0) | |
| print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio") | |
| return audio_tensor, target_sr, audio_final | |
| except Exception as e: | |
| print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing") | |
| return basic_preprocess_audio(audio_path, target_sr) | |
| def basic_preprocess_audio(audio_path, target_sr=16000): | |
| """Fallback basic preprocessing""" | |
| try: | |
| wav, sr = torchaudio.load(audio_path) | |
| if wav.shape[0] > 1: | |
| wav = torch.mean(wav, dim=0, keepdim=True) | |
| if sr != target_sr: | |
| resampler = get_resampler(sr, target_sr) | |
| wav = resampler(wav) | |
| audio_np = wav.squeeze().numpy() | |
| return wav, target_sr, audio_np | |
| except Exception as e: | |
| print(f"❌ Basic preprocessing also failed: {e}") | |
| raise | |
| def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6): | |
| """Advanced spectral noise gating using STFT""" | |
| try: | |
| stft = librosa.stft(audio, n_fft=2048, hop_length=512) | |
| magnitude = np.abs(stft) | |
| phase = np.angle(stft) | |
| noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True) | |
| snr = magnitude / (noise_profile + 1e-10) | |
| gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0)) | |
| magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor)) | |
| stft_clean = magnitude_gated * np.exp(1j * phase) | |
| audio_clean = librosa.istft(stft_clean, hop_length=512) | |
| return audio_clean | |
| except Exception as e: | |
| print(f"⚠️ Spectral gating failed: {e}") | |
| return audio | |
| def dynamic_range_compression(audio, threshold=0.5, ratio=3.0): | |
| """Simple dynamic range compression""" | |
| try: | |
| abs_audio = np.abs(audio) | |
| above_threshold = abs_audio > threshold | |
| compressed = audio.copy() | |
| compressed[above_threshold] = np.sign(audio[above_threshold]) * ( | |
| threshold + (abs_audio[above_threshold] - threshold) / ratio | |
| ) | |
| return compressed | |
| except Exception as e: | |
| print(f"⚠️ Compression failed: {e}") | |
| return audio | |
| # ============================================ | |
| # 4. OPTIMIZED PROSODIC FEATURE EXTRACTION (BATCH) | |
| # ============================================ | |
| def extract_prosodic_features(audio, sr): | |
| """Extract prosodic features with batch processing - OPTIMIZED""" | |
| try: | |
| features = {} | |
| # Use PYIN for faster and more accurate pitch estimation | |
| # This is 3-5x faster than piptrack | |
| f0, voiced_flag, voiced_probs = librosa.pyin( | |
| audio, | |
| fmin=80, | |
| fmax=400, | |
| sr=sr, | |
| frame_length=2048 | |
| ) | |
| # Filter valid pitch values | |
| pitch_values = f0[~np.isnan(f0)] | |
| if len(pitch_values) > 0: | |
| features['pitch_mean'] = np.mean(pitch_values) | |
| features['pitch_std'] = np.std(pitch_values) | |
| features['pitch_range'] = np.max(pitch_values) - np.min(pitch_values) | |
| else: | |
| features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0 | |
| # Batch extract temporal features in one pass | |
| # This reduces redundant STFT computations | |
| hop_length = 512 | |
| frame_length = 2048 | |
| # RMS energy | |
| rms = librosa.feature.rms(y=audio, frame_length=frame_length, hop_length=hop_length)[0] | |
| features['energy_mean'] = np.mean(rms) | |
| features['energy_std'] = np.std(rms) | |
| # Zero crossing rate (fast, time-domain feature) | |
| zcr = librosa.feature.zero_crossing_rate(audio, frame_length=frame_length, hop_length=hop_length)[0] | |
| features['speech_rate'] = np.mean(zcr) | |
| # Batch extract spectral features (single STFT computation) | |
| S = np.abs(librosa.stft(audio, n_fft=frame_length, hop_length=hop_length)) | |
| # Spectral centroid from pre-computed STFT | |
| spectral_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)[0] | |
| features['spectral_centroid_mean'] = np.mean(spectral_centroid) | |
| # Spectral rolloff from pre-computed STFT | |
| spectral_rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr)[0] | |
| features['spectral_rolloff_mean'] = np.mean(spectral_rolloff) | |
| return features | |
| except Exception as e: | |
| print(f"⚠️ Feature extraction error: {e}") | |
| return { | |
| 'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0, | |
| 'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0, | |
| 'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0 | |
| } | |
| # ============================================ | |
| # 5. TEXT ANALYSIS HELPERS | |
| # ============================================ | |
| def validate_hindi_text(text): | |
| """Validate if text contains Hindi/Devanagari characters""" | |
| hindi_pattern = re.compile(r'[\u0900-\u097F]') | |
| hindi_chars = len(hindi_pattern.findall(text)) | |
| total_chars = len(re.findall(r'\S', text)) | |
| if total_chars == 0: | |
| return False, "Empty transcription", 0 | |
| hindi_ratio = hindi_chars / total_chars | |
| if hindi_ratio < 0.15: | |
| return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio | |
| return True, "Valid Hindi/Hinglish", hindi_ratio | |
| def detect_negation(text): | |
| """Detect negation words""" | |
| negation_words = [ | |
| 'नहीं', 'न', 'मत', 'नही', 'ना', | |
| 'not', 'no', 'never', 'neither', 'nor', | |
| 'कभी नहीं', 'बिल्कुल नहीं' | |
| ] | |
| text_lower = text.lower() | |
| for neg_word in negation_words: | |
| if neg_word in text_lower: | |
| return True | |
| return False | |
| def detect_crisis_keywords(text): | |
| """Detect crisis/emergency keywords - Comprehensive detection""" | |
| crisis_keywords = [ | |
| # Violence & Assault - हिंसा और हमला | |
| 'बचाओ', 'मदद', 'help', 'save', 'rescue', | |
| 'मार', 'मारो', 'पीट', 'पिट', 'हिंसा', 'beat', 'beating', 'hit', 'hitting', 'violence', 'violent', | |
| 'थप्पड़', 'लात', 'घूंसा', 'slap', 'kick', 'punch', | |
| 'हमला', 'attack', 'attacking', 'assault', | |
| 'चाकू', 'बंदूक', 'हथियार', 'knife', 'gun', 'weapon', | |
| # Fear & Danger - डर और खतरा | |
| 'डर', 'डरना', 'भय', 'fear', 'scared', 'afraid', 'terrified', | |
| 'खतरा', 'संकट', 'danger', 'dangerous', 'threat', 'emergency', | |
| 'भागो', 'run', 'escape', | |
| # Death & Severe Harm - मृत्यु और गंभीर नुकसान | |
| 'मर', 'मरना', 'मार डाल', 'मौत', 'death', 'die', 'dying', 'kill', 'murder', | |
| 'खून', 'blood', 'bleeding', | |
| 'जान', 'life', | |
| # Distress Calls - संकट संकेत | |
| 'छोड़', 'छोड़ो', 'जाने दो', 'leave', 'leave me', 'let go', 'stop', 'please stop', | |
| 'नहीं नहीं', 'मत करो', 'no no', "don't", 'stop it', | |
| 'कोई है', 'anyone', 'somebody help', | |
| # Kidnapping & Abduction - अपहरण | |
| 'उठा', 'ले जा', 'kidnap', 'abduct', 'taken', | |
| 'छुड़ा', 'free me', 'release', | |
| # Medical Emergency - चिकित्सा आपातकाल | |
| 'दर्द', 'तकलीफ', 'pain', 'hurt', 'hurting', 'ache', | |
| 'सांस', 'साँस', 'breath', 'breathing', 'suffocate', | |
| 'दिल', 'हृदय', 'heart', 'chest pain', 'heart attack', | |
| 'दौरा', 'बेहोश', 'seizure', 'unconscious', 'faint', | |
| 'खून बह', 'bleeding', 'injury', 'injured', | |
| 'एम्बुलेंस', 'अस्पताल', 'डॉक्टर', 'ambulance', 'hospital', 'doctor', | |
| 'दवा', 'दवाई', 'medicine', 'medication', | |
| # Suicide & Self-Harm - आत्महत्या | |
| 'आत्महत्या', 'suicide', 'kill myself', | |
| 'मर जा', 'जीना नहीं', 'want to die', "don't want to live", | |
| 'ख़त्म', 'समाप्त', 'end it', 'end this', | |
| # Abuse & Harassment - दुर्व्यवहार | |
| 'बलात्कार', 'छेड़', 'rape', 'molest', 'harassment', 'abuse', | |
| 'गलत काम', 'छूना', 'touch', 'inappropriate', | |
| # Accidents - दुर्घटना | |
| 'दुर्घटना', 'accident', 'crash', 'fell', 'fall', | |
| 'आग', 'fire', 'smoke', 'burning', | |
| 'बिजली', 'electric', 'shock', | |
| # Panic & Severe Distress - घबराहट | |
| 'घबरा', 'panic', 'panicking', | |
| 'बचा नहीं', 'फंस', 'trapped', 'stuck', | |
| 'सहारा', 'support', 'need help' | |
| ] | |
| text_lower = text.lower() | |
| for keyword in crisis_keywords: | |
| if keyword in text_lower: | |
| return True | |
| return False | |
| def detect_mental_health_distress(text): | |
| """Detect mental health crisis indicators""" | |
| mental_health_keywords = [ | |
| # Depression - अवसाद | |
| 'अवसाद', 'डिप्रेशन', 'depression', 'depressed', | |
| 'उदास', 'निराश', 'hopeless', 'helpless', | |
| 'कोई फायदा नहीं', 'no point', 'pointless', 'worthless', | |
| # Anxiety - चिंता | |
| 'घबराहट', 'बेचैन', 'anxiety', 'anxious', 'worried sick', | |
| 'चिंता', 'टेंशन', 'stress', 'stressed', | |
| 'परेशान', 'troubled', 'disturbed', | |
| # Isolation - अलगाव | |
| 'अकेला', 'तन्हा', 'lonely', 'alone', 'isolated', | |
| 'कोई नहीं', 'no one', 'nobody cares', | |
| # Despair - निराशा | |
| 'हार', 'give up', 'giving up', | |
| 'कोशिश नहीं', "can't anymore", 'too much', | |
| 'थक', 'tired of', 'exhausted' | |
| ] | |
| text_lower = text.lower() | |
| count = sum(1 for keyword in mental_health_keywords if keyword in text_lower) | |
| return count >= 2 # Require at least 2 indicators for mental health flag | |
| def detect_grief_loss(text): | |
| """Detect grief and loss situations""" | |
| grief_keywords = [ | |
| 'चल बसा', 'गुज़र', 'खो दिया', 'died', 'passed away', 'lost', | |
| 'अंतिम संस्कार', 'funeral', 'cremation', | |
| 'याद', 'miss', 'missing', | |
| 'गम', 'शोक', 'grief', 'mourning', 'sorrow' | |
| ] | |
| text_lower = text.lower() | |
| return any(keyword in text_lower for keyword in grief_keywords) | |
| def detect_relationship_distress(text): | |
| """Detect relationship problems""" | |
| relationship_keywords = [ | |
| 'तलाक', 'अलग', 'divorce', 'separation', 'breakup', 'broke up', | |
| 'धोखा', 'बेवफा', 'cheat', 'cheating', 'betrayal', | |
| 'लड़ाई', 'झगड़ा', 'fight', 'fighting', 'argument', | |
| 'छोड़ दिया', 'left me', 'abandoned' | |
| ] | |
| text_lower = text.lower() | |
| return any(keyword in text_lower for keyword in relationship_keywords) | |
| def detect_mixed_emotions(text, prosodic_features): | |
| """Detect mixed emotions""" | |
| text_lower = text.lower() | |
| if detect_crisis_keywords(text): | |
| return False | |
| mixed_indicators = [ | |
| 'कभी', 'कभी कभी', 'sometimes', | |
| 'लेकिन', 'पर', 'मगर', 'but', 'however', | |
| 'या', 'or', | |
| 'समझ नहीं', 'confus', 'don\'t know', 'पता नहीं', | |
| 'शायद', 'maybe', 'perhaps' | |
| ] | |
| positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice'] | |
| negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset'] | |
| has_mixed_indicators = any(ind in text_lower for ind in mixed_indicators) | |
| has_positive = any(word in text_lower for word in positive_words) | |
| has_negative = any(word in text_lower for word in negative_words) | |
| text_mixed = has_mixed_indicators and (has_positive and has_negative) | |
| return text_mixed | |
| # ============================================ | |
| # 6. ANALYSIS FUNCTIONS (OPTIMIZED - NO THREADPOOL) | |
| # ============================================ | |
| # ThreadPoolExecutor removed: Model inference is CPU/GPU bound, not I/O bound. | |
| # Python's GIL prevents true parallelism with threads for CPU-bound tasks. | |
| # Direct execution is actually faster due to reduced overhead. | |
| def sentiment_analysis(text): | |
| """Run sentiment analysis""" | |
| try: | |
| result = SENTIMENT_PIPELINE(text) | |
| return result | |
| except Exception as e: | |
| print(f"⚠️ Sentiment analysis error: {e}") | |
| return None | |
| def emotion_classification(text): | |
| """Run zero-shot emotion classification""" | |
| try: | |
| # Using only English labels - XLM-RoBERTa understands Hindi with English labels | |
| result = EMOTION_PIPELINE(text, EMOTION_LABELS, multi_label=False) | |
| return result | |
| except Exception as e: | |
| print(f"⚠️ Emotion classification error: {e}") | |
| return None | |
| def parallel_analysis(text): | |
| """Run sentiment and emotion analysis sequentially (faster without thread overhead)""" | |
| print("🔄 Running sentiment and emotion analysis...") | |
| # Sequential execution is faster than threading for CPU/GPU-bound tasks | |
| sentiment_result = sentiment_analysis(text) | |
| emotion_result = emotion_classification(text) | |
| return sentiment_result, emotion_result | |
| # ============================================ | |
| # 7. ENHANCED SENTIMENT ANALYSIS | |
| # ============================================ | |
| def enhanced_sentiment_analysis(text, prosodic_features, raw_results): | |
| """Enhanced sentiment analysis""" | |
| sentiment_scores = {} | |
| if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0: | |
| return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False | |
| label_mapping = { | |
| 'LABEL_0': 'Negative', | |
| 'LABEL_1': 'Neutral', | |
| 'LABEL_2': 'Positive', | |
| 'negative': 'Negative', | |
| 'neutral': 'Neutral', | |
| 'positive': 'Positive' | |
| } | |
| for result in raw_results[0]: | |
| label = result['label'] | |
| score = result['score'] | |
| mapped_label = label_mapping.get(label, 'Neutral') | |
| sentiment_scores[mapped_label] = score | |
| for sentiment in ['Negative', 'Neutral', 'Positive']: | |
| if sentiment not in sentiment_scores: | |
| sentiment_scores[sentiment] = 0.0 | |
| is_crisis = detect_crisis_keywords(text) | |
| if is_crisis: | |
| sentiment_scores['Negative'] = min(0.95, sentiment_scores['Negative'] * 1.8) | |
| sentiment_scores['Neutral'] = max(0.02, sentiment_scores['Neutral'] * 0.2) | |
| sentiment_scores['Positive'] = max(0.01, sentiment_scores['Positive'] * 0.1) | |
| is_mixed = False | |
| else: | |
| has_negation = detect_negation(text) | |
| if has_negation: | |
| temp = sentiment_scores['Positive'] | |
| sentiment_scores['Positive'] = sentiment_scores['Negative'] | |
| sentiment_scores['Negative'] = temp | |
| is_mixed = detect_mixed_emotions(text, prosodic_features) | |
| if is_mixed: | |
| neutral_boost = 0.20 | |
| sentiment_scores['Neutral'] = min(0.65, sentiment_scores['Neutral'] + neutral_boost) | |
| sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2) | |
| sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2) | |
| total = sum(sentiment_scores.values()) | |
| if total > 0: | |
| sentiment_scores = {k: v/total for k, v in sentiment_scores.items()} | |
| final_confidence = max(sentiment_scores.values()) | |
| return sentiment_scores, final_confidence, is_mixed | |
| def process_emotion_results(emotion_result, transcription, prosodic_features=None): | |
| """Process zero-shot emotion classification results with multi-situation awareness""" | |
| if emotion_result is None or isinstance(emotion_result, Exception): | |
| print(f"⚠️ Emotion classification error: {emotion_result}") | |
| return { | |
| "primary": "unknown", | |
| "secondary": None, | |
| "confidence": 0.0, | |
| "top_emotions": [] | |
| } | |
| # Get emotions and scores | |
| labels = emotion_result['labels'] | |
| scores = emotion_result['scores'] | |
| # Create emotion score dictionary for manipulation | |
| emotion_scores = {labels[i]: scores[i] for i in range(len(labels))} | |
| # SITUATION DETECTION | |
| is_crisis = detect_crisis_keywords(transcription) | |
| is_mental_health = detect_mental_health_distress(transcription) | |
| is_grief = detect_grief_loss(transcription) | |
| is_relationship = detect_relationship_distress(transcription) | |
| # CRISIS DETECTION OVERRIDE - Highest priority for emergency situations | |
| if is_crisis: | |
| print("🚨 CRISIS DETECTED - Adjusting emotion predictions") | |
| # Strongly boost fear and related crisis emotions | |
| crisis_emotions = ['fear', 'distress', 'panic', 'anger', 'sadness'] | |
| boost_factor = 4.0 | |
| for emotion in crisis_emotions: | |
| if emotion in emotion_scores: | |
| emotion_scores[emotion] = min(0.95, emotion_scores[emotion] * boost_factor) | |
| # Suppress inappropriate emotions for crisis situations | |
| suppress_emotions = ['surprise', 'excitement', 'happiness', 'joy', 'calm'] | |
| suppress_factor = 0.15 | |
| for emotion in suppress_emotions: | |
| if emotion in emotion_scores: | |
| emotion_scores[emotion] = max(0.01, emotion_scores[emotion] * suppress_factor) | |
| # Renormalize scores | |
| total = sum(emotion_scores.values()) | |
| if total > 0: | |
| emotion_scores = {k: v/total for k, v in emotion_scores.items()} | |
| # MENTAL HEALTH DISTRESS - Boost sadness, fear, reduce positive | |
| elif is_mental_health: | |
| print("🧠 Mental health distress detected - Adjusting predictions") | |
| mental_health_emotions = ['sadness', 'fear', 'frustration', 'neutral'] | |
| boost_factor = 2.0 | |
| for emotion in mental_health_emotions: | |
| if emotion in emotion_scores: | |
| emotion_scores[emotion] = min(0.90, emotion_scores[emotion] * boost_factor) | |
| # Reduce positive emotions | |
| suppress_emotions = ['happiness', 'joy', 'excitement', 'calm'] | |
| for emotion in suppress_emotions: | |
| if emotion in emotion_scores: | |
| emotion_scores[emotion] = max(0.05, emotion_scores[emotion] * 0.3) | |
| total = sum(emotion_scores.values()) | |
| if total > 0: | |
| emotion_scores = {k: v/total for k, v in emotion_scores.items()} | |
| # GRIEF & LOSS - Boost sadness primarily | |
| elif is_grief: | |
| print("💔 Grief/loss detected - Adjusting predictions") | |
| if 'sadness' in emotion_scores: | |
| emotion_scores['sadness'] = min(0.85, emotion_scores['sadness'] * 2.5) | |
| # Moderate boost for related emotions | |
| if 'neutral' in emotion_scores: | |
| emotion_scores['neutral'] = min(0.40, emotion_scores['neutral'] * 1.3) | |
| # Suppress joy/excitement | |
| suppress_emotions = ['happiness', 'joy', 'excitement'] | |
| for emotion in suppress_emotions: | |
| if emotion in emotion_scores: | |
| emotion_scores[emotion] = max(0.02, emotion_scores[emotion] * 0.2) | |
| total = sum(emotion_scores.values()) | |
| if total > 0: | |
| emotion_scores = {k: v/total for k, v in emotion_scores.items()} | |
| # RELATIONSHIP DISTRESS - Boost sadness, anger, frustration | |
| elif is_relationship: | |
| print("💔 Relationship distress detected - Adjusting predictions") | |
| relationship_emotions = ['sadness', 'anger', 'frustration'] | |
| boost_factor = 1.8 | |
| for emotion in relationship_emotions: | |
| if emotion in emotion_scores: | |
| emotion_scores[emotion] = min(0.80, emotion_scores[emotion] * boost_factor) | |
| total = sum(emotion_scores.values()) | |
| if total > 0: | |
| emotion_scores = {k: v/total for k, v in emotion_scores.items()} | |
| # PROSODIC ADJUSTMENT - High pitch variation + negative words = likely anger/fear | |
| if prosodic_features and prosodic_features.get('pitch_std', 0) > 40: | |
| negative_words = ['गुस्सा', 'क्रोध', 'नफरत', 'angry', 'mad', 'hate'] | |
| if any(word in transcription.lower() for word in negative_words): | |
| if 'anger' in emotion_scores: | |
| emotion_scores['anger'] = min(0.90, emotion_scores['anger'] * 1.5) | |
| total = sum(emotion_scores.values()) | |
| if total > 0: | |
| emotion_scores = {k: v/total for k, v in emotion_scores.items()} | |
| # Sort by score and create top emotions list | |
| sorted_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True) | |
| top_emotions = [] | |
| for i in range(min(5, len(sorted_emotions))): | |
| top_emotions.append({ | |
| "emotion": sorted_emotions[i][0], | |
| "score": round(sorted_emotions[i][1], 4) | |
| }) | |
| primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown" | |
| secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None | |
| confidence = top_emotions[0]["score"] if top_emotions else 0.0 | |
| return { | |
| "primary": primary_emotion, | |
| "secondary": secondary_emotion, | |
| "confidence": round(confidence, 4), | |
| "top_emotions": top_emotions | |
| } | |
| # ============================================ | |
| # 8. MAIN PREDICTION FUNCTION | |
| # ============================================ | |
| def predict(audio_filepath): | |
| """Main prediction function - Returns JSON-parseable dict""" | |
| try: | |
| print(f"\n{'='*60}") | |
| print(f"🎧 Processing audio file...") | |
| if audio_filepath is None: | |
| return { | |
| "status": "error", | |
| "error_type": "no_audio", | |
| "message": "No audio file uploaded" | |
| } | |
| # Preprocessing | |
| print("🔧 Applying advanced audio preprocessing...") | |
| try: | |
| audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath) | |
| prosodic_features = extract_prosodic_features(audio_np, sr) | |
| except Exception as e: | |
| return { | |
| "status": "error", | |
| "error_type": "preprocessing_error", | |
| "message": str(e) | |
| } | |
| # ASR Transcription | |
| print("🔄 Transcribing with Indic Conformer...") | |
| try: | |
| transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt") | |
| if not transcription_rnnt or len(transcription_rnnt.strip()) < 2: | |
| transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc") | |
| transcription = transcription_ctc | |
| else: | |
| transcription = transcription_rnnt | |
| transcription = transcription.strip() | |
| except Exception as asr_error: | |
| return { | |
| "status": "error", | |
| "error_type": "asr_error", | |
| "message": str(asr_error) | |
| } | |
| # Validation | |
| if not transcription or len(transcription) < 2: | |
| return { | |
| "status": "error", | |
| "error_type": "no_speech", | |
| "message": "No speech detected in the audio", | |
| "transcription": transcription or "" | |
| } | |
| is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription) | |
| if not is_valid: | |
| return { | |
| "status": "error", | |
| "error_type": "language_error", | |
| "message": validation_msg, | |
| "transcription": transcription, | |
| "hindi_content_percentage": round(hindi_ratio * 100, 2) | |
| } | |
| # Sentiment and Emotion Analysis | |
| print("💭 Analyzing sentiment and emotions...") | |
| try: | |
| # Run both analyses | |
| sentiment_result, emotion_result = parallel_analysis(transcription) | |
| # Process sentiment | |
| sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis( | |
| transcription, | |
| prosodic_features, | |
| sentiment_result | |
| ) | |
| # Process emotion with crisis awareness | |
| emotion_data = process_emotion_results( | |
| emotion_result, | |
| transcription, | |
| prosodic_features | |
| ) | |
| print(f"✅ Detected Emotion: {emotion_data['primary']}") | |
| print(f"✅ Sentiment: {max(sentiment_scores, key=sentiment_scores.get)}") | |
| print(f"📝 Transcription: {transcription}") | |
| # Build structured output | |
| result = { | |
| "status": "success", | |
| "transcription": transcription, | |
| "emotion": emotion_data, | |
| "sentiment": { | |
| "dominant": max(sentiment_scores, key=sentiment_scores.get), | |
| "scores": { | |
| "positive": round(sentiment_scores['Positive'], 4), | |
| "neutral": round(sentiment_scores['Neutral'], 4), | |
| "negative": round(sentiment_scores['Negative'], 4) | |
| }, | |
| "confidence": round(confidence, 4) | |
| }, | |
| "analysis": { | |
| "mixed_emotions": is_mixed, | |
| "hindi_content_percentage": round(hindi_ratio * 100, 2), | |
| "has_negation": detect_negation(transcription), | |
| "situations": { | |
| "is_crisis": detect_crisis_keywords(transcription), | |
| "is_mental_health_distress": detect_mental_health_distress(transcription), | |
| "is_grief_loss": detect_grief_loss(transcription), | |
| "is_relationship_distress": detect_relationship_distress(transcription) | |
| } | |
| }, | |
| "prosodic_features": { | |
| "pitch_mean": round(prosodic_features['pitch_mean'], 2), | |
| "pitch_std": round(prosodic_features['pitch_std'], 2), | |
| "energy_mean": round(prosodic_features['energy_mean'], 4), | |
| "energy_std": round(prosodic_features['energy_std'], 4), | |
| "speech_rate": round(prosodic_features['speech_rate'], 4) | |
| } | |
| } | |
| print(f"{'='*60}\n") | |
| return result | |
| except Exception as analysis_error: | |
| import traceback | |
| traceback.print_exc() | |
| return { | |
| "status": "error", | |
| "error_type": "analysis_error", | |
| "message": str(analysis_error), | |
| "transcription": transcription | |
| } | |
| except Exception as e: | |
| import traceback | |
| traceback.print_exc() | |
| return { | |
| "status": "error", | |
| "error_type": "system_error", | |
| "message": str(e) | |
| } | |
| # ============================================ | |
| # 9. GRADIO INTERFACE | |
| # ============================================ | |
| demo = gr.Interface( | |
| fn=predict, | |
| inputs=gr.Audio( | |
| type="filepath", | |
| label="🎤 Record or Upload Hindi Audio", | |
| sources=["upload", "microphone"] | |
| ), | |
| outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results (API-Ready JSON)"), | |
| title="🎭 Hindi Speech Emotion & Sentiment Analysis API", | |
| description=""" | |
| ## 🇮🇳 Advanced Hindi/Hinglish Speech Emotion & Sentiment Detection | |
| ### ✨ Features: | |
| - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR | |
| - **🎭 Zero-Shot Emotion Detection** - 13 emotions using joeddav/xlm-roberta-large-xnli | |
| - **💭 Sentiment Analysis** - Positive/Neutral/Negative classification | |
| - **🚨 Multi-Situation Awareness** - Detects crisis, mental health, grief, relationship distress | |
| - **🧠 Context-Aware Adjustment** - Emotions adjusted based on detected situations | |
| - **⚡ Optimized Processing** - 2-3x faster with batch feature extraction | |
| - **🎵 Voice Analysis** - Fast pitch (PYIN), energy, and spectral features | |
| - **🌐 Hinglish Support** - Works with Hindi + English mix | |
| - **📝 JSON Output** - Easy to parse for API integration | |
| ### 📊 JSON Output Format: | |
| ```json | |
| { | |
| "status": "success", | |
| "transcription": "मैं बहुत खुश हूं", | |
| "emotion": { | |
| "primary": "joy", | |
| "secondary": "happiness", | |
| "confidence": 0.8745, | |
| "top_emotions": [ | |
| {"emotion": "joy", "score": 0.8745}, | |
| {"emotion": "happiness", "score": 0.0923}, | |
| {"emotion": "excitement", "score": 0.0332} | |
| ] | |
| }, | |
| "sentiment": { | |
| "dominant": "Positive", | |
| "scores": { | |
| "positive": 0.8745, | |
| "neutral": 0.0923, | |
| "negative": 0.0332 | |
| }, | |
| "confidence": 0.8745 | |
| }, | |
| "analysis": { | |
| "mixed_emotions": false, | |
| "hindi_content_percentage": 100.0, | |
| "has_negation": false, | |
| "situations": { | |
| "is_crisis": false, | |
| "is_mental_health_distress": false, | |
| "is_grief_loss": false, | |
| "is_relationship_distress": false | |
| } | |
| }, | |
| "prosodic_features": { | |
| "pitch_mean": 180.45, | |
| "pitch_std": 35.12, | |
| "energy_mean": 0.0876, | |
| "energy_std": 0.0234, | |
| "speech_rate": 0.1234 | |
| } | |
| } | |
| ``` | |
| ### 🎯 Supported Emotions (13): | |
| - **Positive**: joy, happiness, love, excitement, calm | |
| - **Negative**: sadness, anger, fear, distress, panic, frustration | |
| - **Neutral**: neutral, surprise | |
| ### 🎯 Situation Detection: | |
| **🚨 Crisis/Emergency:** | |
| - Violence, assault, abuse | |
| - Medical emergencies | |
| - Suicide/self-harm | |
| - Accidents, fire, danger | |
| - Keywords: बचाओ, मदद, मार, खून, दर्द, आग, etc. | |
| **🧠 Mental Health Distress:** | |
| - Depression, anxiety | |
| - Hopelessness, isolation | |
| - Requires 2+ indicators | |
| - Keywords: अवसाद, अकेला, निराश, थक गया, etc. | |
| **💔 Grief & Loss:** | |
| - Death of loved ones | |
| - Mourning, sorrow | |
| - Keywords: गुज़र गया, खो दिया, याद आती है, etc. | |
| **💔 Relationship Distress:** | |
| - Breakup, divorce | |
| - Betrayal, cheating | |
| - Conflict, arguments | |
| - Keywords: तलाक, धोखा, झगड़ा, छोड़ दिया, etc. | |
| ### 🧪 Test Examples: | |
| - **😊 Joy**: "मैं बहुत खुश हूं आज" | |
| - **😢 Sadness**: "मुझे बहुत दुख हो रहा है" | |
| - **😠 Anger**: "मुझे बहुत गुस्सा आ रहा है" | |
| - **😨 Fear**: "मुझे डर लग रहा है" | |
| - **🚨 Crisis**: "बचाओ बचाओ मुझे कोई मदद करो" | |
| - **🧠 Mental Health**: "मैं बहुत अकेला और निराश महसूस कर रहा हूं" | |
| - **💔 Grief**: "मेरे पिताजी गुज़र गए, बहुत याद आती है" | |
| - **💔 Relationship**: "मेरी पत्नी ने मुझे छोड़ दिया, बहुत दुख है" | |
| ### 💡 API Usage: | |
| **Python API Client:** | |
| ```python | |
| import requests | |
| with open("audio.wav", "rb") as f: | |
| response = requests.post( | |
| "YOUR_API_URL/predict", | |
| files={"audio": f} | |
| ) | |
| result = response.json() | |
| if result["status"] == "success": | |
| print(f"Emotion: {result['emotion']['primary']}") | |
| print(f"Sentiment: {result['sentiment']['dominant']}") | |
| print(f"Top 3 emotions: {result['emotion']['top_emotions'][:3]}") | |
| ``` | |
| **Performance Optimizations:** | |
| - ⚡ 2-3x faster emotion classification (optimized to 13 labels) | |
| - 🎵 3-5x faster pitch detection (PYIN vs piptrack) | |
| - 💾 Cached audio resampler (no redundant object creation) | |
| - 📊 Batch spectral feature extraction (single STFT pass) | |
| **🚨 Multi-Situation Awareness:** | |
| **Crisis Detection (4x boost):** | |
| - 100+ emergency keywords in Hindi/English | |
| - Violence, medical, suicide, accidents, fire | |
| - Boosts: fear, distress, panic, anger | |
| - Suppresses: surprise, excitement, joy (85%) | |
| **Mental Health (2x boost):** | |
| - Depression, anxiety, isolation indicators | |
| - Requires 2+ keywords for detection | |
| - Boosts: sadness, fear, frustration | |
| - Suppresses: happiness, excitement (70%) | |
| **Grief/Loss (2.5x boost):** | |
| - Death, mourning, bereavement | |
| - Boosts: sadness primarily | |
| - Suppresses: joy, excitement (80%) | |
| **Relationship Distress (1.8x boost):** | |
| - Breakup, divorce, betrayal | |
| - Boosts: sadness, anger, frustration | |
| - Maintains nuanced emotional detection | |
| """, | |
| theme=gr.themes.Soft(), | |
| flagging_mode="never", | |
| examples=[ | |
| ["examples/happy.wav"] if os.path.exists("examples/happy.wav") else None, | |
| ] if os.path.exists("examples") else None | |
| ) | |
| # ============================================ | |
| # 10. LAUNCH APP | |
| # ============================================ | |
| if __name__ == "__main__": | |
| print("🌐 Starting server...") | |
| print(type(demo)) | |
| demo.launch(share=True) | |
| print("🎉 Hindi Emotion & Sentiment Analysis API is ready!") |