invincible-jha committed on
Commit
f7af1db
·
verified ·
1 Parent(s): 1cd7ce8

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -143
app.py CHANGED
@@ -4,123 +4,73 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
4
  import librosa
5
  import numpy as np
6
  import plotly.graph_objects as go
 
 
 
7
 
8
- class ModelManager:
9
- def __init__(self):
10
- self.device = torch.device("cpu")
11
- self.models = {}
12
- self.tokenizers = {}
13
- self.processors = {}
14
- self.load_models()
15
-
16
- def load_models(self):
17
- try:
18
- print("Loading Whisper model...")
19
- self.processors['whisper'] = WhisperProcessor.from_pretrained(
20
- "openai/whisper-base" # Removed device_map parameter
21
- )
22
- self.models['whisper'] = WhisperForConditionalGeneration.from_pretrained(
23
- "openai/whisper-base" # Removed device_map parameter
24
- ).to(self.device)
25
-
26
- print("Loading emotion model...")
27
- self.tokenizers['emotion'] = AutoTokenizer.from_pretrained(
28
- "j-hartmann/emotion-english-distilroberta-base"
29
- )
30
- self.models['emotion'] = AutoModelForSequenceClassification.from_pretrained(
31
- "j-hartmann/emotion-english-distilroberta-base" # Removed device_map parameter
32
- ).to(self.device)
33
-
34
- print("Models loaded successfully")
35
- except Exception as e:
36
- print(f"Error loading models: {str(e)}")
37
- raise
38
 
39
- class AudioProcessor:
40
- def __init__(self):
41
- self.sample_rate = 16000
42
- self.n_mfcc = 13
43
-
44
- def process_audio(self, audio_path):
45
- try:
46
- waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
47
- return waveform, self._extract_features(waveform)
48
- except Exception as e:
49
- print(f"Error processing audio: {str(e)}")
50
- raise
51
-
52
- def _extract_features(self, waveform):
53
- try:
54
- return {
55
- 'mfcc': librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=self.n_mfcc),
56
- 'energy': librosa.feature.rms(y=waveform)[0]
57
- }
58
- except Exception as e:
59
- print(f"Error extracting features: {str(e)}")
60
- raise
61
 
62
- class Analyzer:
63
- def __init__(self):
64
- print("Initializing Analyzer...")
65
- try:
66
- self.model_manager = ModelManager()
67
- self.audio_processor = AudioProcessor()
68
- print("Analyzer initialization complete")
69
- except Exception as e:
70
- print(f"Error initializing Analyzer: {str(e)}")
71
- raise
72
-
73
- def analyze(self, audio_path):
74
- try:
75
- print(f"Processing audio file: {audio_path}")
76
- waveform, features = self.audio_processor.process_audio(audio_path)
77
-
78
- print("Transcribing audio...")
79
- inputs = self.model_manager.processors['whisper'](
80
- waveform,
81
- return_tensors="pt"
82
- ).input_features.to(self.model_manager.device)
83
-
84
- with torch.no_grad():
85
- predicted_ids = self.model_manager.models['whisper'].generate(inputs)
86
- transcription = self.model_manager.processors['whisper'].batch_decode(
87
- predicted_ids,
88
- skip_special_tokens=True
89
- )[0]
90
-
91
- print("Analyzing emotions...")
92
- inputs = self.model_manager.tokenizers['emotion'](
93
- transcription,
94
- return_tensors="pt",
95
- padding=True,
96
- truncation=True,
97
- max_length=512
98
- )
99
- inputs = {k: v.to(self.model_manager.device) for k, v in inputs.items()}
100
-
101
- with torch.no_grad():
102
- outputs = self.model_manager.models['emotion'](**inputs)
103
- emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)
104
 
105
- emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
106
- emotion_scores = {
107
- label: float(score)
108
- for label, score in zip(emotion_labels, emotions[0].cpu())
109
- }
110
 
111
- return {
112
- 'transcription': transcription,
113
- 'emotions': emotion_scores
114
- }
115
- except Exception as e:
116
- print(f"Error in analysis: {str(e)}")
117
- raise
 
 
118
 
119
  def create_emotion_plot(emotions):
 
120
  try:
121
  fig = go.Figure(data=[
122
  go.Bar(
123
- x=list(emotions.keys()),
124
  y=list(emotions.values()),
125
  marker_color='rgb(55, 83, 109)'
126
  )
@@ -140,48 +90,105 @@ def create_emotion_plot(emotions):
140
  print(f"Error creating plot: {str(e)}")
141
  return "Error creating visualization"
142
 
143
- def process_audio(audio_file):
 
144
  try:
145
- if audio_file is None:
 
146
  return "No audio file provided", "Please provide an audio file"
147
-
148
- print(f"Processing audio file: {audio_file}")
149
- results = analyzer.analyze(audio_file)
150
 
151
- return (
152
- results['transcription'],
153
- create_emotion_plot(results['emotions'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  except Exception as e:
156
- error_msg = f"Error processing audio: {str(e)}"
157
  print(error_msg)
158
  return error_msg, "Error in analysis"
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  if __name__ == "__main__":
161
- print("Initializing application...")
162
- try:
163
- analyzer = Analyzer()
164
-
165
- print("Creating Gradio interface...")
166
- interface = gr.Interface(
167
- fn=process_audio,
168
- inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
169
- outputs=[
170
- gr.Textbox(label="Transcription"),
171
- gr.HTML(label="Emotion Analysis")
172
- ],
173
- title="Vocal Biomarker Analysis",
174
- description="Analyze voice for emotional indicators",
175
- examples=[],
176
- cache_examples=False
177
- )
178
-
179
- print("Launching application...")
180
- interface.launch(
181
- server_name="0.0.0.0",
182
- server_port=7860,
183
- share=False
184
- )
185
- except Exception as e:
186
- print(f"Fatal error during application startup: {str(e)}")
187
- raise
 
4
  import librosa
5
  import numpy as np
6
  import plotly.graph_objects as go
7
+ import warnings
8
+ import os
9
+ warnings.filterwarnings('ignore')
10
 
11
# Module-level model handles; each one is None until load_models() assigns it.
processor = whisper_model = emotion_tokenizer = emotion_model = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
def load_models():
    """Load the Whisper and emotion checkpoints into the module globals.

    Returns:
        bool: True when both models and their pre-processors were loaded and
        moved to the CPU; False when any step raised (the error is printed,
        not propagated).
    """
    global processor, whisper_model, emotion_tokenizer, emotion_model

    whisper_checkpoint = "openai/whisper-tiny"
    emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"

    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
        whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
        emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)

        # Place both networks on the CPU explicitly.
        for loaded_model in (whisper_model, emotion_model):
            loaded_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False
39
 
40
def process_audio(audio_input):
    """Return the input audio as a mono float waveform sampled at 16 kHz.

    Args:
        audio_input: Either a path to an audio file, or a
            ``(sample_rate, samples)`` tuple of raw audio (the form Gradio's
            audio component produces with ``type="numpy"``). A tuple whose
            first element is a path is still accepted and treated as a path.

    Returns:
        numpy.ndarray: 1-D waveform at 16000 Hz.

    Raises:
        FileNotFoundError: If a path is given and nothing exists there.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    target_sr = 16000
    try:
        print(f"Audio input received: {type(audio_input)}")

        if isinstance(audio_input, tuple):
            first, second = audio_input[0], audio_input[1]
            if not isinstance(first, (str, os.PathLike)):
                # Raw audio: the first element is the sample rate, not a path.
                # Feeding it to os.path.exists() would always end in
                # FileNotFoundError, so convert the samples directly.
                sample_rate = int(first)
                samples = np.asarray(second)
                print(f"Audio input is tuple: sample rate {sample_rate}, shape {samples.shape}")

                if np.issubdtype(samples.dtype, np.integer):
                    # Integer PCM -> floats in [-1, 1), matching librosa.load.
                    scale = float(np.iinfo(samples.dtype).max) + 1.0
                    waveform = samples.astype(np.float32) / scale
                else:
                    waveform = samples.astype(np.float32)

                if waveform.ndim > 1:
                    # Assumes a (samples, channels) layout -- TODO confirm
                    # against the Gradio version in use. Down-mix to mono.
                    waveform = waveform.mean(axis=1)

                if sample_rate != target_sr:
                    waveform = librosa.resample(
                        waveform, orig_sr=sample_rate, target_sr=target_sr
                    )
                return waveform

            audio_path = first  # Legacy (path, ...) tuple
        else:
            audio_path = audio_input

        print(f"Processing audio from path: {audio_path}")

        # Verify file exists
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found at {audio_path}")

        # Load and resample audio
        print("Loading audio file with librosa...")
        waveform, sr = librosa.load(audio_path, sr=target_sr)
        print(f"Audio loaded successfully. Shape: {waveform.shape}, SR: {sr}")

        return waveform
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        raise
67
 
68
  def create_emotion_plot(emotions):
69
+ """Create plotly visualization for emotion scores"""
70
  try:
71
  fig = go.Figure(data=[
72
  go.Bar(
73
+ x=list(emotions.keys()),
74
  y=list(emotions.values()),
75
  marker_color='rgb(55, 83, 109)'
76
  )
 
90
  print(f"Error creating plot: {str(e)}")
91
  return "Error creating visualization"
92
 
93
def analyze_audio(audio_input):
    """Transcribe a recording and score the emotions found in its transcript.

    Args:
        audio_input: Value supplied by the Gradio audio component, or None
            when nothing was recorded or uploaded.

    Returns:
        tuple: ``(transcription, emotion_visualisation)`` on success. For
        missing input, unusable audio, silent audio, or any caught exception,
        a pair of user-facing message strings is returned instead, so the
        interface always receives two values.
    """
    try:
        if audio_input is None:
            print("No audio input provided")
            return "No audio file provided", "Please provide an audio file"

        print(f"Received audio input: {audio_input}")

        # Process audio
        waveform = process_audio(audio_input)

        if waveform is None or len(waveform) == 0:
            return "Error: Invalid audio file", "Please provide a valid audio file"

        # Transcribe audio
        print("Transcribing audio...")
        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        print(f"Transcription completed: {transcription}")

        if not transcription or transcription.isspace():
            return "No speech detected in audio", "Unable to analyze emotions without speech"

        # Analyze emotions
        print("Analyzing emotions...")
        inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            outputs = emotion_model(**inputs)
            emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Take the class names from the model's own config rather than a
        # hand-written list: a list shorter than the number of logits makes
        # zip() silently shift every later label and drop the final score.
        id2label = emotion_model.config.id2label
        probabilities = emotions[0].cpu().numpy()
        emotion_scores = {
            id2label[index]: float(score)
            for index, score in enumerate(probabilities)
        }

        print(f"Emotion analysis completed: {emotion_scores}")

        # Create visualization
        emotion_viz = create_emotion_plot(emotion_scores)

        return transcription, emotion_viz

    except FileNotFoundError as e:
        error_msg = f"Audio file not found: {str(e)}"
        print(error_msg)
        return error_msg, "Please provide a valid audio file"
    except Exception as e:
        error_msg = f"Error analyzing audio: {str(e)}"
        print(error_msg)
        return error_msg, "Error in analysis"
156
 
157
# Load models at startup; the interface is useless without them, so fail fast.
print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")

# Create Gradio interface
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        # `sources` (a list) replaces the older singular `source` keyword.
        # Offering both matches the description below, which promises upload
        # as well as microphone recording.
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.HTML(label="Emotion Analysis")
    ],
    title="Vocal Emotion Analysis",
    description="""
    This app analyzes voice recordings to:
    1. Transcribe speech to text
    2. Detect emotions in the speech

    Upload an audio file or record directly through your microphone.
    """,
    article="""
    Models used:
    - Speech recognition: Whisper (tiny)
    - Emotion detection: DistilRoBERTa

    Note: Processing may take a few moments depending on the length of the audio.
    """,
    examples=None,
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch(debug=True)