VincentGOURBIN committed on
Commit
48397c5
·
verified ·
1 Parent(s): 49d38f7

Upload folder using huggingface_hub

src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """MeetingNotes Hugging Face Spaces package."""
src/ai/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """AI modules for HF Spaces version."""
2
+
3
+ from .voxtral_spaces_analyzer import VoxtralSpacesAnalyzer
4
+ from .diarization import SpeakerDiarization
5
+ from .prompts_config import VoxtralPrompts
6
+
7
+ __all__ = ['VoxtralSpacesAnalyzer', 'SpeakerDiarization', 'VoxtralPrompts']
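For orientation, a minimal sketch of how these three exports are meant to compose, mirroring the flow wired up later in spaces_interface.py (assumptions: the package is importable as src.ai, an HF_TOKEN is available for pyannote, and a local meeting.wav exists):

from src.ai import SpeakerDiarization, VoxtralSpacesAnalyzer, VoxtralPrompts

diarizer = SpeakerDiarization()                        # pyannote speaker segmentation
rttm, _references = diarizer.diarize_audio("meeting.wav")

analyzer = VoxtralSpacesAnalyzer("Voxtral-Mini-3B-2507")
sections = VoxtralPrompts.get_default_sections("action")
result = analyzer.analyze_audio_chunks(
    wav_path="meeting.wav",
    selected_sections=sections,
    reference_speakers_data=rttm,
)
print(result["transcription"])                         # markdown summary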
src/ai/diarization.py ADDED
@@ -0,0 +1,338 @@
1
+ """
2
+ Speaker diarization module for HF Spaces with Zero GPU support.
3
+
4
+ This module uses pyannote/speaker-diarization-3.1 to identify
5
+ and segment different speakers in an audio file, optimized for HF Spaces.
6
+ """
7
+
8
+ import torch
9
+ import torchaudio
10
+ from pyannote.audio import Pipeline
11
+ from typing import Optional, Dict, Any, List, Tuple
12
+ import tempfile
13
+ import os
14
+ from pydub import AudioSegment
15
+ import time
16
+
17
+ from ..utils.zero_gpu_manager import gpu_model_loading, gpu_inference, ZeroGPUManager
18
+
19
+
20
+ class SpeakerDiarization:
21
+ """
22
+ Speaker diarization using pyannote/speaker-diarization-3.1 for HF Spaces.
23
+
24
+ This class handles automatic speaker diarization
25
+ with Zero GPU decorators for efficient compute allocation.
26
+ """
27
+
28
+ def __init__(self, hf_token: str = None):
29
+ """
30
+ Initialize the pyannote diarizer for HF Spaces.
31
+
32
+ Args:
33
+ hf_token (str): Hugging Face token to access the model
34
+ """
35
+ self.hf_token = hf_token or os.getenv("HF_TOKEN")
36
+ self.pipeline = None
37
+ self.gpu_manager = ZeroGPUManager()
38
+ print("🔄 Initializing pyannote diarizer for HF Spaces...")
39
+
40
+ @gpu_model_loading(duration=90)
41
+ def _load_pipeline(self):
42
+ """Load diarization pipeline with GPU allocation if not already loaded."""
43
+ if self.pipeline is None:
44
+ print("📥 Loading pyannote/speaker-diarization-3.1 model...")
45
+ self.pipeline = Pipeline.from_pretrained(
46
+ "pyannote/speaker-diarization-3.1",
47
+ use_auth_token=self.hf_token
48
+ )
49
+
50
+ # Use GPU if available (CUDA or MPS)
51
+ if self.gpu_manager.is_gpu_available():
52
+ device = self.gpu_manager.get_device()
53
+ if device == "mps":
54
+ # MPS support for local Mac testing
55
+ self.pipeline = self.pipeline.to(torch.device("mps"))
56
+ print("🚀 Pyannote pipeline loaded on MPS (Apple Silicon)")
57
+ elif device == "cuda":
58
+ self.pipeline = self.pipeline.to(torch.device("cuda"))
59
+ print("🚀 Pyannote pipeline loaded on CUDA")
60
+ else:
61
+ print("⚠️ Pyannote pipeline loaded on CPU")
62
+ else:
63
+ print("⚠️ Pyannote pipeline loaded on CPU")
64
+
65
+ @gpu_inference(duration=180)
66
+ def diarize_audio(self, audio_path: str, num_speakers: Optional[int] = None) -> Tuple[str, List[Dict]]:
67
+ """
68
+ Perform speaker diarization on an audio file with Zero GPU.
69
+
70
+ Args:
71
+ audio_path (str): Path to the audio file
72
+ num_speakers (Optional[int]): Expected number of speakers (optional)
73
+
74
+ Returns:
75
+ Tuple[str, List[Dict]]: (RTTM result, List of reference segments for each speaker)
76
+ """
77
+ try:
78
+ # Load pipeline if necessary
79
+ self._load_pipeline()
80
+
81
+ print(f"🎤 Starting diarization: {audio_path}")
82
+
83
+ # Prepare audio file for pyannote (mono WAV)
84
+ processed_audio_path = self._prepare_audio_for_pyannote(audio_path)
85
+
86
+ # Diarization parameters
87
+ diarization_params = {}
88
+ if num_speakers is not None:
89
+ diarization_params["num_speakers"] = num_speakers
90
+ print(f"👥 Specified number of speakers: {num_speakers}")
91
+
92
+ # Perform diarization
93
+ print("🔍 Speaker analysis in progress...")
94
+ diarization = self.pipeline(processed_audio_path, **diarization_params)
95
+
96
+ # Convert to RTTM format
97
+ rttm_output = self._convert_to_rttm(diarization, audio_path)
98
+
99
+ # Extract reference segments (first long segments for each speaker)
100
+ try:
101
+ reference_segments = self._extract_reference_segments(diarization, audio_path, min_duration=5.0)
102
+ except Exception as ref_error:
103
+ print(f"⚠️ Error extracting reference segments: {ref_error}")
104
+ reference_segments = []
105
+
106
+ print(f"✅ Diarization completed: {len(diarization)} segments detected")
107
+ print(f"🎤 Reference segments created: {len(reference_segments)} speakers")
108
+
109
+ # Clean up temporary file if created
110
+ if processed_audio_path != audio_path:
111
+ try:
112
+ os.unlink(processed_audio_path)
113
+ except OSError:
114
+ pass
115
+
116
+ return rttm_output, reference_segments
117
+
118
+ except Exception as e:
119
+ print(f"❌ Error during diarization: {e}")
120
+ return f"❌ Error during diarization: {str(e)}", []
121
+ finally:
122
+ # Clean up GPU memory
123
+ self.gpu_manager.cleanup_gpu()
124
+
125
+ def _prepare_audio_for_pyannote(self, audio_path: str) -> str:
126
+ """
127
+ Prepare audio file for pyannote (mono WAV if necessary).
128
+
129
+ Args:
130
+ audio_path (str): Path to original audio file
131
+
132
+ Returns:
133
+ str: Path to prepared audio file
134
+ """
135
+ try:
136
+ # Load audio with pydub to check format
137
+ audio = AudioSegment.from_file(audio_path)
138
+
139
+ # Check if conversion is needed (mono + WAV)
140
+ needs_conversion = (
141
+ audio.channels != 1 or # Not mono
142
+ not audio_path.lower().endswith('.wav') # Not WAV
143
+ )
144
+
145
+ if not needs_conversion:
146
+ print("🎵 Audio already in correct format for pyannote")
147
+ return audio_path
148
+
149
+ print("🔄 Converting audio for pyannote (mono WAV)...")
150
+
151
+ # Convert to mono WAV
152
+ mono_audio = audio.set_channels(1)
153
+
154
+ # Create temporary file
155
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
156
+ temp_path = tmp_file.name
157
+
158
+ # Export as mono WAV
159
+ mono_audio.export(temp_path, format="wav")
160
+
161
+ print(f"✅ Audio converted: {temp_path}")
162
+ return temp_path
163
+
164
+ except Exception as e:
165
+ print(f"⚠️ Audio conversion error: {e}, using original file")
166
+ return audio_path
167
+
168
+ def _convert_to_rttm(self, diarization, audio_file: str) -> str:
169
+ """
170
+ Convert diarization result to RTTM format.
171
+
172
+ Args:
173
+ diarization: Pyannote diarization object
174
+ audio_file (str): Audio filename for RTTM
175
+
176
+ Returns:
177
+ str: RTTM format content
178
+ """
179
+ rttm_lines = []
180
+
181
+ # RTTM header
182
+ audio_filename = os.path.basename(audio_file)
183
+
184
+ for segment, _, speaker in diarization.itertracks(yield_label=True):
185
+ # RTTM format: SPEAKER file 1 start_time duration <NA> <NA> speaker_id <NA> <NA>
186
+ start_time = segment.start
187
+ duration = segment.duration
188
+
189
+ rttm_line = f"SPEAKER {audio_filename} 1 {start_time:.3f} {duration:.3f} <NA> <NA> {speaker} <NA> <NA>"
190
+ rttm_lines.append(rttm_line)
191
+
192
+ return "\n".join(rttm_lines)
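As a concrete example of the format built here, a 4.21-second segment of meeting.wav starting at 12.34 s and attributed to SPEAKER_00 would serialize to:

SPEAKER meeting.wav 1 12.340 4.210 <NA> <NA> SPEAKER_00 <NA> <NA>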
193
+
194
+ def _extract_reference_segments(self, diarization, audio_path: str, min_duration: float = 5.0) -> List[Dict]:
195
+ """
196
+ Extract first long segment for each speaker as reference.
197
+
198
+ Args:
199
+ diarization: Pyannote diarization object
200
+ audio_path (str): Path to audio file
201
+ min_duration (float): Minimum duration in seconds for a reference segment
202
+
203
+ Returns:
204
+ List[Dict]: List of reference segments with metadata
205
+ """
206
+ reference_segments = []
207
+ speakers_found = set()
208
+
209
+ print(f"🔍 Searching for reference segments (>{min_duration}s) for each speaker...")
210
+
211
+ # Iterate through all segments to find first long segment of each speaker
212
+ try:
213
+ for segment, _, speaker in diarization.itertracks(yield_label=True):
214
+ if speaker not in speakers_found and segment.duration >= min_duration:
215
+ print(f"👤 {speaker}: {segment.duration:.1f}s segment found ({segment.start:.1f}s-{segment.end:.1f}s)")
216
+
217
+ # Create audio snippet
218
+ snippet_path = self._create_audio_snippet(
219
+ audio_path,
220
+ segment.start,
221
+ segment.end,
222
+ speaker
223
+ )
224
+
225
+ if snippet_path:
226
+ reference_segments.append({
227
+ 'speaker': speaker,
228
+ 'start': segment.start,
229
+ 'end': segment.end,
230
+ 'duration': segment.duration,
231
+ 'audio_path': snippet_path
232
+ })
233
+ speakers_found.add(speaker)
234
+
235
+ # Fallback: if no long segments found for some speakers, take the longest
236
+ all_speakers_in_diarization = set(speaker for _, _, speaker in diarization.itertracks(yield_label=True))
237
+ if len(speakers_found) < len(all_speakers_in_diarization):
238
+ print("⚠️ Some speakers don't have long segments, using longest segments...")
239
+ self._add_fallback_segments(diarization, audio_path, reference_segments, speakers_found, min_duration)
240
+
241
+ except Exception as iter_error:
242
+ print(f"❌ Error iterating segments: {iter_error}")
243
+ reference_segments = []
244
+
245
+ return reference_segments
246
+
247
+ def _add_fallback_segments(self, diarization, audio_path: str, reference_segments: List[Dict],
248
+ speakers_found: set, min_duration: float):
249
+ """Add fallback segments for speakers without long segments."""
250
+ all_speakers = set(speaker for _, _, speaker in diarization.itertracks(yield_label=True))
251
+ missing_speakers = all_speakers - speakers_found
252
+
253
+ for speaker in missing_speakers:
254
+ # Find longest segment for this speaker
255
+ longest_segment = None
256
+ longest_duration = 0
257
+
258
+ for segment, _, spk in diarization.itertracks(yield_label=True):
259
+ if spk == speaker and segment.duration > longest_duration:
260
+ longest_segment = segment
261
+ longest_duration = segment.duration
262
+
263
+ if longest_segment and longest_duration > 1.0: # At least 1 second
264
+ print(f"👤 {speaker}: fallback segment of {longest_duration:.1f}s")
265
+
266
+ snippet_path = self._create_audio_snippet(
267
+ audio_path,
268
+ longest_segment.start,
269
+ longest_segment.end,
270
+ speaker
271
+ )
272
+
273
+ if snippet_path:
274
+ reference_segments.append({
275
+ 'speaker': speaker,
276
+ 'start': longest_segment.start,
277
+ 'end': longest_segment.end,
278
+ 'duration': longest_duration,
279
+ 'audio_path': snippet_path
280
+ })
281
+
282
+ def _create_audio_snippet(self, audio_path: str, start_time: float, end_time: float, speaker: str) -> Optional[str]:
283
+ """
284
+ Create temporary audio snippet for a speaker segment.
285
+
286
+ Args:
287
+ audio_path (str): Path to source audio file
288
+ start_time (float): Start in seconds
289
+ end_time (float): End in seconds
290
+ speaker (str): Speaker ID
291
+
292
+ Returns:
293
+ Optional[str]: Path to created temporary audio snippet or None if error
294
+ """
295
+ try:
296
+ # Load audio
297
+ audio = AudioSegment.from_file(audio_path)
298
+
299
+ # Convert to milliseconds
300
+ start_ms = int(start_time * 1000)
301
+ end_ms = int(end_time * 1000)
302
+
303
+ # Extract segment
304
+ segment = audio[start_ms:end_ms]
305
+
306
+ # Create temporary file
307
+ with tempfile.NamedTemporaryFile(
308
+ suffix=f"_{speaker}_{start_time:.1f}s.wav",
309
+ delete=False
310
+ ) as tmp_file:
311
+ snippet_path = tmp_file.name
312
+
313
+ # Export snippet to temporary file
314
+ segment.export(snippet_path, format="wav")
315
+
316
+ print(f"🎵 Temporary snippet created: {snippet_path}")
317
+ return snippet_path
318
+
319
+ except Exception as e:
320
+ print(f"❌ Error creating snippet for {speaker}: {e}")
321
+ return None
322
+
323
+ def cleanup(self):
324
+ """Release pipeline resources."""
325
+ if self.pipeline is not None:
326
+ # Free GPU/MPS memory by moving to CPU
327
+ if hasattr(self.pipeline, 'to'):
328
+ try:
329
+ self.pipeline = self.pipeline.to(torch.device('cpu'))
330
+ except Exception as e:
331
+ print(f"⚠️ Error moving to CPU: {e}")
332
+
333
+ del self.pipeline
334
+ self.pipeline = None
335
+
336
+ # Clean up memory
337
+ self.gpu_manager.cleanup_gpu()
338
+ print("🧹 Pyannote pipeline freed from memory")
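The RTTM string returned by diarize_audio is plain text, so downstream code can post-process it in a few lines. A small illustrative helper (not part of this commit) that tallies speaking time per speaker from those fields:

def speaking_time_per_speaker(rttm_text: str) -> dict:
    """Sum segment durations per speaker (RTTM field 5 is the duration, field 8 the speaker id)."""
    totals = {}
    for line in rttm_text.splitlines():
        parts = line.split()
        if len(parts) >= 8 and parts[0] == "SPEAKER":
            totals[parts[7]] = totals.get(parts[7], 0.0) + float(parts[4])
    return totals

# e.g. {'SPEAKER_00': 512.3, 'SPEAKER_01': 187.9} for a two-person meeting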
src/ai/prompts_config.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Centralized prompts configuration for Voxtral in HF Spaces.
3
+
4
+ This module contains all prompts used by Voxtral analyzers
5
+ for different types of analyses and processing modes.
6
+ """
7
+
8
+
9
+ class VoxtralPrompts:
10
+ """Class containing all system prompts for Voxtral."""
11
+
12
+ # ====================================
13
+ # AVAILABLE SECTIONS FOR SUMMARIES
14
+ # Note: Titles are in English but the AI will adapt language based on meeting content
15
+ # ====================================
16
+
17
+ AVAILABLE_SECTIONS = {
18
+ "resume_executif": {
19
+ "title": "## EXECUTIVE SUMMARY",
20
+ "description": "Overview of the purpose of this meeting segment and its outcomes",
21
+ "default_action": True,
22
+ "default_info": True
23
+ },
24
+ "discussions_principales": {
25
+ "title": "## MAIN DISCUSSIONS",
26
+ "description": "Main topics addressed and important points raised",
27
+ "default_action": True,
28
+ "default_info": False
29
+ },
30
+ "sujets_principaux": {
31
+ "title": "## MAIN TOPICS",
32
+ "description": "Key topics discussed and information presented",
33
+ "default_action": False,
34
+ "default_info": True
35
+ },
36
+ "plan_action": {
37
+ "title": "## ACTION PLAN",
38
+ "description": "Complete list of actions with:\n- Specific tasks and deliverables\n- Assigned responsibilities\n- Deadlines and timelines\n- Priority levels",
39
+ "default_action": True,
40
+ "default_info": False
41
+ },
42
+ "decisions_prises": {
43
+ "title": "## DECISIONS MADE",
44
+ "description": "All decisions made during this segment",
45
+ "default_action": True,
46
+ "default_info": False
47
+ },
48
+ "points_importants": {
49
+ "title": "## KEY POINTS",
50
+ "description": "Important discoveries, data or insights shared",
51
+ "default_action": False,
52
+ "default_info": True
53
+ },
54
+ "questions_discussions": {
55
+ "title": "## QUESTIONS & DISCUSSIONS",
56
+ "description": "Main questions asked and discussions held",
57
+ "default_action": False,
58
+ "default_info": True
59
+ },
60
+ "prochaines_etapes": {
61
+ "title": "## NEXT STEPS",
62
+ "description": "Follow-up actions and planned future meetings",
63
+ "default_action": True,
64
+ "default_info": False
65
+ },
66
+ "elements_suivi": {
67
+ "title": "## FOLLOW-UP ELEMENTS",
68
+ "description": "Follow-up information or clarifications needed",
69
+ "default_action": False,
70
+ "default_info": True
71
+ }
72
+ }
73
+
74
+ @staticmethod
75
+ def get_meeting_summary_prompt(selected_sections: list, speaker_references: str = "", chunk_info: str = "", previous_context: str = "") -> str:
76
+ """
77
+ Generate meeting summary prompt according to selected sections.
78
+
79
+ Args:
80
+ selected_sections (list): List of section keys to include
81
+ speaker_references (str): Diarization context with tags (optional)
82
+ chunk_info (str): Audio segment information (optional)
83
+ previous_context (str): Context from previous segments (optional)
84
+
85
+ Returns:
86
+ str: Formatted prompt
87
+ """
88
+ # Diarization context
89
+ diarization_context = ""
90
+ if speaker_references and speaker_references.strip():
91
+ diarization_context = f"""
92
+
93
+ CONTEXT FOR YOUR ANALYSIS (do not include in your response):
94
+ Different speakers have been automatically identified in the audio: {speaker_references}
95
+ Use this information to enrich your analysis but do not display it in your final response.
96
+
97
+ """
98
+
99
+ # Previous segments context
100
+ previous_summary_context = ""
101
+ if previous_context and previous_context.strip():
102
+ previous_summary_context = f"""
103
+
104
+ CONTEXT FROM PREVIOUS SEGMENTS (do not include in your response):
105
+ Here's what happened in previous audio segments:
106
+ {previous_context}
107
+
108
+ Use this information to ensure continuity and avoid repetitions, but focus on the new content of this segment.
109
+
110
+ """
111
+
112
+ # Audio segment information
113
+ segment_context = ""
114
+ if chunk_info and chunk_info.strip():
115
+ segment_context = f"""
116
+
117
+ IMPORTANT: You are analyzing a segment ({chunk_info}) extracted from a longer audio recording.
118
+ This segment may start or end in the middle of sentences/discussions.
119
+ Focus on the content of this segment while keeping in mind it's part of a larger whole.
120
+
121
+ """
122
+
123
+ # Build selected sections
124
+ sections_text = ""
125
+ for section_key in selected_sections:
126
+ if section_key in VoxtralPrompts.AVAILABLE_SECTIONS:
127
+ section = VoxtralPrompts.AVAILABLE_SECTIONS[section_key]
128
+ sections_text += f"\n{section['title']}\n{section['description']}\n"
129
+ print(f"✅ Section added: {section['title']}")
130
+ else:
131
+ print(f"❌ Unknown section: {section_key}")
132
+
133
+ return f"""Listen carefully to this meeting audio segment and provide a complete structured summary.{diarization_context}{previous_summary_context}{segment_context}
134
+
135
+ CRITICAL INSTRUCTION - RESPONSE LANGUAGE:
136
+ - DETECT the language spoken in this audio
137
+ - RESPOND OBLIGATORILY in the same detected language
138
+ - If audio is in French → respond in French
139
+ - If audio is in English → respond in English
140
+ - If audio is in another language → respond in that language
141
+ - NEVER use a different language than the one detected in the audio
142
+
143
+ {sections_text}
144
+ Format your response in markdown exactly as shown above."""
145
+
146
+ @staticmethod
147
+ def get_default_sections(meeting_type: str) -> list:
148
+ """
149
+ Return default sections according to meeting type.
150
+
151
+ Args:
152
+ meeting_type (str): "action" or "information"
153
+
154
+ Returns:
155
+ list: List of default section keys
156
+ """
157
+ if "action" in meeting_type.lower():
158
+ return [key for key, section in VoxtralPrompts.AVAILABLE_SECTIONS.items()
159
+ if section["default_action"]]
160
+ else:
161
+ return [key for key, section in VoxtralPrompts.AVAILABLE_SECTIONS.items()
162
+ if section["default_info"]]
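Given the default flags declared in AVAILABLE_SECTIONS above, the two branches resolve to:

VoxtralPrompts.get_default_sections("action")
# ['resume_executif', 'discussions_principales', 'plan_action',
#  'decisions_prises', 'prochaines_etapes']

VoxtralPrompts.get_default_sections("information")
# ['resume_executif', 'sujets_principaux', 'points_importants',
#  'questions_discussions', 'elements_suivi']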
163
+
164
+ @staticmethod
165
+ def get_synthesis_prompt(selected_sections: list, chunk_summaries: list) -> str:
166
+ """
167
+ Generate prompt for synthesizing multiple chunk summaries.
168
+
169
+ Args:
170
+ selected_sections (list): List of requested section keys
171
+ chunk_summaries (list): List of chunk summaries to synthesize
172
+
173
+ Returns:
174
+ str: Formatted synthesis prompt
175
+ """
176
+ # Build selected sections
177
+ sections_text = ""
178
+ for section_key in selected_sections:
179
+ if section_key in VoxtralPrompts.AVAILABLE_SECTIONS:
180
+ section = VoxtralPrompts.AVAILABLE_SECTIONS[section_key]
181
+ sections_text += f"\n{section['title']}\n{section['description']}\n"
182
+
183
+ # Assemble all chunk summaries
184
+ all_chunks_text = "\n\n=== SEGMENT SEPARATOR ===\n\n".join(chunk_summaries)
185
+
186
+ return f"""You will receive multiple analyses of segments from the same audio meeting.
187
+ Your role is to synthesize them into a coherent and structured global summary.
188
+
189
+ SEGMENT ANALYSES TO SYNTHESIZE:
190
+ {all_chunks_text}
191
+
192
+ CRITICAL INSTRUCTION - RESPONSE LANGUAGE:
193
+ - DETECT the language used in the segments above
194
+ - RESPOND OBLIGATORILY in the same detected language
195
+ - If segments are in French → respond in French
196
+ - If segments are in English → respond in English
197
+ - Avoid repetitions between segments
198
+ - Identify recurring elements and unify them
199
+ - Ensure temporal and logical coherence
200
+ - Produce a global summary that reflects the entire meeting
201
+
202
+ Generate a final structured summary according to these sections:
203
+ {sections_text}
204
+ Format your response in markdown exactly as shown above."""
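To make the synthesis flow concrete, a short sketch of how the analyzer is expected to call this method (the summary strings are placeholders):

sections = ["resume_executif", "plan_action"]
chunk_summaries = [
    "## Segment 1 (0.0-15.0min)\n\n- Budget review kicked off ...",
    "## Segment 2 (15.0-27.5min)\n\n- Owners assigned for the open items ...",
]
prompt = VoxtralPrompts.get_synthesis_prompt(sections, chunk_summaries)
# The summaries are joined with the "=== SEGMENT SEPARATOR ===" marker and followed
# by the requested section headings, ready to be sent as a text-only request.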
src/ai/voxtral_spaces_analyzer.py ADDED
@@ -0,0 +1,398 @@
1
+ """
2
+ Voxtral analyzer optimized for Hugging Face Spaces.
3
+
4
+ This module provides audio analysis using Voxtral models with:
5
+ - Only Transformers backend (no MLX or API)
6
+ - Only 8-bit quantized models for memory efficiency
7
+ - Zero GPU decorators for HF Spaces compute allocation
8
+ - Optimized memory management for Spaces environment
9
+ """
10
+
11
+ import torch
12
+ import torchaudio
13
+ import tempfile
14
+ import time
15
+ import gc
16
+ import os
17
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
18
+ from pydub import AudioSegment
19
+ from typing import List, Dict, Tuple, Optional
20
+
21
+ from ..utils.zero_gpu_manager import gpu_model_loading, gpu_inference, gpu_long_task, ZeroGPUManager
22
+ from .prompts_config import VoxtralPrompts
23
+ from ..utils.token_tracker import TokenTracker
24
+
25
+
26
+ class VoxtralSpacesAnalyzer:
27
+ """
28
+ Voxtral analyzer optimized for Hugging Face Spaces.
29
+
30
+ Features:
31
+ - Only 8-bit quantized models
32
+ - Zero GPU decorators for efficient compute allocation
33
+ - Memory-optimized processing for Spaces constraints
34
+ """
35
+
36
+ def __init__(self, model_name: str = "Voxtral-Mini-3B-2507"):
37
+ """
38
+ Initialize the Voxtral analyzer for HF Spaces.
39
+
40
+ Args:
41
+ model_name (str): Name of the Voxtral model to use (8-bit only)
42
+ """
43
+ # Only 8-bit models are supported in Spaces version
44
+ model_mapping = {
45
+ "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
46
+ "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507"
47
+ }
48
+
49
+ self.model_name = model_mapping.get(model_name, "mistralai/Voxtral-Mini-3B-2507")
50
+ self.max_duration_minutes = 20 # Reduced for Spaces environment
51
+ self.gpu_manager = ZeroGPUManager()
52
+
53
+ # Model and processor will be loaded on-demand with GPU decorators
54
+ self.model = None
55
+ self.processor = None
56
+ self.token_tracker = TokenTracker("Transformers-8bit")
57
+
58
+ print(f"🚀 VoxtralSpacesAnalyzer initialized for model: {model_name}")
59
+
60
+ @gpu_model_loading(duration=120)
61
+ def _load_model_if_needed(self):
62
+ """Load model and processor with GPU allocation if not already loaded."""
63
+ if self.model is not None and self.processor is not None:
64
+ return
65
+
66
+ device = self.gpu_manager.get_device()
67
+ dtype = self.gpu_manager.dtype
68
+ print(f"🔄 Loading Voxtral model on {device} with {dtype}...")
69
+
70
+ # Load processor
71
+ self.processor = AutoProcessor.from_pretrained(self.model_name)
72
+
73
+ # Model loading strategy based on device and environment
74
+ if self.gpu_manager.is_spaces_environment() and device == "cuda":
75
+ # HF Spaces with CUDA: use 8-bit quantization
76
+ print("📦 Loading with 8-bit quantization for HF Spaces")
77
+ self.model = VoxtralForConditionalGeneration.from_pretrained(
78
+ self.model_name,
79
+ load_in_8bit=True,
80
+ device_map="auto",
81
+ torch_dtype=dtype,
82
+ low_cpu_mem_usage=True
83
+ )
84
+ elif device == "mps":
85
+ # Local Mac with MPS: standard loading with MPS-compatible settings
86
+ print("📦 Loading with MPS optimization for local Mac testing")
87
+ self.model = VoxtralForConditionalGeneration.from_pretrained(
88
+ self.model_name,
89
+ torch_dtype=dtype,
90
+ low_cpu_mem_usage=True
91
+ )
92
+ self.model = self.model.to(device)
93
+ elif device == "cuda":
94
+ # Local CUDA: can use more aggressive optimizations
95
+ print("📦 Loading with CUDA optimization for local testing")
96
+ self.model = VoxtralForConditionalGeneration.from_pretrained(
97
+ self.model_name,
98
+ torch_dtype=dtype,
99
+ low_cpu_mem_usage=True,
100
+ device_map="auto"
101
+ )
102
+ else:
103
+ # CPU fallback
104
+ print("📦 Loading on CPU")
105
+ self.model = VoxtralForConditionalGeneration.from_pretrained(
106
+ self.model_name,
107
+ torch_dtype=dtype,
108
+ low_cpu_mem_usage=True
109
+ )
110
+
111
+ print(f"✅ Model loaded successfully on {device}")
112
+
113
+ # Print memory info if available
114
+ if self.gpu_manager.is_gpu_available():
115
+ memory_info = self.gpu_manager.get_memory_info()
116
+ if memory_info["available"]:
117
+ if memory_info["device"] == "cuda":
118
+ allocated_gb = memory_info["allocated"] / (1024**3)
119
+ print(f"📊 CUDA Memory allocated: {allocated_gb:.2f}GB")
120
+ elif memory_info["device"] == "mps":
121
+ allocated_mb = memory_info["allocated"] / (1024**2)
122
+ print(f"📊 MPS Memory allocated: {allocated_mb:.1f}MB")
123
+
124
+ def _get_audio_duration(self, wav_path: str) -> float:
125
+ """Get audio duration in minutes."""
126
+ audio = AudioSegment.from_file(wav_path)
127
+ return len(audio) / (1000 * 60)
128
+
129
+ def _create_time_chunks(self, wav_path: str) -> List[Tuple[float, float]]:
130
+ """Create time-based chunks for processing."""
131
+ total_duration = self._get_audio_duration(wav_path) * 60 # seconds
132
+ max_chunk_seconds = self.max_duration_minutes * 60
133
+
134
+ if total_duration <= max_chunk_seconds:
135
+ return [(0, total_duration)]
136
+
137
+ chunks = []
138
+ current_start = 0
139
+
140
+ while current_start < total_duration:
141
+ chunk_end = min(current_start + max_chunk_seconds, total_duration)
142
+ chunks.append((current_start, chunk_end))
143
+ current_start = chunk_end
144
+
145
+ return chunks
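For example, with the 20-minute ceiling set in __init__, a 50-minute recording is split into three windows (values in seconds):

# self._create_time_chunks("meeting.wav") on a 50-minute file
# -> [(0, 1200), (1200, 2400), (2400, 3000)]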
146
+
147
+ def _extract_audio_chunk(self, wav_path: str, start_time: float, end_time: float) -> str:
148
+ """Extract audio chunk between timestamps."""
149
+ audio = AudioSegment.from_file(wav_path)
150
+
151
+ start_ms = int(start_time * 1000)
152
+ end_ms = int(end_time * 1000)
153
+
154
+ chunk = audio[start_ms:end_ms]
155
+
156
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_chunk:
157
+ chunk_path = tmp_chunk.name
158
+
159
+ chunk.export(chunk_path, format="wav")
160
+ return chunk_path
161
+
162
+ @gpu_long_task(duration=300)
163
+ def analyze_audio_chunks(
164
+ self,
165
+ wav_path: str,
166
+ language: str = "french",
167
+ selected_sections: list = None,
168
+ chunk_duration_minutes: int = 15,
169
+ reference_speakers_data: str = None
170
+ ) -> Dict[str, str]:
171
+ """
172
+ Analyze audio by chunks using Voxtral with Zero GPU.
173
+
174
+ Args:
175
+ wav_path (str): Path to audio file
176
+ language (str): Expected language
177
+ selected_sections (list): Analysis sections to include
178
+ chunk_duration_minutes (int): Chunk duration in minutes
179
+ reference_speakers_data (str): Speaker diarization data
180
+
181
+ Returns:
182
+ Dict[str, str]: Analysis results
183
+ """
184
+ try:
185
+ # Ensure model is loaded
186
+ self._load_model_if_needed()
187
+
188
+ total_start_time = time.time()
189
+ duration = self._get_audio_duration(wav_path)
190
+ print(f"🎵 Audio duration: {duration:.1f} minutes")
191
+
192
+ # Create chunks
193
+ chunks = self._create_time_chunks(wav_path)
194
+ print(f"📦 Splitting into {len(chunks)} chunks")
195
+
196
+ chunk_summaries = []
197
+
198
+ for i, (start_time, end_time) in enumerate(chunks):
199
+ print(f"🎯 Processing chunk {i+1}/{len(chunks)} ({start_time/60:.1f}-{end_time/60:.1f}min)")
200
+
201
+ chunk_start_time = time.time()
202
+ chunk_path = self._extract_audio_chunk(wav_path, start_time, end_time)
203
+
204
+ try:
205
+ # Analyze chunk with Zero GPU
206
+ chunk_summary = self._analyze_single_chunk(
207
+ chunk_path,
208
+ selected_sections,
209
+ reference_speakers_data,
210
+ i + 1,
211
+ len(chunks),
212
+ start_time,
213
+ end_time
214
+ )
215
+
216
+ chunk_summaries.append(f"## Segment {i+1} ({start_time/60:.1f}-{end_time/60:.1f}min)\n\n{chunk_summary}")
217
+
218
+ chunk_duration = time.time() - chunk_start_time
219
+ print(f"✅ Chunk {i+1} analyzed in {chunk_duration:.1f}s")
220
+
221
+ except Exception as e:
222
+ print(f"❌ Error processing chunk {i+1}: {e}")
223
+ chunk_summaries.append(f"**Segment {i+1}:** Processing error")
224
+ finally:
225
+ # Clean up chunk file
226
+ if os.path.exists(chunk_path):
227
+ os.remove(chunk_path)
228
+
229
+ # GPU cleanup after each chunk
230
+ self.gpu_manager.cleanup_gpu()
231
+
232
+ # Final synthesis if multiple chunks
233
+ if len(chunk_summaries) > 1:
234
+ print(f"🔄 Final synthesis of {len(chunk_summaries)} segments...")
235
+ combined_content = "\n\n".join(chunk_summaries)
236
+ final_analysis = self._synthesize_chunks_final(combined_content, selected_sections)
237
+ else:
238
+ final_analysis = chunk_summaries[0] if chunk_summaries else "No analysis available."
239
+
240
+ total_duration = time.time() - total_start_time
241
+ print(f"⏱️ Total analysis completed in {total_duration:.1f}s for {duration:.1f}min of audio")
242
+
243
+ # Print token usage
244
+ self.token_tracker.print_summary()
245
+
246
+ return {"transcription": final_analysis}
247
+
248
+ finally:
249
+ # Final GPU cleanup
250
+ self.gpu_manager.cleanup_gpu()
251
+
252
+ @gpu_inference(duration=120)
253
+ def _analyze_single_chunk(
254
+ self,
255
+ chunk_path: str,
256
+ selected_sections: list,
257
+ reference_speakers_data: str,
258
+ chunk_num: int,
259
+ total_chunks: int,
260
+ start_time: float,
261
+ end_time: float
262
+ ) -> str:
263
+ """Analyze a single audio chunk with GPU inference."""
264
+ # Build analysis prompt
265
+ sections_list = selected_sections if selected_sections else ["resume_executif"]
266
+ chunk_info = f"SEGMENT {chunk_num}/{total_chunks} ({start_time/60:.1f}-{end_time/60:.1f}min)" if total_chunks > 1 else None
267
+
268
+ prompt_text = VoxtralPrompts.get_meeting_summary_prompt(
269
+ sections_list,
270
+ reference_speakers_data,
271
+ chunk_info,
272
+ None
273
+ )
274
+
275
+ # Create conversation for audio instruct mode
276
+ conversation = [{
277
+ "role": "user",
278
+ "content": [
279
+ {"type": "audio", "path": chunk_path},
280
+ {"type": "text", "text": prompt_text},
281
+ ],
282
+ }]
283
+
284
+ # Process with chat template
285
+ inputs = self.processor.apply_chat_template(conversation, return_tensors="pt")
286
+ device = self.gpu_manager.get_device()
287
+ dtype = self.gpu_manager.dtype if hasattr(self.gpu_manager, 'dtype') else torch.float16
288
+
289
+ # Move inputs to device with appropriate dtype
290
+ if hasattr(inputs, 'to'):
291
+ inputs = inputs.to(device, dtype=dtype)
292
+ else:
293
+ # Handle BatchFeature or dict-like inputs
294
+ inputs = {k: v.to(device, dtype=dtype) if hasattr(v, 'to') else v for k, v in inputs.items()}
295
+
296
+ # Generate with optimized settings for Spaces
297
+ with torch.no_grad():
298
+ outputs = self.model.generate(
299
+ **inputs,
300
+ max_new_tokens=8000, # Reduced for 8-bit model efficiency
301
+ temperature=0.2,
302
+ do_sample=True,
303
+ pad_token_id=self.processor.tokenizer.eos_token_id,
304
+ use_cache=True,
305
+ output_scores=False
306
+ )
307
+
308
+ # Decode response
309
+ input_tokens = inputs.input_ids.shape[1]
310
+ output_tokens_count = outputs.shape[1] - input_tokens
311
+
312
+ chunk_summary = self.processor.batch_decode(
313
+ outputs[:, inputs.input_ids.shape[1]:],
314
+ skip_special_tokens=True
315
+ )[0].strip()
316
+
317
+ # Track tokens
318
+ self.token_tracker.add_chunk_tokens(input_tokens, output_tokens_count)
319
+
320
+ return chunk_summary
321
+
322
+ @gpu_inference(duration=60)
323
+ def _synthesize_chunks_final(self, combined_content: str, selected_sections: list) -> str:
324
+ """Final synthesis of all chunks with GPU inference."""
325
+ try:
326
+ # Build synthesis prompt
327
+ sections_text = ""
328
+ if selected_sections:
329
+ for section_key in selected_sections:
330
+ if section_key in VoxtralPrompts.AVAILABLE_SECTIONS:
331
+ section = VoxtralPrompts.AVAILABLE_SECTIONS[section_key]
332
+ sections_text += f"\n{section['title']}\n{section['description']}\n"
333
+
334
+ synthesis_prompt = f"""Here are detailed analyses from multiple meeting segments:
335
+
336
+ {combined_content}
337
+
338
+ CRITICAL INSTRUCTION - RESPONSE LANGUAGE:
339
+ - DETECT the language used in the segments above
340
+ - RESPOND OBLIGATORILY in the same detected language
341
+ - If segments are in French → respond in French
342
+ - If segments are in English → respond in English
343
+
344
+ Now synthesize these analyses into a coherent global summary structured according to the requested sections:{sections_text}
345
+
346
+ Provide a unified synthesis that combines and summarizes information from all segments coherently."""
347
+
348
+ # Generate synthesis
349
+ conversation = [{"role": "user", "content": synthesis_prompt}]
350
+ inputs = self.processor.apply_chat_template(conversation, return_tensors="pt")
351
+ device = self.gpu_manager.get_device()
352
+ dtype = self.gpu_manager.dtype if hasattr(self.gpu_manager, 'dtype') else torch.float16
353
+
354
+ # Move inputs to device with appropriate dtype
355
+ if hasattr(inputs, 'to'):
356
+ inputs = inputs.to(device, dtype=dtype)
357
+ else:
358
+ inputs = {k: v.to(device, dtype=dtype) if hasattr(v, 'to') else v for k, v in inputs.items()}
359
+
360
+ with torch.no_grad():
361
+ outputs = self.model.generate(
362
+ **inputs,
363
+ max_new_tokens=3000, # Reduced for 8-bit efficiency
364
+ temperature=0.1,
365
+ do_sample=True,
366
+ pad_token_id=self.processor.tokenizer.eos_token_id
367
+ )
368
+
369
+ # Decode synthesis
370
+ input_length = inputs.input_ids.shape[1]
371
+ output_tokens_count = outputs.shape[1] - input_length
372
+
373
+ final_synthesis = self.processor.tokenizer.decode(
374
+ outputs[0][input_length:],
375
+ skip_special_tokens=True
376
+ ).strip()
377
+
378
+ self.token_tracker.add_synthesis_tokens(input_length, output_tokens_count)
379
+
380
+ return f"# Global Meeting Summary\n\n{final_synthesis}\n\n---\n\n## Details by Segment\n\n{combined_content}"
381
+
382
+ except Exception as e:
383
+ print(f"❌ Error during final synthesis: {e}")
384
+ return f"# Meeting Summary\n\n⚠️ Error during final synthesis: {str(e)}\n\n## Segment Analyses\n\n{combined_content}"
385
+
386
+ def cleanup_model(self):
387
+ """Clean up model from memory."""
388
+ if self.model is not None:
389
+ self.model.to('cpu')
390
+ del self.model
391
+ self.model = None
392
+
393
+ if self.processor is not None:
394
+ del self.processor
395
+ self.processor = None
396
+
397
+ self.gpu_manager.cleanup_gpu()
398
+ print("🧹 Voxtral Spaces model cleaned up")
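Taken together, the intended lifecycle is load on demand, analyze, then release. A usage sketch (file path and section keys are placeholders):

analyzer = VoxtralSpacesAnalyzer("Voxtral-Small-24B-2507")
try:
    result = analyzer.analyze_audio_chunks(
        wav_path="meeting.wav",
        selected_sections=["resume_executif", "decisions_prises"],
        chunk_duration_minutes=15,
    )
    summary_markdown = result["transcription"]   # single markdown string
finally:
    analyzer.cleanup_model()                     # drop weights and free GPU memory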
src/ui/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """UI components for HF Spaces version."""
src/ui/spaces_interface.py ADDED
@@ -0,0 +1,666 @@
1
+ """
2
+ Gradio application for intelligent meeting analysis with Voxtral - HF Spaces version.
3
+
4
+ Version adapted for Hugging Face Spaces with:
5
+ - Transformers mode only (MLX and API removed)
6
+ - 8-bit models only
7
+ - Native MCP support
8
+ - Zero GPU decorators
9
+ """
10
+
11
+ import os
12
+ import gradio as gr
13
+ from dotenv import load_dotenv
14
+
15
+ from ..ai.voxtral_spaces_analyzer import VoxtralSpacesAnalyzer
16
+ from ..ai.diarization import SpeakerDiarization
17
+ from ..utils.zero_gpu_manager import ZeroGPUManager, gpu_inference
18
+
19
+ # Import labels from main project
20
+ import sys
21
+ import os
22
+ sys.path.append(os.path.join(os.path.dirname(__file__), '../../../src'))
23
+ from meetingnotes.ui.labels import UILabels
24
+
25
+ # Load environment variables from the .env file
26
+ load_dotenv()
27
+
28
+ # Global instances for MCP functions
29
+ analyzer = None
30
+ diarization = None
31
+ gpu_manager = None
32
+ current_diarization_context = None
33
+
34
+ def initialize_components():
35
+ """Initialize global components for MCP functions."""
36
+ global analyzer, diarization, gpu_manager
37
+ if analyzer is None:
38
+ analyzer = VoxtralSpacesAnalyzer()
39
+ diarization = SpeakerDiarization()
40
+ gpu_manager = ZeroGPUManager()
41
+
42
+ # MCP Tools - exposed automatically by Gradio
43
+ @gpu_inference(duration=300)
44
+ def analyze_meeting_audio(
45
+ audio_file: str,
46
+ sections: list = None,
47
+ model_name: str = "Voxtral-Mini-3B-2507",
48
+ enable_diarization: bool = False,
49
+ num_speakers: int = None
50
+ ) -> dict:
51
+ """
52
+ Analyze meeting audio and generate structured summaries using Voxtral AI.
53
+
54
+ This function processes audio files to extract insights, identify speakers,
55
+ and generate structured meeting summaries with configurable sections.
56
+
57
+ Args:
58
+ audio_file: Path to the audio file to analyze (MP3, WAV, M4A, OGG)
59
+ sections: List of analysis sections to include (executive_summary, action_plan, etc.)
60
+ model_name: Voxtral model to use for analysis (Mini-3B or Small-24B)
61
+ enable_diarization: Whether to identify and separate speakers
62
+ num_speakers: Expected number of speakers (optional, for better diarization)
63
+
64
+ Returns:
65
+ Dictionary containing analysis results, processing time, and metadata
66
+ """
67
+ initialize_components()
68
+
69
+ if not os.path.exists(audio_file):
70
+ return {"error": "Audio file not found", "status": "failed"}
71
+
72
+ try:
73
+ import time
74
+ start_time = time.time()
75
+
76
+ # Set default sections if none provided
77
+ if sections is None:
78
+ sections = ["resume_executif", "discussions_principales", "plan_action"]
79
+
80
+ # Speaker diarization if enabled
81
+ speaker_data = None
82
+ if enable_diarization:
83
+ rttm_result, reference_segments = diarization.diarize_audio(
84
+ audio_file, num_speakers=num_speakers
85
+ )
86
+ if not rttm_result.startswith("❌"):
87
+ speaker_data = rttm_result
88
+
89
+ # Set model if different
90
+ if analyzer.model_name != f"mistralai/{model_name}":
91
+ analyzer.model_name = f"mistralai/{model_name}"
92
+ analyzer.cleanup_model()
93
+
94
+ # Analyze audio
95
+ results = analyzer.analyze_audio_chunks(
96
+ wav_path=audio_file,
97
+ language="auto",
98
+ selected_sections=sections,
99
+ chunk_duration_minutes=15,
100
+ reference_speakers_data=speaker_data
101
+ )
102
+
103
+ processing_time = time.time() - start_time
104
+
105
+ return {
106
+ "status": "completed",
107
+ "analysis": results.get("transcription", "No analysis available"),
108
+ "processing_time_seconds": processing_time,
109
+ "model_used": model_name,
110
+ "sections_analyzed": sections,
111
+ "diarization_enabled": enable_diarization
112
+ }
113
+
114
+ except Exception as e:
115
+ return {
116
+ "status": "failed",
117
+ "error": str(e),
118
+ "processing_time_seconds": time.time() - start_time if 'start_time' in locals() else 0
119
+ }
120
+ finally:
121
+ if gpu_manager:
122
+ gpu_manager.cleanup_gpu()
123
+
124
+ def get_available_sections() -> dict:
125
+ """Get available analysis sections for meeting summaries."""
126
+ from meetingnotes.ai.prompts_config import VoxtralPrompts
127
+ return {
128
+ "status": "success",
129
+ "sections": VoxtralPrompts.AVAILABLE_SECTIONS,
130
+ "total_sections": len(VoxtralPrompts.AVAILABLE_SECTIONS)
131
+ }
132
+
133
+ def get_meeting_templates() -> dict:
134
+ """Get pre-configured meeting analysis templates."""
135
+ templates = {
136
+ "action_meeting": {
137
+ "name": "Action-Oriented Meeting",
138
+ "description": "For meetings focused on decisions and action items",
139
+ "recommended_sections": ["resume_executif", "discussions_principales", "plan_action", "decisions_prises", "prochaines_etapes"]
140
+ },
141
+ "info_meeting": {
142
+ "name": "Information Meeting",
143
+ "description": "For presentations and informational sessions",
144
+ "recommended_sections": ["resume_executif", "sujets_principaux", "points_importants", "questions_discussions", "elements_suivi"]
145
+ }
146
+ }
147
+ return {"status": "success", "templates": templates, "total_templates": len(templates)}
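A typical client would chain the two helpers above with the analysis tool, roughly as follows (sketch only; the file path is a placeholder):

templates = get_meeting_templates()["templates"]
sections = templates["action_meeting"]["recommended_sections"]
report = analyze_meeting_audio("standup.wav", sections=sections, enable_diarization=False)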
148
+
149
+ # Handlers adapted for HF Spaces
150
+ def handle_input_mode_change(input_mode):
151
+ """Handle input mode switching."""
152
+ if input_mode == UILabels.INPUT_MODE_AUDIO:
153
+ return gr.update(visible=True), gr.update(visible=False)
154
+ else:
155
+ return gr.update(visible=False), gr.update(visible=True)
156
+
157
+ def extract_audio_from_video(video_file, language):
158
+ """Extract audio from video (placeholder for HF Spaces)."""
159
+ if video_file is None:
160
+ return None, gr.update(visible=True), gr.update(visible=False), UILabels.INPUT_MODE_AUDIO, language
161
+
162
+ # For HF Spaces, we assume video processing is handled client-side
163
+ # or that audio files are uploaded directly
164
+ return video_file, gr.update(visible=True), gr.update(visible=False), UILabels.INPUT_MODE_AUDIO, language
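If server-side extraction is wanted later, a minimal variant could lean on pydub, which already backs the audio handling elsewhere in this commit; this is a sketch under that assumption, not part of the upload:

import tempfile
from pydub import AudioSegment

def _extract_audio_track(video_path: str) -> str:
    """Pull the audio track out of a video container into a temporary mono WAV."""
    audio = AudioSegment.from_file(video_path)   # ffmpeg handles the demuxing
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    audio.set_channels(1).export(wav_path, format="wav")
    return wav_path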
165
+
166
+ @gpu_inference(duration=180)
167
+ def handle_diarization(audio_file, hf_token, num_speakers, start_trim, end_trim):
168
+ """Handle diarization, adapted for HF Spaces."""
169
+ global current_diarization_context
170
+
171
+ initialize_components()
172
+
173
+ if audio_file is None:
174
+ return gr.update(choices=[], visible=False), None, gr.update(visible=False)
175
+
176
+ try:
177
+ # Run diarization with the given parameters
178
+ rttm_result, reference_segments = diarization.diarize_audio(
179
+ audio_file, num_speakers=num_speakers
180
+ )
181
+
182
+ if rttm_result.startswith("❌"):
183
+ return gr.update(choices=[], visible=False), None, gr.update(visible=False)
184
+
185
+ # Save the context for the main analysis
186
+ current_diarization_context = rttm_result
187
+
188
+ # Create the speaker selection buttons
189
+ speaker_choices = []
190
+ first_audio = None
191
+
192
+ for i, segment in enumerate(reference_segments):
193
+ speaker_id = segment['speaker']
194
+ speaker_choices.append((f"{speaker_id} ({segment['duration']:.1f}s)", speaker_id))
195
+ if i == 0: # First audio for the preview
196
+ first_audio = segment['audio_path']
197
+
198
+ if speaker_choices:
199
+ return (
200
+ gr.update(choices=speaker_choices, value=speaker_choices[0][1], visible=True),
201
+ first_audio,
202
+ gr.update(visible=True)
203
+ )
204
+ else:
205
+ return gr.update(choices=[], visible=False), None, gr.update(visible=False)
206
+
207
+ except Exception as e:
208
+ print(f"Diarization error: {e}")
209
+ return gr.update(choices=[], visible=False), None, gr.update(visible=False)
210
+
211
+ def handle_speaker_selection(selected_speaker, current_name):
212
+ """Handle speaker selection."""
213
+ # Find the audio file matching the selected speaker
214
+ # To keep things simple, just return a placeholder
215
+ return None, f"Locuteur_{selected_speaker}"
216
+
217
+ def handle_speaker_rename(new_name):
218
+ """Handle speaker renaming."""
219
+ if new_name.strip():
220
+ renamed_info = f"Speaker renamed: {new_name}"
221
+ return gr.update(value=renamed_info, visible=True), gr.update(visible=True)
222
+ return gr.update(visible=False), gr.update(visible=False)
223
+
224
+ @gpu_inference(duration=300)
225
+ def handle_direct_transcription(
226
+ audio_file, hf_token, language, transcription_mode, model_key,
227
+ selected_sections, diarization_data, start_trim, end_trim, chunk_duration
228
+ ):
229
+ """Handle direct analysis, adapted for HF Spaces."""
230
+ initialize_components()
231
+
232
+ if audio_file is None:
233
+ return "", "❌ Please upload an audio file first."
234
+
235
+ try:
236
+ # Extract the model name from transcription_mode
237
+ if "Mini" in transcription_mode:
238
+ model_name = "Voxtral-Mini-3B-2507"
239
+ else:
240
+ model_name = "Voxtral-Small-24B-2507"
241
+
242
+ # Configure the analyzer
243
+ if analyzer.model_name != f"mistralai/{model_name}":
244
+ analyzer.model_name = f"mistralai/{model_name}"
245
+ analyzer.cleanup_model()
246
+
247
+ # Run the analysis
248
+ results = analyzer.analyze_audio_chunks(
249
+ wav_path=audio_file,
250
+ language="auto",
251
+ selected_sections=selected_sections,
252
+ chunk_duration_minutes=int(chunk_duration),
253
+ reference_speakers_data=diarization_data
254
+ )
255
+
256
+ return "", results.get("transcription", "No analysis available")
257
+
258
+ except Exception as e:
259
+ error_msg = f"❌ Error during analysis: {str(e)}"
260
+ return "", error_msg
261
+ finally:
262
+ if gpu_manager:
263
+ gpu_manager.cleanup_gpu()
264
+
265
+ def create_spaces_interface():
266
+ """
267
+ Main entry point for the HF Spaces interface.
268
+
269
+ Interface identical to the original project but simplified:
270
+ - Transformers mode only (no MLX/API)
271
+ - 8-bit models only
272
+ - Native MCP support
273
+ """
274
+ # Initialize components
275
+ initialize_components()
276
+
277
+ # Retrieve the Hugging Face token from environment variables
278
+ hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
279
+ if hf_token is None:
280
+ print("⚠️ Warning: HF_TOKEN environment variable not found")
281
+
282
+ # Custom Glass theme configuration (same as the original)
283
+ custom_glass_theme = gr.themes.Glass(
284
+ primary_hue=gr.themes.colors.blue,
285
+ secondary_hue=gr.themes.colors.gray,
286
+ text_size=gr.themes.sizes.text_md,
287
+ spacing_size=gr.themes.sizes.spacing_md,
288
+ radius_size=gr.themes.sizes.radius_md
289
+ )
290
+
291
+ with gr.Blocks(
292
+ theme=custom_glass_theme,
293
+ title="MeetingNotes - AI Analysis with Voxtral",
294
+ css="""
295
+ .gradio-container {
296
+ max-width: 1200px !important;
297
+ margin: 0 auto !important;
298
+ }
299
+ .main-header {
300
+ text-align: center;
301
+ margin-bottom: 30px;
302
+ padding: 20px;
303
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
304
+ border-radius: 15px;
305
+ color: white;
306
+ box-shadow: 0 8px 32px rgba(31, 38, 135, 0.37);
307
+ }
308
+ .processing-section {
309
+ background: rgba(255, 255, 255, 0.1);
310
+ border-radius: 10px;
311
+ padding: 20px;
312
+ margin: 15px 0;
313
+ border: 1px solid rgba(255, 255, 255, 0.2);
314
+ backdrop-filter: blur(10px);
315
+ }
316
+ .results-section {
317
+ margin-top: 25px;
318
+ }
319
+ """
320
+ ) as demo:
321
+ # Main header with style (same as the original)
322
+ with gr.Column(elem_classes="main-header"):
323
+ gr.Markdown(
324
+ f"""
325
+ # {UILabels.MAIN_TITLE}
326
+ {UILabels.MAIN_SUBTITLE}
327
+ {UILabels.MAIN_DESCRIPTION}
328
+ """,
329
+ elem_classes="header-content"
330
+ )
331
+
332
+ # Processing mode section (SIMPLIFIED - Transformers 8-bit only)
333
+ with gr.Column(elem_classes="processing-section"):
334
+ gr.Markdown("## 🔧 Processing Configuration")
335
+ gr.Markdown("*HF Spaces version - Transformers backend with 8-bit quantization*")
336
+
337
+ # Model selection (8-bit models only)
338
+ with gr.Row():
339
+ with gr.Column():
340
+ local_model_choice = gr.Radio(
341
+ choices=[UILabels.MODEL_MINI, UILabels.MODEL_SMALL],
342
+ value=UILabels.MODEL_MINI,
343
+ label="Model Selection"
344
+ )
345
+
346
+ with gr.Column():
347
+ local_precision_choice = gr.Radio(
348
+ choices=[UILabels.PRECISION_8BIT],
349
+ value=UILabels.PRECISION_8BIT,
350
+ label="Precision (Fixed for HF Spaces)"
351
+ )
352
+
353
+ # Input mode selection (same as the original)
354
+ with gr.Column(elem_classes="processing-section"):
355
+ gr.Markdown(UILabels.INPUT_MODE_TITLE)
356
+
357
+ input_mode = gr.Radio(
358
+ choices=[UILabels.INPUT_MODE_AUDIO, UILabels.INPUT_MODE_VIDEO],
359
+ value=UILabels.INPUT_MODE_AUDIO,
360
+ label=UILabels.INPUT_MODE_LABEL
361
+ )
362
+
363
+ # Audio section (default mode) - same as the original
364
+ with gr.Column(elem_classes="processing-section") as audio_section:
365
+ gr.Markdown(UILabels.AUDIO_MODE_TITLE)
366
+
367
+ audio_input = gr.Audio(
368
+ label=UILabels.AUDIO_INPUT_LABEL,
369
+ type="filepath",
370
+ show_label=True,
371
+ interactive=True
372
+ )
373
+
374
+ # Video section (hidden by default) - same as the original
375
+ with gr.Column(elem_classes="processing-section", visible=False) as video_section:
376
+ gr.Markdown(UILabels.VIDEO_MODE_TITLE)
377
+
378
+ video_input = gr.File(
379
+ label=UILabels.VIDEO_INPUT_LABEL,
380
+ file_types=["video"]
381
+ )
382
+
383
+ btn_extract_audio = gr.Button(
384
+ UILabels.EXTRACT_AUDIO_BUTTON,
385
+ variant="secondary",
386
+ size="lg"
387
+ )
388
+
389
+ # Trim options section (same as the original)
390
+ with gr.Column(elem_classes="processing-section"):
391
+ with gr.Accordion(UILabels.TRIM_OPTIONS_TITLE, open=False):
392
+ with gr.Row():
393
+ start_trim_input = gr.Number(
394
+ label=UILabels.START_TRIM_LABEL,
395
+ value=0,
396
+ minimum=0,
397
+ maximum=3600
398
+ )
399
+ end_trim_input = gr.Number(
400
+ label=UILabels.END_TRIM_LABEL,
401
+ value=0,
402
+ minimum=0,
403
+ maximum=3600
404
+ )
405
+
406
+ # Diarization section (same as the original)
407
+ with gr.Column(elem_classes="processing-section"):
408
+ with gr.Accordion(UILabels.DIARIZATION_TITLE, open=False):
409
+ gr.Markdown(UILabels.DIARIZATION_DESCRIPTION)
410
+
411
+ with gr.Row():
412
+ num_speakers_input = gr.Number(
413
+ label=UILabels.NUM_SPEAKERS_LABEL,
414
+ value=None,
415
+ minimum=1,
416
+ maximum=10,
417
+ placeholder=UILabels.NUM_SPEAKERS_PLACEHOLDER
418
+ )
419
+
420
+ btn_diarize = gr.Button(
421
+ UILabels.DIARIZE_BUTTON,
422
+ variant="secondary",
423
+ size="lg"
424
+ )
425
+
426
+ # Reference segments section
427
+ gr.Markdown(UILabels.REFERENCE_SEGMENTS_TITLE)
428
+ gr.Markdown(UILabels.REFERENCE_SEGMENTS_DESCRIPTION)
429
+
430
+ speaker_buttons = gr.Radio(
431
+ label=UILabels.SPEAKERS_DETECTED_LABEL,
432
+ choices=[],
433
+ visible=False
434
+ )
435
+
436
+ reference_audio_player = gr.Audio(
437
+ label=UILabels.REFERENCE_AUDIO_LABEL,
438
+ type="filepath",
439
+ interactive=False,
440
+ visible=True
441
+ )
442
+
443
+ # Speaker renaming section
444
+ with gr.Column(visible=False) as rename_section:
445
+ gr.Markdown(UILabels.SPEAKER_RENAME_TITLE)
446
+
447
+ with gr.Row():
448
+ speaker_name_input = gr.Textbox(
449
+ label=UILabels.SPEAKER_NAME_LABEL,
450
+ placeholder=UILabels.SPEAKER_NAME_PLACEHOLDER
451
+ )
452
+
453
+ btn_apply_rename = gr.Button(
454
+ UILabels.APPLY_RENAME_BUTTON,
455
+ variant="primary",
456
+ size="sm"
457
+ )
458
+
459
+ renamed_speakers_output = gr.Textbox(
460
+ label=UILabels.IDENTIFIED_SPEAKERS_LABEL,
461
+ value="",
462
+ lines=5,
463
+ interactive=False,
464
+ visible=False
465
+ )
466
+
467
+ # Main analysis section (same as the original)
468
+ with gr.Column(elem_classes="processing-section"):
469
+ gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
470
+ gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
471
+
472
+ # Chunk size control
473
+ chunk_duration_slider = gr.Slider(
474
+ minimum=5,
475
+ maximum=25,
476
+ value=15,
477
+ step=5,
478
+ label=UILabels.CHUNK_DURATION_LABEL
479
+ )
480
+
481
+ # Summary section configuration
482
+ gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
483
+ gr.Markdown(UILabels.SUMMARY_SECTIONS_DESCRIPTION)
484
+
485
+ # Quick preset buttons
486
+ with gr.Row():
487
+ btn_preset_action = gr.Button(UILabels.PRESET_ACTION_BUTTON, variant="secondary", size="sm")
488
+ btn_preset_info = gr.Button(UILabels.PRESET_INFO_BUTTON, variant="secondary", size="sm")
489
+ btn_preset_complet = gr.Button(UILabels.PRESET_COMPLETE_BUTTON, variant="secondary", size="sm")
490
+
491
+ with gr.Row():
492
+ with gr.Column():
493
+ gr.Markdown(UILabels.ACTION_SECTIONS_TITLE)
494
+ section_resume_executif = gr.Checkbox(label=UILabels.SECTION_EXECUTIVE_SUMMARY, value=True)
495
+ section_discussions = gr.Checkbox(label=UILabels.SECTION_MAIN_DISCUSSIONS, value=True)
496
+ section_plan_action = gr.Checkbox(label=UILabels.SECTION_ACTION_PLAN, value=True)
497
+ section_decisions = gr.Checkbox(label=UILabels.SECTION_DECISIONS, value=True)
498
+ section_prochaines_etapes = gr.Checkbox(label=UILabels.SECTION_NEXT_STEPS, value=True)
499
+
500
+ with gr.Column():
501
+ gr.Markdown(UILabels.INFO_SECTIONS_TITLE)
502
+ section_sujets_principaux = gr.Checkbox(label=UILabels.SECTION_MAIN_TOPICS, value=False)
503
+ section_points_importants = gr.Checkbox(label=UILabels.SECTION_KEY_POINTS, value=False)
504
+ section_questions = gr.Checkbox(label=UILabels.SECTION_QUESTIONS, value=False)
505
+ section_elements_suivi = gr.Checkbox(label=UILabels.SECTION_FOLLOW_UP, value=False)
506
+
507
+ btn_direct_transcribe = gr.Button(
508
+ UILabels.ANALYZE_BUTTON,
509
+ variant="primary",
510
+ size="lg"
511
+ )
512
+
513
+ # Results section (same as the original)
514
+ with gr.Column(elem_classes="results-section"):
515
+ gr.Markdown(UILabels.RESULTS_TITLE)
516
+
517
+ final_summary_output = gr.Markdown(
518
+ value=UILabels.RESULTS_PLACEHOLDER,
519
+ label=UILabels.RESULTS_LABEL,
520
+ height=500
521
+ )
522
+
523
+ # Event handlers (adapted for HF Spaces)
524
+
525
+ # Handle input mode changes
526
+ input_mode.change(
527
+ fn=handle_input_mode_change,
528
+ inputs=[input_mode],
529
+ outputs=[audio_section, video_section]
530
+ )
531
+
532
+ # Audio extraction from video
533
+ btn_extract_audio.click(
534
+ fn=extract_audio_from_video,
535
+ inputs=[video_input, gr.State("french")],
536
+ outputs=[audio_input, audio_section, video_section, input_mode, gr.State("french")]
537
+ )
538
+
539
+ # Section preset functions (same as the original)
540
+ def preset_action():
541
+ return (True, True, True, True, True, False, False, False, False)
542
+
543
+ def preset_info():
544
+ return (True, False, False, False, False, True, True, True, True)
545
+
546
+ def preset_complet():
547
+ return (True, True, True, True, True, True, True, True, True)
548
+
549
+ # Handle direct analysis (Transformers only)
550
+ def handle_analysis_direct(
551
+ audio_file, hf_token, language, local_model, local_precision, start_trim, end_trim, chunk_duration,
552
+ s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
553
+ s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
554
+ ):
555
+ # Mode Transformers uniquement
556
+ transcription_mode = f"Transformers ({local_model} ({local_precision}))"
557
+ model_key = local_model
558
+
559
+ # Construire la liste des sections sรฉlectionnรฉes
560
+ sections_checkboxes = [
561
+ (s_resume, "resume_executif"),
562
+ (s_discussions, "discussions_principales"),
563
+ (s_plan_action, "plan_action"),
564
+ (s_decisions, "decisions_prises"),
565
+ (s_prochaines_etapes, "prochaines_etapes"),
566
+ (s_sujets_principaux, "sujets_principaux"),
567
+ (s_points_importants, "points_importants"),
568
+ (s_questions, "questions_discussions"),
569
+ (s_elements_suivi, "elements_suivi")
570
+ ]
571
+
572
+ selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]
573
+
574
+ # Appeler la fonction d'analyse directe
575
+ _, summary = handle_direct_transcription(
576
+ audio_file, hf_token, language, transcription_mode,
577
+ model_key, selected_sections, current_diarization_context, start_trim, end_trim, chunk_duration
578
+ )
579
+ return summary
580
+
581
+ # Preset events (same as the original)
+ btn_preset_action.click(
+ fn=preset_action,
+ outputs=[
+ section_resume_executif, section_discussions, section_plan_action,
+ section_decisions, section_prochaines_etapes, section_sujets_principaux,
+ section_points_importants, section_questions, section_elements_suivi
+ ]
+ )
+
+ btn_preset_info.click(
+ fn=preset_info,
+ outputs=[
+ section_resume_executif, section_discussions, section_plan_action,
+ section_decisions, section_prochaines_etapes, section_sujets_principaux,
+ section_points_importants, section_questions, section_elements_suivi
+ ]
+ )
+
+ btn_preset_complet.click(
+ fn=preset_complet,
+ outputs=[
+ section_resume_executif, section_discussions, section_plan_action,
+ section_decisions, section_prochaines_etapes, section_sujets_principaux,
+ section_points_importants, section_questions, section_elements_suivi
+ ]
+ )
+
+ # Main analysis (adapted for HF Spaces)
+ btn_direct_transcribe.click(
+ fn=handle_analysis_direct,
+ inputs=[
+ audio_input,
+ gr.State(value=hf_token),
+ gr.State("french"),
+ local_model_choice,
+ local_precision_choice,
+ start_trim_input,
+ end_trim_input,
+ chunk_duration_slider,
+ section_resume_executif,
+ section_discussions,
+ section_plan_action,
+ section_decisions,
+ section_prochaines_etapes,
+ section_sujets_principaux,
+ section_points_importants,
+ section_questions,
+ section_elements_suivi
+ ],
+ outputs=[final_summary_output]
+ )
+
+ # Diarization handling (adapted for HF Spaces)
+ btn_diarize.click(
+ fn=handle_diarization,
+ inputs=[audio_input, gr.State(value=hf_token), num_speakers_input, start_trim_input, end_trim_input],
+ outputs=[speaker_buttons, reference_audio_player, rename_section]
+ )
+
+ # Speaker selection handling
+ speaker_buttons.change(
+ fn=handle_speaker_selection,
+ inputs=[speaker_buttons, speaker_name_input],
+ outputs=[reference_audio_player, speaker_name_input]
+ )
+
+ # Speaker rename handling
+ btn_apply_rename.click(
+ fn=handle_speaker_rename,
+ inputs=[speaker_name_input],
+ outputs=[renamed_speakers_output, renamed_speakers_output]
+ )
+
+ # Footer (same as the original)
+ with gr.Row():
+ gr.Markdown(
+ """
+ ---
+ **MeetingNotes** | Powered by [Voxtral](https://mistral.ai/) |
+ 🚀 Intelligent meeting analysis | 💾 HF Spaces with Zero GPU
+ """,
+ elem_classes="footer-info"
+ )
+
+ return demo
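
The builder above only assembles and returns the gr.Blocks app. A minimal sketch of an entry point that could launch it on HF Spaces follows; the builder name create_interface, its import path, and the HF_TOKEN lookup are illustrative assumptions, not taken from this commit.

# app.py -- sketch only; the names below are assumed
import os
from src.ui.spaces_interface import create_interface  # hypothetical import path

if __name__ == "__main__":
    demo = create_interface(hf_token=os.getenv("HF_TOKEN"))
    demo.queue()   # serialize GPU-bound requests under Zero GPU
    demo.launch()  # HF Spaces supplies the host/port defaults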
src/utils/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """Utilities for HF Spaces version."""
+
+ from .zero_gpu_manager import ZeroGPUManager, gpu_inference, gpu_model_loading, gpu_long_task
+ from .token_tracker import TokenTracker
+
+ __all__ = ['ZeroGPUManager', 'gpu_inference', 'gpu_model_loading', 'gpu_long_task', 'TokenTracker']
src/utils/token_tracker.py ADDED
@@ -0,0 +1,64 @@
+ """
+ Token usage tracking utility for MeetingNotes HF Spaces.
+
+ This module provides a centralized way to track and report token consumption
+ for Transformers-based processing in the HF Spaces environment.
+ """
+
+
+ class TokenTracker:
+ """
+ Centralized token usage tracking for HF Spaces.
+
+ Tracks input and output tokens across different chunks and processing modes
+ to provide comprehensive usage statistics.
+ """
+
+ def __init__(self, mode: str = "Transformers-8bit"):
+ self.mode = mode
+ self.reset()
+
+ def reset(self):
+ """Reset all counters."""
+ self.chunks_processed = 0
+ self.total_input_tokens = 0
+ self.total_output_tokens = 0
+ self.synthesis_input_tokens = 0
+ self.synthesis_output_tokens = 0
+
+ def set_mode(self, mode: str):
+ """Set the processing mode for reporting."""
+ self.mode = mode
+
+ def add_chunk_tokens(self, input_tokens: int, output_tokens: int):
+ """Add tokens from processing a single chunk."""
+ self.chunks_processed += 1
+ self.total_input_tokens += input_tokens
+ self.total_output_tokens += output_tokens
+
+ print(f"📊 Stats {self.mode} Chunk {self.chunks_processed} - Input: {input_tokens} tokens, Output: {output_tokens} tokens")
+
+ def add_synthesis_tokens(self, input_tokens: int, output_tokens: int):
+ """Add tokens from synthesis processing."""
+ self.synthesis_input_tokens = input_tokens
+ self.synthesis_output_tokens = output_tokens
+
+ print(f"📊 Stats {self.mode} Synthesis - Input: {input_tokens} tokens, Output: {output_tokens} tokens")
+
+ def print_summary(self):
+ """Print final token usage summary."""
+ total_input = self.total_input_tokens + self.synthesis_input_tokens
+ total_output = self.total_output_tokens + self.synthesis_output_tokens
+ grand_total = total_input + total_output
+
+ print(f"\n📊 === TOKEN USAGE SUMMARY ({self.mode}) ===")
+ print(f"📦 Chunks processed: {self.chunks_processed}")
+ print(f"📥 Total input tokens: {total_input:,}")
+ print(f"📤 Total output tokens: {total_output:,}")
+ print(f"🔢 Grand total: {grand_total:,} tokens")
+
+ if self.synthesis_input_tokens > 0:
+ print(f" • Chunk analysis: {self.total_input_tokens + self.total_output_tokens:,} tokens")
+ print(f" • Final synthesis: {self.synthesis_input_tokens + self.synthesis_output_tokens:,} tokens")
+
+ print("=" * 50)
src/utils/zero_gpu_manager.py ADDED
@@ -0,0 +1,115 @@
+ """
+ Zero GPU management for Hugging Face Spaces.
+
+ This module provides decorators and utilities for efficient GPU usage
+ in the HF Spaces environment with automatic resource management.
+ """
+
+ import functools
+ import gc
+ import os
+ import torch
+ from typing import Callable, Any
+
+ # Import spaces if available (HF Spaces environment)
+ try:
+ import spaces
+ except ImportError:
+ spaces = None
+
+
+ class ZeroGPUManager:
+ """Manager for Zero GPU operations in HF Spaces."""
+
+ def __init__(self):
+ # Device selection with MPS support for local Mac testing
+ if torch.backends.mps.is_available():
+ self.device = "mps"
+ self.dtype = torch.float16  # MPS works better with float16
+ print("🚀 Using MPS (Apple Silicon) for local testing")
+ elif torch.cuda.is_available():
+ self.device = "cuda"
+ self.dtype = torch.bfloat16  # CUDA supports bfloat16
+ print("🚀 Using CUDA GPU")
+ else:
+ self.device = "cpu"
+ self.dtype = torch.float16  # CPU with float16 to save memory
+ print("⚠️ Using CPU")
+
+ self.is_spaces = os.getenv("SPACE_ID") is not None
+
+ @staticmethod
+ def gpu_task(duration: int = 60):
+ """
+ Decorator for GPU-intensive tasks.
+
+ Args:
+ duration: Expected duration in seconds for GPU allocation
+ """
+ def decorator(func: Callable) -> Callable:
+ if spaces is not None and hasattr(spaces, 'GPU'):
+ # Use HF Spaces GPU decorator
+ return spaces.GPU(duration=duration)(func)
+ else:
+ # Fallback for local development
+ return func
+ return decorator
+
+ @staticmethod
+ def cleanup_gpu():
+ """Clean up GPU memory after processing (CUDA/MPS/CPU)."""
+ if torch.backends.mps.is_available():
+ torch.mps.empty_cache()
+ elif torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ gc.collect()
+
+ def get_device(self) -> str:
+ """Get the appropriate device for processing."""
+ return self.device
+
+ def is_gpu_available(self) -> bool:
+ """Check if GPU (CUDA or MPS) is available."""
+ return torch.cuda.is_available() or torch.backends.mps.is_available()
+
+ def is_spaces_environment(self) -> bool:
+ """Check if running in the HF Spaces environment."""
+ return self.is_spaces
+
+ def get_memory_info(self) -> dict:
+ """Get current GPU memory information (CUDA or MPS)."""
+ if torch.cuda.is_available():
+ return {
+ "available": True,
+ "device": "cuda",
+ "allocated": torch.cuda.memory_allocated(),
+ "cached": torch.cuda.memory_reserved(),
+ "total": torch.cuda.get_device_properties(0).total_memory
+ }
+ elif torch.backends.mps.is_available():
+ return {
+ "available": True,
+ "device": "mps",
+ "allocated": torch.mps.current_allocated_memory(),
+ "driver_allocated": torch.mps.driver_allocated_memory(),
+ # MPS doesn't have total memory info readily available
+ "total": "N/A (MPS)"
+ }
+ else:
+ return {"available": False, "device": "cpu"}
+
+
+ # Convenience decorators
+ def gpu_inference(duration: int = 60):
+ """Decorator for GPU inference tasks."""
+ return ZeroGPUManager.gpu_task(duration=duration)
+
+
+ def gpu_model_loading(duration: int = 120):
+ """Decorator for GPU model loading tasks."""
+ return ZeroGPUManager.gpu_task(duration=duration)
+
+
+ def gpu_long_task(duration: int = 300):
+ """Decorator for long GPU processing tasks."""
+ return ZeroGPUManager.gpu_task(duration=duration)
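
These convenience wrappers resolve to spaces.GPU(duration=...) when the spaces package is importable (i.e. on HF Spaces) and simply return the function unchanged otherwise, so the same code path runs locally on MPS or CPU. A minimal sketch of how a caller might combine them with the manager; the function run_chunk_analysis is hypothetical:

from src.utils import ZeroGPUManager, gpu_inference

manager = ZeroGPUManager()

@gpu_inference(duration=120)           # requests Zero GPU for up to ~120 s on Spaces
def run_chunk_analysis(chunk_path: str) -> str:
    device = manager.get_device()      # "cuda" on Spaces, "mps" or "cpu" locally
    # ... model inference on `device` would go here ...
    ZeroGPUManager.cleanup_gpu()       # release cached GPU memory afterwards
    return f"processed {chunk_path} on {device}"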