Spaces:

VincentGOURBIN
/

MeetingNotes-Voxtral-Analysis

Running on Zero

App Files Files Community

VincentGOURBIN committed on Aug 1

Commit

48397c5

verified ·

1 Parent(s): 49d38f7

Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

src/__init__.py +1 -0
src/ai/__init__.py +7 -0
src/ai/diarization.py +338 -0
src/ai/prompts_config.py +204 -0
src/ai/voxtral_spaces_analyzer.py +398 -0
src/ui/__init__.py +1 -0
src/ui/spaces_interface.py +666 -0
src/utils/__init__.py +6 -0
src/utils/token_tracker.py +64 -0
src/utils/zero_gpu_manager.py +115 -0

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """MeetingNotes Hugging Face Spaces package."""

src/ai/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""AI modules for HF Spaces version."""
+from .voxtral_spaces_analyzer import VoxtralSpacesAnalyzer
+from .diarization import SpeakerDiarization
+from .prompts_config import VoxtralPrompts
+__all__ = ['VoxtralSpacesAnalyzer', 'SpeakerDiarization', 'VoxtralPrompts']

src/ai/diarization.py ADDED Viewed

	@@ -0,0 +1,338 @@

+"""
+Speaker diarization module for HF Spaces with Zero GPU support.
+This module uses pyannote/speaker-diarization-3.1 to identify
+and segment different speakers in an audio file, optimized for HF Spaces.
+"""
+import torch
+import torchaudio
+from pyannote.audio import Pipeline
+from typing import Optional, Dict, Any, List, Tuple
+import tempfile
+import os
+from pydub import AudioSegment
+import time
+from ..utils.zero_gpu_manager import gpu_model_loading, gpu_inference, ZeroGPUManager
+class SpeakerDiarization:
+    """
+    Speaker diarization using pyannote/speaker-diarization-3.1 for HF Spaces.
+    This class handles automatic speaker diarization
+    with Zero GPU decorators for efficient compute allocation.
+    """
+    def __init__(self, hf_token: str = None):
+        """
+        Initialize the pyannote diarizer for HF Spaces.
+        Args:
+            hf_token (str): Hugging Face token to access the model
+        """
+        self.hf_token = hf_token or os.getenv("HF_TOKEN")
+        self.pipeline = None
+        self.gpu_manager = ZeroGPUManager()
+        print("🔄 Initializing pyannote diarizer for HF Spaces...")
+    @gpu_model_loading(duration=90)
+    def _load_pipeline(self):
+        """Load diarization pipeline with GPU allocation if not already loaded."""
+        if self.pipeline is None:
+            print("📥 Loading pyannote/speaker-diarization-3.1 model...")
+            self.pipeline = Pipeline.from_pretrained(
+                "pyannote/speaker-diarization-3.1",
+                use_auth_token=self.hf_token
+            )
+            # Use GPU if available (CUDA or MPS)
+            if self.gpu_manager.is_gpu_available():
+                device = self.gpu_manager.get_device()
+                if device == "mps":
+                    # MPS support for local Mac testing
+                    self.pipeline = self.pipeline.to(torch.device("mps"))
+                    print("🚀 Pyannote pipeline loaded on MPS (Apple Silicon)")
+                elif device == "cuda":
+                    self.pipeline = self.pipeline.to(torch.device("cuda"))
+                    print("🚀 Pyannote pipeline loaded on CUDA")
+                else:
+                    print("⚠️ Pyannote pipeline loaded on CPU")
+            else:
+                print("⚠️ Pyannote pipeline loaded on CPU")
+    @gpu_inference(duration=180)
+    def diarize_audio(self, audio_path: str, num_speakers: Optional[int] = None) -> Tuple[str, List[Dict]]:
+        """
+        Perform speaker diarization on an audio file with Zero GPU.
+        Args:
+            audio_path (str): Path to the audio file
+            num_speakers (Optional[int]): Expected number of speakers (optional)
+        Returns:
+            Tuple[str, List[Dict]]: (RTTM result, List of reference segments for each speaker)
+        """
+        try:
+            # Load pipeline if necessary
+            self._load_pipeline()
+            print(f"🎤 Starting diarization: {audio_path}")
+            # Prepare audio file for pyannote (mono WAV)
+            processed_audio_path = self._prepare_audio_for_pyannote(audio_path)
+            # Diarization parameters
+            diarization_params = {}
+            if num_speakers is not None:
+                diarization_params["num_speakers"] = num_speakers
+                print(f"👥 Specified number of speakers: {num_speakers}")
+            # Perform diarization
+            print("🔍 Speaker analysis in progress...")
+            diarization = self.pipeline(processed_audio_path, **diarization_params)
+            # Convert to RTTM format
+            rttm_output = self._convert_to_rttm(diarization, audio_path)
+            # Extract reference segments (first long segments for each speaker)
+            try:
+                reference_segments = self._extract_reference_segments(diarization, audio_path, min_duration=5.0)
+            except Exception as ref_error:
+                print(f"⚠️ Error extracting reference segments: {ref_error}")
+                reference_segments = []
+            print(f"✅ Diarization completed: {len(diarization)} segments detected")
+            print(f"🎤 Reference segments created: {len(reference_segments)} speakers")
+            # Clean up temporary file if created
+            if processed_audio_path != audio_path:
+                try:
+                    os.unlink(processed_audio_path)
+                except:
+                    pass
+            return rttm_output, reference_segments
+        except Exception as e:
+            print(f"❌ Error during diarization: {e}")
+            return f"❌ Error during diarization: {str(e)}", []
+        finally:
+            # Clean up GPU memory
+            self.gpu_manager.cleanup_gpu()
+    def _prepare_audio_for_pyannote(self, audio_path: str) -> str:
+        """
+        Prepare audio file for pyannote (mono WAV if necessary).
+        Args:
+            audio_path (str): Path to original audio file
+        Returns:
+            str: Path to prepared audio file
+        """
+        try:
+            # Load audio with pydub to check format
+            audio = AudioSegment.from_file(audio_path)
+            # Check if conversion is needed (mono + WAV)
+            needs_conversion = (
+                audio.channels != 1 or  # Not mono
+                not audio_path.lower().endswith('.wav')  # Not WAV
+            )
+            if not needs_conversion:
+                print("🎵 Audio already in correct format for pyannote")
+                return audio_path
+            print("🔄 Converting audio for pyannote (mono WAV)...")
+            # Convert to mono WAV
+            mono_audio = audio.set_channels(1)
+            # Create temporary file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                temp_path = tmp_file.name
+            # Export as mono WAV
+            mono_audio.export(temp_path, format="wav")
+            print(f"✅ Audio converted: {temp_path}")
+            return temp_path
+        except Exception as e:
+            print(f"⚠️ Audio conversion error: {e}, using original file")
+            return audio_path
+    def _convert_to_rttm(self, diarization, audio_file: str) -> str:
+        """
+        Convert diarization result to RTTM format.
+        Args:
+            diarization: Pyannote diarization object
+            audio_file (str): Audio filename for RTTM
+        Returns:
+            str: RTTM format content
+        """
+        rttm_lines = []
+        # RTTM header
+        audio_filename = os.path.basename(audio_file)
+        for segment, _, speaker in diarization.itertracks(yield_label=True):
+            # RTTM format: SPEAKER file 1 start_time duration <NA> <NA> speaker_id <NA> <NA>
+            start_time = segment.start
+            duration = segment.duration
+            rttm_line = f"SPEAKER {audio_filename} 1 {start_time:.3f} {duration:.3f} <NA> <NA> {speaker} <NA> <NA>"
+            rttm_lines.append(rttm_line)
+        return "\n".join(rttm_lines)
+    def _extract_reference_segments(self, diarization, audio_path: str, min_duration: float = 5.0) -> List[Dict]:
+        """
+        Extract first long segment for each speaker as reference.
+        Args:
+            diarization: Pyannote diarization object
+            audio_path (str): Path to audio file
+            min_duration (float): Minimum duration in seconds for a reference segment
+        Returns:
+            List[Dict]: List of reference segments with metadata
+        """
+        reference_segments = []
+        speakers_found = set()
+        print(f"🔍 Searching for reference segments (>{min_duration}s) for each speaker...")
+        # Iterate through all segments to find first long segment of each speaker
+        try:
+            for segment, _, speaker in diarization.itertracks(yield_label=True):
+                if speaker not in speakers_found and segment.duration >= min_duration:
+                    print(f"👤 {speaker}: {segment.duration:.1f}s segment found ({segment.start:.1f}s-{segment.end:.1f}s)")
+                    # Create audio snippet
+                    snippet_path = self._create_audio_snippet(
+                        audio_path,
+                        segment.start,
+                        segment.end,
+                        speaker
+                    )
+                    if snippet_path:
+                        reference_segments.append({
+                            'speaker': speaker,
+                            'start': segment.start,
+                            'end': segment.end,
+                            'duration': segment.duration,
+                            'audio_path': snippet_path
+                        })
+                        speakers_found.add(speaker)
+            # Fallback: if no long segments found for some speakers, take the longest
+            all_speakers_in_diarization = set(speaker for _, _, speaker in diarization.itertracks(yield_label=True))
+            if len(speakers_found) < len(all_speakers_in_diarization):
+                print("⚠️ Some speakers don't have long segments, using longest segments...")
+                self._add_fallback_segments(diarization, audio_path, reference_segments, speakers_found, min_duration)
+        except Exception as iter_error:
+            print(f"❌ Error iterating segments: {iter_error}")
+            reference_segments = []
+        return reference_segments
+    def _add_fallback_segments(self, diarization, audio_path: str, reference_segments: List[Dict],
+                              speakers_found: set, min_duration: float):
+        """Add fallback segments for speakers without long segments."""
+        all_speakers = set(speaker for _, _, speaker in diarization.itertracks(yield_label=True))
+        missing_speakers = all_speakers - speakers_found
+        for speaker in missing_speakers:
+            # Find longest segment for this speaker
+            longest_segment = None
+            longest_duration = 0
+            for segment, _, spk in diarization.itertracks(yield_label=True):
+                if spk == speaker and segment.duration > longest_duration:
+                    longest_segment = segment
+                    longest_duration = segment.duration
+            if longest_segment and longest_duration > 1.0:  # At least 1 second
+                print(f"👤 {speaker}: fallback segment of {longest_duration:.1f}s")
+                snippet_path = self._create_audio_snippet(
+                    audio_path,
+                    longest_segment.start,
+                    longest_segment.end,
+                    speaker
+                )
+                if snippet_path:
+                    reference_segments.append({
+                        'speaker': speaker,
+                        'start': longest_segment.start,
+                        'end': longest_segment.end,
+                        'duration': longest_duration,
+                        'audio_path': snippet_path
+                    })
+    def _create_audio_snippet(self, audio_path: str, start_time: float, end_time: float, speaker: str) -> Optional[str]:
+        """
+        Create temporary audio snippet for a speaker segment.
+        Args:
+            audio_path (str): Path to source audio file
+            start_time (float): Start in seconds
+            end_time (float): End in seconds
+            speaker (str): Speaker ID
+        Returns:
+            Optional[str]: Path to created temporary audio snippet or None if error
+        """
+        try:
+            # Load audio
+            audio = AudioSegment.from_file(audio_path)
+            # Convert to milliseconds
+            start_ms = int(start_time * 1000)
+            end_ms = int(end_time * 1000)
+            # Extract segment
+            segment = audio[start_ms:end_ms]
+            # Create temporary file
+            with tempfile.NamedTemporaryFile(
+                suffix=f"_{speaker}_{start_time:.1f}s.wav",
+                delete=False
+            ) as tmp_file:
+                snippet_path = tmp_file.name
+            # Export snippet to temporary file
+            segment.export(snippet_path, format="wav")
+            print(f"🎵 Temporary snippet created: {snippet_path}")
+            return snippet_path
+        except Exception as e:
+            print(f"❌ Error creating snippet for {speaker}: {e}")
+            return None
+    def cleanup(self):
+        """Release pipeline resources."""
+        if self.pipeline is not None:
+            # Free GPU/MPS memory by moving to CPU
+            if hasattr(self.pipeline, 'to'):
+                try:
+                    self.pipeline = self.pipeline.to(torch.device('cpu'))
+                except Exception as e:
+                    print(f"⚠️ Error moving to CPU: {e}")
+            del self.pipeline
+            self.pipeline = None
+            # Clean up memory
+            self.gpu_manager.cleanup_gpu()
+            print("🧹 Pyannote pipeline freed from memory")

src/ai/prompts_config.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""
+Centralized prompts configuration for Voxtral in HF Spaces.
+This module contains all prompts used by Voxtral analyzers
+for different types of analyses and processing modes.
+"""
+class VoxtralPrompts:
+    """Class containing all system prompts for Voxtral."""
+    # ====================================
+    # AVAILABLE SECTIONS FOR SUMMARIES
+    # Note: Titles are in English but the AI will adapt language based on meeting content
+    # ====================================
+    AVAILABLE_SECTIONS = {
+        "resume_executif": {
+            "title": "## EXECUTIVE SUMMARY",
+            "description": "Overview of the purpose of this meeting segment and its outcomes",
+            "default_action": True,
+            "default_info": True
+        },
+        "discussions_principales": {
+            "title": "## MAIN DISCUSSIONS",
+            "description": "Main topics addressed and important points raised",
+            "default_action": True,
+            "default_info": False
+        },
+        "sujets_principaux": {
+            "title": "## MAIN TOPICS",
+            "description": "Key topics discussed and information presented",
+            "default_action": False,
+            "default_info": True
+        },
+        "plan_action": {
+            "title": "## ACTION PLAN",
+            "description": "Complete list of actions with:\n- Specific tasks and deliverables\n- Assigned responsibilities\n- Deadlines and timelines\n- Priority levels",
+            "default_action": True,
+            "default_info": False
+        },
+        "decisions_prises": {
+            "title": "## DECISIONS MADE",
+            "description": "All decisions made during this segment",
+            "default_action": True,
+            "default_info": False
+        },
+        "points_importants": {
+            "title": "## KEY POINTS",
+            "description": "Important discoveries, data or insights shared",
+            "default_action": False,
+            "default_info": True
+        },
+        "questions_discussions": {
+            "title": "## QUESTIONS & DISCUSSIONS",
+            "description": "Main questions asked and discussions held",
+            "default_action": False,
+            "default_info": True
+        },
+        "prochaines_etapes": {
+            "title": "## NEXT STEPS",
+            "description": "Follow-up actions and planned future meetings",
+            "default_action": True,
+            "default_info": False
+        },
+        "elements_suivi": {
+            "title": "## FOLLOW-UP ELEMENTS",
+            "description": "Follow-up information or clarifications needed",
+            "default_action": False,
+            "default_info": True
+        }
+    }
+    @staticmethod
+    def get_meeting_summary_prompt(selected_sections: list, speaker_references: str = "", chunk_info: str = "", previous_context: str = "") -> str:
+        """
+        Generate meeting summary prompt according to selected sections.
+        Args:
+            selected_sections (list): List of section keys to include
+            speaker_references (str): Diarization context with tags (optional)
+            chunk_info (str): Audio segment information (optional)
+            previous_context (str): Context from previous segments (optional)
+        Returns:
+            str: Formatted prompt
+        """
+        # Diarization context
+        diarization_context = ""
+        if speaker_references and speaker_references.strip():
+            diarization_context = f"""
+CONTEXT FOR YOUR ANALYSIS (do not include in your response):
+Different speakers have been automatically identified in the audio: {speaker_references}
+Use this information to enrich your analysis but do not display it in your final response.
+"""
+        # Previous segments context
+        previous_summary_context = ""
+        if previous_context and previous_context.strip():
+            previous_summary_context = f"""
+CONTEXT FROM PREVIOUS SEGMENTS (do not include in your response):
+Here's what happened in previous audio segments:
+{previous_context}
+Use this information to ensure continuity and avoid repetitions, but focus on the new content of this segment.
+"""
+        # Audio segment information
+        segment_context = ""
+        if chunk_info and chunk_info.strip():
+            segment_context = f"""
+IMPORTANT: You are analyzing a segment ({chunk_info}) extracted from a longer audio recording.
+This segment may start or end in the middle of sentences/discussions.
+Focus on the content of this segment while keeping in mind it's part of a larger whole.
+"""
+        # Build selected sections
+        sections_text = ""
+        for section_key in selected_sections:
+            if section_key in VoxtralPrompts.AVAILABLE_SECTIONS:
+                section = VoxtralPrompts.AVAILABLE_SECTIONS[section_key]
+                sections_text += f"\n{section['title']}\n{section['description']}\n"
+                print(f"✅ Section added: {section['title']}")
+            else:
+                print(f"❌ Unknown section: {section_key}")
+        return f"""Listen carefully to this meeting audio segment and provide a complete structured summary.{diarization_context}{previous_summary_context}{segment_context}
+CRITICAL INSTRUCTION - RESPONSE LANGUAGE:
+- DETECT the language spoken in this audio
+- RESPOND OBLIGATORILY in the same detected language
+- If audio is in French → respond in French
+- If audio is in English → respond in English
+- If audio is in another language → respond in that language
+- NEVER use a different language than the one detected in the audio
+{sections_text}
+Format your response in markdown exactly as shown above."""
+    @staticmethod
+    def get_default_sections(meeting_type: str) -> list:
+        """
+        Return default sections according to meeting type.
+        Args:
+            meeting_type (str): "action" or "information"
+        Returns:
+            list: List of default section keys
+        """
+        if "action" in meeting_type.lower():
+            return [key for key, section in VoxtralPrompts.AVAILABLE_SECTIONS.items()
+                   if section["default_action"]]
+        else:
+            return [key for key, section in VoxtralPrompts.AVAILABLE_SECTIONS.items()
+                   if section["default_info"]]
+    @staticmethod
+    def get_synthesis_prompt(selected_sections: list, chunk_summaries: list) -> str:
+        """
+        Generate prompt for synthesizing multiple chunk summaries.
+        Args:
+            selected_sections (list): List of requested section keys
+            chunk_summaries (list): List of chunk summaries to synthesize
+        Returns:
+            str: Formatted synthesis prompt
+        """
+        # Build selected sections
+        sections_text = ""
+        for section_key in selected_sections:
+            if section_key in VoxtralPrompts.AVAILABLE_SECTIONS:
+                section = VoxtralPrompts.AVAILABLE_SECTIONS[section_key]
+                sections_text += f"\n{section['title']}\n{section['description']}\n"
+        # Assemble all chunk summaries
+        all_chunks_text = "\n\n=== SEGMENT SEPARATOR ===\n\n".join(chunk_summaries)
+        return f"""You will receive multiple analyses of segments from the same audio meeting.
+Your role is to synthesize them into a coherent and structured global summary.
+SEGMENT ANALYSES TO SYNTHESIZE:
+{all_chunks_text}
+CRITICAL INSTRUCTION - RESPONSE LANGUAGE:
+- DETECT the language used in the segments above
+- RESPOND OBLIGATORILY in the same detected language
+- If segments are in French → respond in French
+- If segments are in English → respond in English
+- Avoid repetitions between segments
+- Identify recurring elements and unify them
+- Ensure temporal and logical coherence
+- Produce a global summary that reflects the entire meeting
+Generate a final structured summary according to these sections:
+{sections_text}
+Format your response in markdown exactly as shown above."""

src/ai/voxtral_spaces_analyzer.py ADDED Viewed

	@@ -0,0 +1,398 @@

+"""
+Voxtral analyzer optimized for Hugging Face Spaces.
+This module provides audio analysis using Voxtral models with:
+- Only Transformers backend (no MLX or API)
+- Only 8-bit quantized models for memory efficiency
+- Zero GPU decorators for HF Spaces compute allocation
+- Optimized memory management for Spaces environment
+"""
+import torch
+import torchaudio
+import tempfile
+import time
+import gc
+import os
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+from pydub import AudioSegment
+from typing import List, Dict, Tuple, Optional
+from ..utils.zero_gpu_manager import gpu_model_loading, gpu_inference, gpu_long_task, ZeroGPUManager
+from .prompts_config import VoxtralPrompts
+from ..utils.token_tracker import TokenTracker
+class VoxtralSpacesAnalyzer:
+    """
+    Voxtral analyzer optimized for Hugging Face Spaces.
+    Features:
+    - Only 8-bit quantized models
+    - Zero GPU decorators for efficient compute allocation
+    - Memory-optimized processing for Spaces constraints
+    """
+    def __init__(self, model_name: str = "Voxtral-Mini-3B-2507"):
+        """
+        Initialize the Voxtral analyzer for HF Spaces.
+        Args:
+            model_name (str): Name of the Voxtral model to use (8-bit only)
+        """
+        # Only 8-bit models are supported in Spaces version
+        model_mapping = {
+            "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
+            "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507"
+        }
+        self.model_name = model_mapping.get(model_name, "mistralai/Voxtral-Mini-3B-2507")
+        self.max_duration_minutes = 20  # Reduced for Spaces environment
+        self.gpu_manager = ZeroGPUManager()
+        # Model and processor will be loaded on-demand with GPU decorators
+        self.model = None
+        self.processor = None
+        self.token_tracker = TokenTracker("Transformers-8bit")
+        print(f"🚀 VoxtralSpacesAnalyzer initialized for model: {model_name}")
+    @gpu_model_loading(duration=120)
+    def _load_model_if_needed(self):
+        """Load model and processor with GPU allocation if not already loaded."""
+        if self.model is not None and self.processor is not None:
+            return
+        device = self.gpu_manager.get_device()
+        dtype = self.gpu_manager.dtype
+        print(f"🔄 Loading Voxtral model on {device} with {dtype}...")
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(self.model_name)
+        # Model loading strategy based on device and environment
+        if self.gpu_manager.is_spaces_environment() and device == "cuda":
+            # HF Spaces with CUDA: use 8-bit quantization
+            print("📦 Loading with 8-bit quantization for HF Spaces")
+            self.model = VoxtralForConditionalGeneration.from_pretrained(
+                self.model_name,
+                load_in_8bit=True,
+                device_map="auto",
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+        elif device == "mps":
+            # Local Mac with MPS: standard loading with MPS-compatible settings
+            print("📦 Loading with MPS optimization for local Mac testing")
+            self.model = VoxtralForConditionalGeneration.from_pretrained(
+                self.model_name,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            self.model = self.model.to(device)
+        elif device == "cuda":
+            # Local CUDA: can use more aggressive optimizations
+            print("📦 Loading with CUDA optimization for local testing")
+            self.model = VoxtralForConditionalGeneration.from_pretrained(
+                self.model_name,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True,
+                device_map="auto"
+            )
+        else:
+            # CPU fallback
+            print("📦 Loading on CPU")
+            self.model = VoxtralForConditionalGeneration.from_pretrained(
+                self.model_name,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+        print(f"✅ Model loaded successfully on {device}")
+        # Print memory info if available
+        if self.gpu_manager.is_gpu_available():
+            memory_info = self.gpu_manager.get_memory_info()
+            if memory_info["available"]:
+                if memory_info["device"] == "cuda":
+                    allocated_gb = memory_info["allocated"] / (1024**3)
+                    print(f"📊 CUDA Memory allocated: {allocated_gb:.2f}GB")
+                elif memory_info["device"] == "mps":
+                    allocated_mb = memory_info["allocated"] / (1024**2)
+                    print(f"📊 MPS Memory allocated: {allocated_mb:.1f}MB")
+    def _get_audio_duration(self, wav_path: str) -> float:
+        """Get audio duration in minutes."""
+        audio = AudioSegment.from_file(wav_path)
+        return len(audio) / (1000 * 60)
+    def _create_time_chunks(self, wav_path: str) -> List[Tuple[float, float]]:
+        """Create time-based chunks for processing."""
+        total_duration = self._get_audio_duration(wav_path) * 60  # seconds
+        max_chunk_seconds = self.max_duration_minutes * 60
+        if total_duration <= max_chunk_seconds:
+            return [(0, total_duration)]
+        chunks = []
+        current_start = 0
+        while current_start < total_duration:
+            chunk_end = min(current_start + max_chunk_seconds, total_duration)
+            chunks.append((current_start, chunk_end))
+            current_start = chunk_end
+        return chunks
+    def _extract_audio_chunk(self, wav_path: str, start_time: float, end_time: float) -> str:
+        """Extract audio chunk between timestamps."""
+        audio = AudioSegment.from_file(wav_path)
+        start_ms = int(start_time * 1000)
+        end_ms = int(end_time * 1000)
+        chunk = audio[start_ms:end_ms]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_chunk:
+            chunk_path = tmp_chunk.name
+        chunk.export(chunk_path, format="wav")
+        return chunk_path
+    @gpu_long_task(duration=300)
+    def analyze_audio_chunks(
+        self,
+        wav_path: str,
+        language: str = "french",
+        selected_sections: list = None,
+        chunk_duration_minutes: int = 15,
+        reference_speakers_data: str = None
+    ) -> Dict[str, str]:
+        """
+        Analyze audio by chunks using Voxtral with Zero GPU.
+        Args:
+            wav_path (str): Path to audio file
+            language (str): Expected language
+            selected_sections (list): Analysis sections to include
+            chunk_duration_minutes (int): Chunk duration in minutes
+            reference_speakers_data (str): Speaker diarization data
+        Returns:
+            Dict[str, str]: Analysis results
+        """
+        try:
+            # Ensure model is loaded
+            self._load_model_if_needed()
+            total_start_time = time.time()
+            duration = self._get_audio_duration(wav_path)
+            print(f"🎵 Audio duration: {duration:.1f} minutes")
+            # Create chunks
+            chunks = self._create_time_chunks(wav_path)
+            print(f"📦 Splitting into {len(chunks)} chunks")
+            chunk_summaries = []
+            for i, (start_time, end_time) in enumerate(chunks):
+                print(f"🎯 Processing chunk {i+1}/{len(chunks)} ({start_time/60:.1f}-{end_time/60:.1f}min)")
+                chunk_start_time = time.time()
+                chunk_path = self._extract_audio_chunk(wav_path, start_time, end_time)
+                try:
+                    # Analyze chunk with Zero GPU
+                    chunk_summary = self._analyze_single_chunk(
+                        chunk_path,
+                        selected_sections,
+                        reference_speakers_data,
+                        i + 1,
+                        len(chunks),
+                        start_time,
+                        end_time
+                    )
+                    chunk_summaries.append(f"## Segment {i+1} ({start_time/60:.1f}-{end_time/60:.1f}min)\n\n{chunk_summary}")
+                    chunk_duration = time.time() - chunk_start_time
+                    print(f"✅ Chunk {i+1} analyzed in {chunk_duration:.1f}s")
+                except Exception as e:
+                    print(f"❌ Error processing chunk {i+1}: {e}")
+                    chunk_summaries.append(f"**Segment {i+1}:** Processing error")
+                finally:
+                    # Clean up chunk file
+                    if os.path.exists(chunk_path):
+                        os.remove(chunk_path)
+                    # GPU cleanup after each chunk
+                    self.gpu_manager.cleanup_gpu()
+            # Final synthesis if multiple chunks
+            if len(chunk_summaries) > 1:
+                print(f"🔄 Final synthesis of {len(chunk_summaries)} segments...")
+                combined_content = "\n\n".join(chunk_summaries)
+                final_analysis = self._synthesize_chunks_final(combined_content, selected_sections)
+            else:
+                final_analysis = chunk_summaries[0] if chunk_summaries else "No analysis available."
+            total_duration = time.time() - total_start_time
+            print(f"⏱️ Total analysis completed in {total_duration:.1f}s for {duration:.1f}min of audio")
+            # Print token usage
+            self.token_tracker.print_summary()
+            return {"transcription": final_analysis}
+        finally:
+            # Final GPU cleanup
+            self.gpu_manager.cleanup_gpu()
+    @gpu_inference(duration=120)
+    def _analyze_single_chunk(
+        self,
+        chunk_path: str,
+        selected_sections: list,
+        reference_speakers_data: str,
+        chunk_num: int,
+        total_chunks: int,
+        start_time: float,
+        end_time: float
+    ) -> str:
+        """Analyze a single audio chunk with GPU inference."""
+        # Build analysis prompt
+        sections_list = selected_sections if selected_sections else ["resume_executif"]
+        chunk_info = f"SEGMENT {chunk_num}/{total_chunks} ({start_time/60:.1f}-{end_time/60:.1f}min)" if total_chunks > 1 else None
+        prompt_text = VoxtralPrompts.get_meeting_summary_prompt(
+            sections_list,
+            reference_speakers_data,
+            chunk_info,
+            None
+        )
+        # Create conversation for audio instruct mode
+        conversation = [{
+            "role": "user",
+            "content": [
+                {"type": "audio", "path": chunk_path},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        # Process with chat template
+        inputs = self.processor.apply_chat_template(conversation, return_tensors="pt")
+        device = self.gpu_manager.get_device()
+        dtype = self.gpu_manager.dtype if hasattr(self.gpu_manager, 'dtype') else torch.float16
+        # Move inputs to device with appropriate dtype
+        if hasattr(inputs, 'to'):
+            inputs = inputs.to(device, dtype=dtype)
+        else:
+            # Handle BatchFeature or dict-like inputs
+            inputs = {k: v.to(device, dtype=dtype) if hasattr(v, 'to') else v for k, v in inputs.items()}
+        # Generate with optimized settings for Spaces
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=8000,  # Reduced for 8-bit model efficiency
+                temperature=0.2,
+                do_sample=True,
+                pad_token_id=self.processor.tokenizer.eos_token_id,
+                use_cache=True,
+                output_scores=False
+            )
+        # Decode response
+        input_tokens = inputs.input_ids.shape[1]
+        output_tokens_count = outputs.shape[1] - input_tokens
+        chunk_summary = self.processor.batch_decode(
+            outputs[:, inputs.input_ids.shape[1]:],
+            skip_special_tokens=True
+        )[0].strip()
+        # Track tokens
+        self.token_tracker.add_chunk_tokens(input_tokens, output_tokens_count)
+        return chunk_summary
+    @gpu_inference(duration=60)
+    def _synthesize_chunks_final(self, combined_content: str, selected_sections: list) -> str:
+        """Final synthesis of all chunks with GPU inference."""
+        try:
+            # Build synthesis prompt
+            sections_text = ""
+            if selected_sections:
+                for section_key in selected_sections:
+                    if section_key in VoxtralPrompts.AVAILABLE_SECTIONS:
+                        section = VoxtralPrompts.AVAILABLE_SECTIONS[section_key]
+                        sections_text += f"\n{section['title']}\n{section['description']}\n"
+            synthesis_prompt = f"""Here are detailed analyses from multiple meeting segments:
+{combined_content}
+CRITICAL INSTRUCTION - RESPONSE LANGUAGE:
+- DETECT the language used in the segments above
+- RESPOND OBLIGATORILY in the same detected language
+- If segments are in French → respond in French
+- If segments are in English → respond in English
+Now synthesize these analyses into a coherent global summary structured according to the requested sections:{sections_text}
+Provide a unified synthesis that combines and summarizes information from all segments coherently."""
+            # Generate synthesis
+            conversation = [{"role": "user", "content": synthesis_prompt}]
+            inputs = self.processor.apply_chat_template(conversation, return_tensors="pt")
+            device = self.gpu_manager.get_device()
+            dtype = self.gpu_manager.dtype if hasattr(self.gpu_manager, 'dtype') else torch.float16
+            # Move inputs to device with appropriate dtype
+            if hasattr(inputs, 'to'):
+                inputs = inputs.to(device, dtype=dtype)
+            else:
+                inputs = {k: v.to(device, dtype=dtype) if hasattr(v, 'to') else v for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=3000,  # Reduced for 8-bit efficiency
+                    temperature=0.1,
+                    do_sample=True,
+                    pad_token_id=self.processor.tokenizer.eos_token_id
+                )
+            # Decode synthesis
+            input_length = inputs.input_ids.shape[1]
+            output_tokens_count = outputs.shape[1] - input_length
+            final_synthesis = self.processor.tokenizer.decode(
+                outputs[0][input_length:],
+                skip_special_tokens=True
+            ).strip()
+            self.token_tracker.add_synthesis_tokens(input_length, output_tokens_count)
+            return f"# Global Meeting Summary\n\n{final_synthesis}\n\n---\n\n## Details by Segment\n\n{combined_content}"
+        except Exception as e:
+            print(f"❌ Error during final synthesis: {e}")
+            return f"# Meeting Summary\n\n⚠️ Error during final synthesis: {str(e)}\n\n## Segment Analyses\n\n{combined_content}"
+    def cleanup_model(self):
+        """Clean up model from memory."""
+        if self.model is not None:
+            self.model.to('cpu')
+            del self.model
+            self.model = None
+        if self.processor is not None:
+            del self.processor
+            self.processor = None
+        self.gpu_manager.cleanup_gpu()
+        print("🧹 Voxtral Spaces model cleaned up")

src/ui/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """UI components for HF Spaces version."""

src/ui/spaces_interface.py ADDED Viewed

	@@ -0,0 +1,666 @@

+"""
+Gradio application for intelligent meeting analysis with Voxtral - HF Spaces version.
+Version adapted for Hugging Face Spaces with:
+- Transformers mode only (MLX and API removed)
+- 8-bit models only
+- Native MCP support
+- Zero GPU decorators
+"""
+import os
+import gradio as gr
+from dotenv import load_dotenv
+from ..ai.voxtral_spaces_analyzer import VoxtralSpacesAnalyzer
+from ..ai.diarization import SpeakerDiarization
+from ..utils.zero_gpu_manager import ZeroGPUManager, gpu_inference
+# Import labels from main project
+# NOTE(review): this relies on a `src` directory three levels above this file
+# being present at runtime — confirm it exists in the deployed Space.
+import sys
+import os
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../../src'))
+from meetingnotes.ui.labels import UILabels
+# Load environment variables from the .env file
+load_dotenv()
+# Global instances for MCP functions (created lazily by initialize_components)
+analyzer = None
+diarization = None
+gpu_manager = None
+# Last successful diarization result, reused by the main analysis handler
+current_diarization_context = None
+def initialize_components():
+    """Initialize global components for MCP functions."""
+    global analyzer, diarization, gpu_manager
+    if analyzer is None:
+        analyzer = VoxtralSpacesAnalyzer()
+        diarization = SpeakerDiarization()
+        gpu_manager = ZeroGPUManager()
+# MCP Tools - exposed automatically by Gradio
+@gpu_inference(duration=300)
+def analyze_meeting_audio(
+    audio_file: str,
+    sections: list = None,
+    model_name: str = "Voxtral-Mini-3B-2507",
+    enable_diarization: bool = False,
+    num_speakers: int = None
+) -> dict:
+    """
+    Analyze meeting audio and generate structured summaries using Voxtral AI.
+    This function processes audio files to extract insights, identify speakers,
+    and generate structured meeting summaries with configurable sections.
+    Args:
+        audio_file: Path to the audio file to analyze (MP3, WAV, M4A, OGG)
+        sections: List of analysis sections to include (executive_summary, action_plan, etc.)
+        model_name: Voxtral model to use for analysis (Mini-3B or Small-24B)
+        enable_diarization: Whether to identify and separate speakers
+        num_speakers: Expected number of speakers (optional, for better diarization)
+    Returns:
+        Dictionary containing analysis results, processing time, and metadata
+    """
+    initialize_components()
+    if not os.path.exists(audio_file):
+        return {"error": "Audio file not found", "status": "failed"}
+    try:
+        import time
+        start_time = time.time()
+        # Set default sections if none provided
+        if sections is None:
+            sections = ["resume_executif", "discussions_principales", "plan_action"]
+        # Speaker diarization if enabled
+        speaker_data = None
+        if enable_diarization:
+            rttm_result, reference_segments = diarization.diarize_audio(
+                audio_file, num_speakers=num_speakers
+            )
+            if not rttm_result.startswith("❌"):
+                speaker_data = rttm_result
+        # Set model if different
+        if analyzer.model_name != f"mistralai/{model_name}":
+            analyzer.model_name = f"mistralai/{model_name}"
+            analyzer.cleanup_model()
+        # Analyze audio
+        results = analyzer.analyze_audio_chunks(
+            wav_path=audio_file,
+            language="auto",
+            selected_sections=sections,
+            chunk_duration_minutes=15,
+            reference_speakers_data=speaker_data
+        )
+        processing_time = time.time() - start_time
+        return {
+            "status": "completed",
+            "analysis": results.get("transcription", "No analysis available"),
+            "processing_time_seconds": processing_time,
+            "model_used": model_name,
+            "sections_analyzed": sections,
+            "diarization_enabled": enable_diarization
+        }
+    except Exception as e:
+        return {
+            "status": "failed",
+            "error": str(e),
+            "processing_time_seconds": time.time() - start_time if 'start_time' in locals() else 0
+        }
+    finally:
+        if gpu_manager:
+            gpu_manager.cleanup_gpu()
+def get_available_sections() -> dict:
+    """Get available analysis sections for meeting summaries."""
+    from meetingnotes.ai.prompts_config import VoxtralPrompts
+    return {
+        "status": "success",
+        "sections": VoxtralPrompts.AVAILABLE_SECTIONS,
+        "total_sections": len(VoxtralPrompts.AVAILABLE_SECTIONS)
+    }
+def get_meeting_templates() -> dict:
+    """Get pre-configured meeting analysis templates."""
+    templates = {
+        "action_meeting": {
+            "name": "Action-Oriented Meeting",
+            "description": "For meetings focused on decisions and action items",
+            "recommended_sections": ["resume_executif", "discussions_principales", "plan_action", "decisions_prises", "prochaines_etapes"]
+        },
+        "info_meeting": {
+            "name": "Information Meeting",
+            "description": "For presentations and informational sessions",
+            "recommended_sections": ["resume_executif", "sujets_principaux", "points_importants", "questions_discussions", "elements_suivi"]
+        }
+    }
+    return {"status": "success", "templates": templates, "total_templates": len(templates)}
+# Handlers adaptés pour HF Spaces
+def handle_input_mode_change(input_mode):
+    """Gestion du changement de mode d'entrée."""
+    if input_mode == UILabels.INPUT_MODE_AUDIO:
+        return gr.update(visible=True), gr.update(visible=False)
+    else:
+        return gr.update(visible=False), gr.update(visible=True)
+def extract_audio_from_video(video_file, language):
+    """Extraction audio depuis vidéo (placeholder pour HF Spaces)."""
+    if video_file is None:
+        return None, gr.update(visible=True), gr.update(visible=False), UILabels.INPUT_MODE_AUDIO, language
+    # Pour HF Spaces, on assume que le processing vidéo sera fait côté client
+    # ou qu'on accepte déjà des fichiers audio
+    return video_file, gr.update(visible=True), gr.update(visible=False), UILabels.INPUT_MODE_AUDIO, language
+@gpu_inference(duration=180)
+def handle_diarization(audio_file, hf_token, num_speakers, start_trim, end_trim):
+    """Gestion de la diarisation adaptée pour HF Spaces."""
+    global current_diarization_context
+    initialize_components()
+    if audio_file is None:
+        return gr.update(choices=[], visible=False), None, gr.update(visible=False)
+    try:
+        # Diarisation avec les paramètres
+        rttm_result, reference_segments = diarization.diarize_audio(
+            audio_file, num_speakers=num_speakers
+        )
+        if rttm_result.startswith("❌"):
+            return gr.update(choices=[], visible=False), None, gr.update(visible=False)
+        # Sauvegarder le contexte pour l'analyse principale
+        current_diarization_context = rttm_result
+        # Créer les boutons pour les locuteurs
+        speaker_choices = []
+        first_audio = None
+        for i, segment in enumerate(reference_segments):
+            speaker_id = segment['speaker']
+            speaker_choices.append((f"{speaker_id} ({segment['duration']:.1f}s)", speaker_id))
+            if i == 0:  # Premier audio pour l'aperçu
+                first_audio = segment['audio_path']
+        if speaker_choices:
+            return (
+                gr.update(choices=speaker_choices, value=speaker_choices[0][1], visible=True),
+                first_audio,
+                gr.update(visible=True)
+            )
+        else:
+            return gr.update(choices=[], visible=False), None, gr.update(visible=False)
+    except Exception as e:
+        print(f"Erreur diarisation: {e}")
+        return gr.update(choices=[], visible=False), None, gr.update(visible=False)
+def handle_speaker_selection(selected_speaker, current_name):
+    """Gestion de la sélection de locuteur."""
+    # Trouve le fichier audio correspondant au locuteur sélectionné
+    # Pour simplifier, on retourne juste un placeholder
+    return None, f"Locuteur_{selected_speaker}"
+def handle_speaker_rename(new_name):
+    """Gestion du renommage de locuteur."""
+    if new_name.strip():
+        renamed_info = f"Locuteur renommé: {new_name}"
+        return gr.update(value=renamed_info, visible=True), gr.update(visible=True)
+    return gr.update(visible=False), gr.update(visible=False)
+@gpu_inference(duration=300)
+def handle_direct_transcription(
+    audio_file, hf_token, language, transcription_mode, model_key,
+    selected_sections, diarization_data, start_trim, end_trim, chunk_duration
+):
+    """Gestion de l'analyse directe adaptée pour HF Spaces."""
+    initialize_components()
+    if audio_file is None:
+        return "", "❌ Veuillez d'abord télécharger un fichier audio."
+    try:
+        # Extraire le nom du modèle depuis transcription_mode
+        if "Mini" in transcription_mode:
+            model_name = "Voxtral-Mini-3B-2507"
+        else:
+            model_name = "Voxtral-Small-24B-2507"
+        # Configurer l'analyseur
+        if analyzer.model_name != f"mistralai/{model_name}":
+            analyzer.model_name = f"mistralai/{model_name}"
+            analyzer.cleanup_model()
+        # Lancer l'analyse
+        results = analyzer.analyze_audio_chunks(
+            wav_path=audio_file,
+            language="auto",
+            selected_sections=selected_sections,
+            chunk_duration_minutes=int(chunk_duration),
+            reference_speakers_data=diarization_data
+        )
+        return "", results.get("transcription", "Aucune analyse disponible")
+    except Exception as e:
+        error_msg = f"❌ Erreur lors de l'analyse: {str(e)}"
+        return "", error_msg
+    finally:
+        if gpu_manager:
+            gpu_manager.cleanup_gpu()
+def create_spaces_interface():
+    """
+    Main entry point for the HF Spaces interface.
+    Builds the Gradio Blocks UI; same interface as the original project but simplified:
+    - Transformers mode only (no MLX/API)
+    - 8-bit models only
+    - Native MCP support
+    Returns:
+        The constructed gr.Blocks application (it is not launched here).
+    """
+    # Initialize components
+    initialize_components()
+    # Read the Hugging Face token from the environment variables
+    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
+    if hf_token is None:
+        print("⚠️ Warning: HF_TOKEN environment variable not found")
+    # Custom Glass theme configuration (same as the original)
+    custom_glass_theme = gr.themes.Glass(
+        primary_hue=gr.themes.colors.blue,
+        secondary_hue=gr.themes.colors.gray,
+        text_size=gr.themes.sizes.text_md,
+        spacing_size=gr.themes.sizes.spacing_md,
+        radius_size=gr.themes.sizes.radius_md
+    )
+    with gr.Blocks(
+        theme=custom_glass_theme,
+        title="MeetingNotes - AI Analysis with Voxtral",
+        css="""
+        .gradio-container {
+            max-width: 1200px !important;
+            margin: 0 auto !important;
+        }
+        .main-header {
+            text-align: center;
+            margin-bottom: 30px;
+            padding: 20px;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            border-radius: 15px;
+            color: white;
+            box-shadow: 0 8px 32px rgba(31, 38, 135, 0.37);
+        }
+        .processing-section {
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+            padding: 20px;
+            margin: 15px 0;
+            border: 1px solid rgba(255, 255, 255, 0.2);
+            backdrop-filter: blur(10px);
+        }
+        .results-section {
+            margin-top: 25px;
+        }
+        """
+    ) as demo:
+        # Main header with style (same as the original)
+        with gr.Column(elem_classes="main-header"):
+            gr.Markdown(
+                f"""
+                # {UILabels.MAIN_TITLE}
+                {UILabels.MAIN_SUBTITLE}
+                {UILabels.MAIN_DESCRIPTION}
+                """,
+                elem_classes="header-content"
+            )
+        # Processing mode section (SIMPLIFIED - Transformers 8-bit only)
+        with gr.Column(elem_classes="processing-section"):
+            gr.Markdown("## 🔧 Processing Configuration")
+            gr.Markdown("*HF Spaces version - Transformers backend with 8-bit quantization*")
+            # Model selection (8-bit models only)
+            with gr.Row():
+                with gr.Column():
+                    local_model_choice = gr.Radio(
+                        choices=[UILabels.MODEL_MINI, UILabels.MODEL_SMALL],
+                        value=UILabels.MODEL_MINI,
+                        label="Model Selection"
+                    )
+                with gr.Column():
+                    local_precision_choice = gr.Radio(
+                        choices=[UILabels.PRECISION_8BIT],
+                        value=UILabels.PRECISION_8BIT,
+                        label="Precision (Fixed for HF Spaces)"
+                    )
+        # Input mode selection (same as the original)
+        with gr.Column(elem_classes="processing-section"):
+            gr.Markdown(UILabels.INPUT_MODE_TITLE)
+            input_mode = gr.Radio(
+                choices=[UILabels.INPUT_MODE_AUDIO, UILabels.INPUT_MODE_VIDEO],
+                value=UILabels.INPUT_MODE_AUDIO,
+                label=UILabels.INPUT_MODE_LABEL
+            )
+        # Audio section (default mode) - same as the original
+        with gr.Column(elem_classes="processing-section") as audio_section:
+            gr.Markdown(UILabels.AUDIO_MODE_TITLE)
+            audio_input = gr.Audio(
+                label=UILabels.AUDIO_INPUT_LABEL,
+                type="filepath",
+                show_label=True,
+                interactive=True
+            )
+        # Video section (hidden by default) - same as the original
+        with gr.Column(elem_classes="processing-section", visible=False) as video_section:
+            gr.Markdown(UILabels.VIDEO_MODE_TITLE)
+            video_input = gr.File(
+                label=UILabels.VIDEO_INPUT_LABEL,
+                file_types=["video"]
+            )
+            btn_extract_audio = gr.Button(
+                UILabels.EXTRACT_AUDIO_BUTTON,
+                variant="secondary",
+                size="lg"
+            )
+        # Trim options section (same as the original)
+        # NOTE(review): the trim values are passed to the handlers below, but the
+        # handlers defined in this module do not use them — confirm this is intended.
+        with gr.Column(elem_classes="processing-section"):
+            with gr.Accordion(UILabels.TRIM_OPTIONS_TITLE, open=False):
+                with gr.Row():
+                    start_trim_input = gr.Number(
+                        label=UILabels.START_TRIM_LABEL,
+                        value=0,
+                        minimum=0,
+                        maximum=3600
+                    )
+                    end_trim_input = gr.Number(
+                        label=UILabels.END_TRIM_LABEL,
+                        value=0,
+                        minimum=0,
+                        maximum=3600
+                    )
+        # Diarization section (same as the original)
+        with gr.Column(elem_classes="processing-section"):
+            with gr.Accordion(UILabels.DIARIZATION_TITLE, open=False):
+                gr.Markdown(UILabels.DIARIZATION_DESCRIPTION)
+                with gr.Row():
+                    num_speakers_input = gr.Number(
+                        label=UILabels.NUM_SPEAKERS_LABEL,
+                        value=None,
+                        minimum=1,
+                        maximum=10,
+                        placeholder=UILabels.NUM_SPEAKERS_PLACEHOLDER
+                    )
+                btn_diarize = gr.Button(
+                    UILabels.DIARIZE_BUTTON,
+                    variant="secondary",
+                    size="lg"
+                )
+                # Reference segments section
+                gr.Markdown(UILabels.REFERENCE_SEGMENTS_TITLE)
+                gr.Markdown(UILabels.REFERENCE_SEGMENTS_DESCRIPTION)
+                speaker_buttons = gr.Radio(
+                    label=UILabels.SPEAKERS_DETECTED_LABEL,
+                    choices=[],
+                    visible=False
+                )
+                reference_audio_player = gr.Audio(
+                    label=UILabels.REFERENCE_AUDIO_LABEL,
+                    type="filepath",
+                    interactive=False,
+                    visible=True
+                )
+                # Speaker renaming section
+                with gr.Column(visible=False) as rename_section:
+                    gr.Markdown(UILabels.SPEAKER_RENAME_TITLE)
+                    with gr.Row():
+                        speaker_name_input = gr.Textbox(
+                            label=UILabels.SPEAKER_NAME_LABEL,
+                            placeholder=UILabels.SPEAKER_NAME_PLACEHOLDER
+                        )
+                    btn_apply_rename = gr.Button(
+                        UILabels.APPLY_RENAME_BUTTON,
+                        variant="primary",
+                        size="sm"
+                    )
+                    renamed_speakers_output = gr.Textbox(
+                        label=UILabels.IDENTIFIED_SPEAKERS_LABEL,
+                        value="",
+                        lines=5,
+                        interactive=False,
+                        visible=False
+                    )
+        # Main analysis section (same as the original)
+        with gr.Column(elem_classes="processing-section"):
+            gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
+            gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
+            # Chunk size control
+            chunk_duration_slider = gr.Slider(
+                minimum=5,
+                maximum=25,
+                value=15,
+                step=5,
+                label=UILabels.CHUNK_DURATION_LABEL
+            )
+            # Summary sections configuration
+            gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
+            gr.Markdown(UILabels.SUMMARY_SECTIONS_DESCRIPTION)
+            # Quick preset buttons
+            with gr.Row():
+                btn_preset_action = gr.Button(UILabels.PRESET_ACTION_BUTTON, variant="secondary", size="sm")
+                btn_preset_info = gr.Button(UILabels.PRESET_INFO_BUTTON, variant="secondary", size="sm")
+                btn_preset_complet = gr.Button(UILabels.PRESET_COMPLETE_BUTTON, variant="secondary", size="sm")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(UILabels.ACTION_SECTIONS_TITLE)
+                    section_resume_executif = gr.Checkbox(label=UILabels.SECTION_EXECUTIVE_SUMMARY, value=True)
+                    section_discussions = gr.Checkbox(label=UILabels.SECTION_MAIN_DISCUSSIONS, value=True)
+                    section_plan_action = gr.Checkbox(label=UILabels.SECTION_ACTION_PLAN, value=True)
+                    section_decisions = gr.Checkbox(label=UILabels.SECTION_DECISIONS, value=True)
+                    section_prochaines_etapes = gr.Checkbox(label=UILabels.SECTION_NEXT_STEPS, value=True)
+                with gr.Column():
+                    gr.Markdown(UILabels.INFO_SECTIONS_TITLE)
+                    section_sujets_principaux = gr.Checkbox(label=UILabels.SECTION_MAIN_TOPICS, value=False)
+                    section_points_importants = gr.Checkbox(label=UILabels.SECTION_KEY_POINTS, value=False)
+                    section_questions = gr.Checkbox(label=UILabels.SECTION_QUESTIONS, value=False)
+                    section_elements_suivi = gr.Checkbox(label=UILabels.SECTION_FOLLOW_UP, value=False)
+            btn_direct_transcribe = gr.Button(
+                UILabels.ANALYZE_BUTTON,
+                variant="primary",
+                size="lg"
+            )
+        # Results section (same as the original)
+        with gr.Column(elem_classes="results-section"):
+            gr.Markdown(UILabels.RESULTS_TITLE)
+            final_summary_output = gr.Markdown(
+                value=UILabels.RESULTS_PLACEHOLDER,
+                label=UILabels.RESULTS_LABEL,
+                height=500
+            )
+        # Event handlers (adapted for HF Spaces)
+        # Input mode change handling
+        input_mode.change(
+            fn=handle_input_mode_change,
+            inputs=[input_mode],
+            outputs=[audio_section, video_section]
+        )
+        # Audio extraction from video
+        # NOTE(review): the input and output gr.State("french") are two separate
+        # State objects created inline — confirm the language value is not
+        # expected to be read back anywhere.
+        btn_extract_audio.click(
+            fn=extract_audio_from_video,
+            inputs=[video_input, gr.State("french")],
+            outputs=[audio_input, audio_section, video_section, input_mode, gr.State("french")]
+        )
+        # Section preset functions (same as the original)
+        # Each returns one boolean per section checkbox, in the order used by
+        # the `outputs` lists of the preset buttons below.
+        def preset_action():
+            return (True, True, True, True, True, False, False, False, False)
+        def preset_info():
+            return (True, False, False, False, False, True, True, True, True)
+        def preset_complet():
+            return (True, True, True, True, True, True, True, True, True)
+        # Direct analysis handling (adapted for Transformers only)
+        def handle_analysis_direct(
+            audio_file, hf_token, language, local_model, local_precision, start_trim, end_trim, chunk_duration,
+            s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
+            s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
+        ):
+            """Collect the checked sections and delegate to handle_direct_transcription."""
+            # Transformers mode only
+            transcription_mode = f"Transformers ({local_model} ({local_precision}))"
+            model_key = local_model
+            # Build the list of selected sections
+            sections_checkboxes = [
+                (s_resume, "resume_executif"),
+                (s_discussions, "discussions_principales"),
+                (s_plan_action, "plan_action"),
+                (s_decisions, "decisions_prises"),
+                (s_prochaines_etapes, "prochaines_etapes"),
+                (s_sujets_principaux, "sujets_principaux"),
+                (s_points_importants, "points_importants"),
+                (s_questions, "questions_discussions"),
+                (s_elements_suivi, "elements_suivi")
+            ]
+            selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]
+            # Call the direct analysis function; the module-level diarization
+            # context (set by handle_diarization) is passed as speaker data
+            _, summary = handle_direct_transcription(
+                audio_file, hf_token, language, transcription_mode,
+                model_key, selected_sections, current_diarization_context, start_trim, end_trim, chunk_duration
+            )
+            return summary
+        # Preset events (same as the original)
+        btn_preset_action.click(
+            fn=preset_action,
+            outputs=[
+                section_resume_executif, section_discussions, section_plan_action,
+                section_decisions, section_prochaines_etapes, section_sujets_principaux,
+                section_points_importants, section_questions, section_elements_suivi
+            ]
+        )
+        btn_preset_info.click(
+            fn=preset_info,
+            outputs=[
+                section_resume_executif, section_discussions, section_plan_action,
+                section_decisions, section_prochaines_etapes, section_sujets_principaux,
+                section_points_importants, section_questions, section_elements_suivi
+            ]
+        )
+        btn_preset_complet.click(
+            fn=preset_complet,
+            outputs=[
+                section_resume_executif, section_discussions, section_plan_action,
+                section_decisions, section_prochaines_etapes, section_sujets_principaux,
+                section_points_importants, section_questions, section_elements_suivi
+            ]
+        )
+        # Main analysis (adapted for HF Spaces)
+        btn_direct_transcribe.click(
+            fn=handle_analysis_direct,
+            inputs=[
+                audio_input,
+                gr.State(value=hf_token),
+                gr.State("french"),
+                local_model_choice,
+                local_precision_choice,
+                start_trim_input,
+                end_trim_input,
+                chunk_duration_slider,
+                section_resume_executif,
+                section_discussions,
+                section_plan_action,
+                section_decisions,
+                section_prochaines_etapes,
+                section_sujets_principaux,
+                section_points_importants,
+                section_questions,
+                section_elements_suivi
+            ],
+            outputs=[final_summary_output]
+        )
+        # Diarization handling (adapted for HF Spaces)
+        btn_diarize.click(
+            fn=handle_diarization,
+            inputs=[audio_input, gr.State(value=hf_token), num_speakers_input, start_trim_input, end_trim_input],
+            outputs=[speaker_buttons, reference_audio_player, rename_section]
+        )
+        # Speaker selection handling
+        speaker_buttons.change(
+            fn=handle_speaker_selection,
+            inputs=[speaker_buttons, speaker_name_input],
+            outputs=[reference_audio_player, speaker_name_input]
+        )
+        # Rename handling
+        # NOTE(review): the same component is listed twice as output, so both
+        # values returned by handle_speaker_rename target renamed_speakers_output —
+        # confirm whether the second output was meant to be another component.
+        btn_apply_rename.click(
+            fn=handle_speaker_rename,
+            inputs=[speaker_name_input],
+            outputs=[renamed_speakers_output, renamed_speakers_output]
+        )
+        # Footer (same as the original)
+        with gr.Row():
+            gr.Markdown(
+                """
+                ---
+                **MeetingNotes** | Powered by [Voxtral](https://mistral.ai/) |
+                🚀 Intelligent meeting analysis | 💾 HF Spaces with Zero GPU
+                """,
+                elem_classes="footer-info"
+            )
+    return demo

src/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Utilities for HF Spaces version."""
+from .zero_gpu_manager import ZeroGPUManager, gpu_inference, gpu_model_loading, gpu_long_task
+from .token_tracker import TokenTracker
+__all__ = ['ZeroGPUManager', 'gpu_inference', 'gpu_model_loading', 'gpu_long_task', 'TokenTracker']

src/utils/token_tracker.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""
+Token usage tracking utility for MeetingNotes HF Spaces.
+This module provides a centralized way to track and report token consumption
+for Transformers-based processing in HF Spaces environment.
+"""
class TokenTracker:
    """
    Centralized token usage tracking for HF Spaces.

    Tracks input and output tokens across chunk analysis and synthesis
    passes, prints a one-line report for every recorded step, and can print
    an aggregate summary. All counters are plain public attributes:
    ``chunks_processed``, ``total_input_tokens``, ``total_output_tokens``,
    ``synthesis_input_tokens`` and ``synthesis_output_tokens``.
    """

    def __init__(self, mode: str = "Transformers-8bit"):
        """
        Create a tracker with every counter set to zero.

        Args:
            mode (str): Label of the processing mode; only used in the
                printed reports.
        """
        self.mode = mode
        self.reset()

    def reset(self) -> None:
        """Reset all counters to zero (the mode label is left untouched)."""
        self.chunks_processed = 0
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.synthesis_input_tokens = 0
        self.synthesis_output_tokens = 0

    def set_mode(self, mode: str) -> None:
        """
        Set the processing mode label used for reporting.

        Args:
            mode (str): New label shown in subsequent printed reports.
        """
        self.mode = mode

    def add_chunk_tokens(self, input_tokens: int, output_tokens: int) -> None:
        """
        Record the token usage of one processed chunk.

        Increments the chunk counter, adds the values to the running chunk
        totals and prints a per-chunk report line.

        Args:
            input_tokens (int): Tokens consumed as input for this chunk.
            output_tokens (int): Tokens generated for this chunk.
        """
        self.chunks_processed += 1
        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        print(f"📊 Stats {self.mode} Chunk {self.chunks_processed} - Input: {input_tokens} tokens, Output: {output_tokens} tokens")

    def add_synthesis_tokens(self, input_tokens: int, output_tokens: int) -> None:
        """
        Record the token usage of one synthesis pass.

        Values are accumulated, so several synthesis passes between two
        ``reset()`` calls are all reflected in the summary.

        Args:
            input_tokens (int): Tokens consumed as input for this pass.
            output_tokens (int): Tokens generated by this pass.
        """
        # Accumulate (like add_chunk_tokens) instead of overwriting, so an
        # earlier synthesis pass is not silently dropped from the totals.
        self.synthesis_input_tokens += input_tokens
        self.synthesis_output_tokens += output_tokens
        print(f"📊 Stats {self.mode} Synthesis - Input: {input_tokens} tokens, Output: {output_tokens} tokens")

    def print_summary(self) -> None:
        """
        Print the final token usage summary.

        Totals combine chunk and synthesis counters. The chunk/synthesis
        breakdown is only printed when synthesis input tokens were recorded.
        """
        total_input = self.total_input_tokens + self.synthesis_input_tokens
        total_output = self.total_output_tokens + self.synthesis_output_tokens
        grand_total = total_input + total_output
        print(f"\n📊 === TOKEN USAGE SUMMARY ({self.mode}) ===")
        print(f"📦 Chunks processed: {self.chunks_processed}")
        print(f"📥 Total input tokens: {total_input:,}")
        print(f"📤 Total output tokens: {total_output:,}")
        print(f"🔢 Grand total: {grand_total:,} tokens")
        if self.synthesis_input_tokens > 0:
            print(f"   • Chunk analysis: {self.total_input_tokens + self.total_output_tokens:,} tokens")
            print(f"   • Final synthesis: {self.synthesis_input_tokens + self.synthesis_output_tokens:,} tokens")
        print("=" * 50)

src/utils/zero_gpu_manager.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""
+Zero GPU management for Hugging Face Spaces.
+This module provides decorators and utilities for efficient GPU usage
+in HF Spaces environment with automatic resource management.
+"""
+import functools
+import gc
+import os
+import torch
+from typing import Callable, Any
+# Import spaces if available (HF Spaces environment)
+try:
+    import spaces
+except ImportError:
+    spaces = None
class ZeroGPUManager:
    """
    Manager for Zero GPU operations in HF Spaces.

    Selects a torch device and default dtype at construction time and
    provides helpers to wrap functions for GPU allocation, release cached
    accelerator memory and report memory usage.
    """

    def __init__(self):
        """
        Select the compute device and the matching default dtype.

        MPS is probed first, then CUDA; CPU is the fallback. ``is_spaces``
        is True when the ``SPACE_ID`` environment variable is set.
        """
        # Device selection with MPS support for local Mac testing
        if torch.backends.mps.is_available():
            self.device = "mps"
            self.dtype = torch.float16  # MPS works better with float16
            print("🚀 Using MPS (Apple Silicon) for local testing")
        elif torch.cuda.is_available():
            self.device = "cuda"
            self.dtype = torch.bfloat16  # CUDA supports bfloat16
            print("🚀 Using CUDA GPU")
        else:
            self.device = "cpu"
            self.dtype = torch.float16  # CPU with float16 to save memory
            print("⚠️ Using CPU")
        self.is_spaces = os.getenv("SPACE_ID") is not None

    @staticmethod
    def gpu_task(duration: int = 60):
        """
        Decorator factory for GPU-intensive tasks.

        Args:
            duration: Expected duration in seconds for GPU allocation

        Returns:
            A decorator. When the ``spaces`` package is importable and
            exposes ``GPU``, the decorated function is wrapped with
            ``spaces.GPU(duration=duration)``; otherwise the function is
            returned unchanged.
        """
        def decorator(func: Callable) -> Callable:
            if spaces is not None and hasattr(spaces, 'GPU'):
                # Use HF Spaces GPU decorator
                return spaces.GPU(duration=duration)(func)
            # Fallback for local development: no wrapping needed
            return func
        return decorator

    @staticmethod
    def cleanup_gpu():
        """Clean up GPU memory after processing (CUDA/MPS/CPU)."""
        # Run the garbage collector first: tensors that are only kept alive
        # by reference cycles are released here, so the cache flush below can
        # actually hand their memory back.
        gc.collect()
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()
        elif torch.cuda.is_available():
            torch.cuda.empty_cache()

    def get_device(self) -> str:
        """Get the device name chosen at construction ("mps", "cuda" or "cpu")."""
        return self.device

    def is_gpu_available(self) -> bool:
        """Check if GPU (CUDA or MPS) is available."""
        return torch.cuda.is_available() or torch.backends.mps.is_available()

    def is_spaces_environment(self) -> bool:
        """Check if running in HF Spaces environment (``SPACE_ID`` was set)."""
        return self.is_spaces

    def get_memory_info(self) -> dict:
        """
        Get current GPU memory information (CUDA or MPS).

        Returns:
            dict: Always contains ``available`` and ``device``. For CUDA it
            also holds ``allocated``, ``cached`` and ``total``; for MPS it
            holds ``allocated``, ``driver_allocated`` and a placeholder
            ``total`` string. On CPU only the two base keys are present.
        """
        if torch.cuda.is_available():
            # Query every figure for the same (current) device so that
            # "total" matches "allocated"/"cached" on multi-GPU hosts.
            device_index = torch.cuda.current_device()
            return {
                "available": True,
                "device": "cuda",
                "allocated": torch.cuda.memory_allocated(device_index),
                "cached": torch.cuda.memory_reserved(device_index),
                "total": torch.cuda.get_device_properties(device_index).total_memory
            }
        elif torch.backends.mps.is_available():
            return {
                "available": True,
                "device": "mps",
                "allocated": torch.mps.current_allocated_memory(),
                "driver_allocated": torch.mps.driver_allocated_memory(),
                # MPS doesn't have total memory info readily available
                "total": "N/A (MPS)"
            }
        else:
            return {"available": False, "device": "cpu"}
+# Convenience decorators
def gpu_inference(duration: int = 60):
    """
    Build the GPU-allocation decorator used for inference calls.

    Args:
        duration: Requested GPU allocation time in seconds (default 60).

    Returns:
        The decorator produced by ``ZeroGPUManager.gpu_task``.
    """
    make_decorator = ZeroGPUManager.gpu_task
    return make_decorator(duration=duration)
def gpu_model_loading(duration: int = 120):
    """
    Build the GPU-allocation decorator used while loading models.

    Args:
        duration: Requested GPU allocation time in seconds (default 120).

    Returns:
        The decorator produced by ``ZeroGPUManager.gpu_task``.
    """
    make_decorator = ZeroGPUManager.gpu_task
    return make_decorator(duration=duration)
def gpu_long_task(duration: int = 300):
    """
    Build the GPU-allocation decorator used for long-running processing.

    Args:
        duration: Requested GPU allocation time in seconds (default 300).

    Returns:
        The decorator produced by ``ZeroGPUManager.gpu_task``.
    """
    make_decorator = ZeroGPUManager.gpu_task
    return make_decorator(duration=duration)