Spaces:

yasserrmd
/

VibeVoice

Running on Zero

App Files Files Community

yasserrmd committed on Aug 26

Commit

f736395

verified ·

1 Parent(s): 9a86201

Update app.py

Browse files

Files changed (1) hide show

app.py +131 -69

app.py CHANGED Viewed

@@ -69,86 +69,148 @@ class VibeVoiceDemo:
             return np.array([])
     @GPU
-    def generate_podcast(self, num_speakers: int, script: str,
-                         speaker_1: str = None, speaker_2: str = None,
-                         speaker_3: str = None, speaker_4: str = None,
-                         cfg_scale: float = 1.3):
-        """Final audio generation only (no streaming)."""
-        self.is_generating = True
-        if not script.strip():
-            raise gr.Error("Please provide a script.")
-        if num_speakers < 1 or num_speakers > 4:
-            raise gr.Error("Number of speakers must be 1–4.")
-        selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
-        for i, sp in enumerate(selected):
-            if not sp or sp not in self.available_voices:
-                raise gr.Error(f"Invalid speaker {i+1} selection.")
-        # load voices
-        voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
-        if any(len(v) == 0 for v in voice_samples):
-            raise gr.Error("Failed to load one or more voice samples.")
-        # format script
-        lines = script.strip().split("\n")
-        formatted = []
-        for i, line in enumerate(lines):
-            line = line.strip()
-            if not line:
-                continue
-            if line.startswith("Speaker "):
-                formatted.append(line)
-            else:
-                sp_id = i % num_speakers
-                formatted.append(f"Speaker {sp_id}: {line}")
-        formatted_script = "\n".join(formatted)
-        # processor input
-        inputs = self.processor(
-            text=[formatted_script],
-            voice_samples=[voice_samples],
-            padding=True,
-            return_tensors="pt",
-            return_attention_mask=True,
-        )
-        start = time.time()
-        outputs = self.model.generate(
-            **inputs,
-            max_new_tokens=None,
-            cfg_scale=cfg_scale,
-            tokenizer=self.processor.tokenizer,
-            generation_config={'do_sample': False},
-            verbose=False,
-        )
-        # --- FIX: pull from speech_outputs ---
-        if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
-            audio = outputs.speech_outputs[0].cpu().numpy()
-        else:
-            self.is_generating = False
-            raise gr.Error("❌ No audio was generated by the model.")
-        if audio.ndim > 1:
-            audio = audio.squeeze()
-        sample_rate = 24000
-        # Save automatically to disk
-        os.makedirs("outputs", exist_ok=True)
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
-        sf.write(file_path, audio, sample_rate)
-        print(f"💾 Saved podcast to {file_path}")
-        total_dur = len(audio) / sample_rate
-        log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
-        self.is_generating = False
-        return (sample_rate, audio), log

             return np.array([])
     @GPU
+    def generate_podcast(self,
+                         num_speakers: int,
+                         script: str,
+                         speaker_1: str = None,
+                         speaker_2: str = None,
+                         speaker_3: str = None,
+                         speaker_4: str = None,
+                         cfg_scale: float = 1.3):
+        """Generate a podcast as one audio clip and save it as a WAV file.
+
+        Non-streaming: the whole script is synthesized in a single
+        ``self.model.generate`` call.
+
+        Args:
+            num_speakers: How many of ``speaker_1`` .. ``speaker_4`` to use (1-4).
+            script: Podcast text, one turn per line. Lines that start with
+                ``"Speaker "`` and contain ``":"`` are passed through unchanged;
+                any other non-empty line is prefixed with a rotating
+                ``"Speaker <k>: "`` label, where ``k`` counts from 0.
+            speaker_1, speaker_2, speaker_3, speaker_4: Keys into
+                ``self.available_voices``; only the first ``num_speakers`` are read.
+            cfg_scale: Forwarded as ``cfg_scale`` to ``self.model.generate``.
+
+        Returns:
+            ``((sample_rate, audio), log)`` on success, where ``audio`` is a
+            NumPy array and ``log`` is a multi-line status string.
+            ``(None, error_message)`` on any failure: errors are caught,
+            printed and reported through the second element, not raised.
+
+        Side effects:
+            Writes ``outputs/podcast_<YYYYmmdd_HHMMSS>.wav`` and sets
+            ``self.is_generating`` for the duration of the call.
+        """
+        # Cleared in ``finally`` so every exit path resets the flag exactly once.
+        self.is_generating = True
+        try:
+            # 1. Validate inputs (``script`` may be None if the textbox is unset)
+            if not script or not script.strip():
+                raise gr.Error("Error: Please provide a script.")
+            # Normalize typographic apostrophes to plain ASCII ones
+            script = script.replace("’", "'")
+            if not 1 <= num_speakers <= 4:
+                raise gr.Error("Error: Number of speakers must be between 1 and 4.")
+            # 2. Collect and validate selected speakers
+            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+            for i, speaker_name in enumerate(selected_speakers):
+                if not speaker_name or speaker_name not in self.available_voices:
+                    raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
+            # 3. Build initial log
+            log = f"🎙️ Generating podcast with {num_speakers} speakers\n"
+            log += f"📊 Parameters: CFG Scale={cfg_scale}\n"
+            log += f"🎭 Speakers: {', '.join(selected_speakers)}\n"
+            # 4. Load voice samples; an empty result from read_audio means failure
+            voice_samples = []
+            for speaker_name in selected_speakers:
+                audio_data = self.read_audio(self.available_voices[speaker_name])
+                if len(audio_data) == 0:
+                    raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
+                voice_samples.append(audio_data)
+            log += f"✅ Loaded {len(voice_samples)} voice samples\n"
+            # 5. Parse and format the script
+            formatted_script_lines = []
+            for line in script.strip().split('\n'):
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith('Speaker ') and ':' in line:
+                    # Already labelled (e.g. "Speaker 1: ...") - keep as written
+                    formatted_script_lines.append(line)
+                else:
+                    # Rotate over speakers by the number of turns emitted so far,
+                    # so blank lines do not skip a speaker.
+                    # NOTE(review): auto-assigned ids are 0-based - confirm this
+                    # matches the numbering the processor expects.
+                    speaker_id = len(formatted_script_lines) % num_speakers
+                    formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
+            formatted_script = '\n'.join(formatted_script_lines)
+            log += f"📝 Formatted script with {len(formatted_script_lines)} turns\n"
+            log += "🔄 Processing with VibeVoice...\n"
+            # 6. Prepare inputs for the model (batch of one script)
+            inputs = self.processor(
+                text=[formatted_script],
+                voice_samples=[voice_samples],
+                padding=True,
+                return_tensors="pt",
+                return_attention_mask=True,
+            )
+            # 7. Generate audio
+            start_time = time.time()
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=None,
+                cfg_scale=cfg_scale,
+                tokenizer=self.processor.tokenizer,
+                generation_config={'do_sample': False},
+                verbose=False,  # Verbose is off for cleaner logs
+            )
+            generation_time = time.time() - start_time
+            # 8. Extract audio output. Guard against a missing, None or empty
+            # ``speech_outputs`` before indexing, so that case reports
+            # "no audio" instead of an IndexError/TypeError.
+            speech_outputs = getattr(outputs, 'speech_outputs', None)
+            if (speech_outputs is None
+                    or len(speech_outputs) == 0
+                    or speech_outputs[0] is None):
+                raise gr.Error("❌ Error: No audio was generated by the model. Please try again.")
+            audio = speech_outputs[0].cpu().numpy()
+            # Drop singleton dimensions so the result is a flat sample array
+            if audio.ndim > 1:
+                audio = audio.squeeze()
+            sample_rate = 24000  # Hard-coded output rate used for saving and duration
+            # 9. Save the audio file
+            output_dir = "outputs"
+            os.makedirs(output_dir, exist_ok=True)
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            file_path = os.path.join(output_dir, f"podcast_{timestamp}.wav")
+            sf.write(file_path, audio, sample_rate)
+            print(f"💾 Podcast saved to {file_path}")
+            # 10. Finalize log and return
+            total_duration = len(audio) / sample_rate
+            log += f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
+            log += f"🎵 Final audio duration: {total_duration:.2f} seconds\n"
+            log += f"✅ Successfully saved podcast to: {file_path}\n"
+            return (sample_rate, audio), log
+        except gr.Error as e:
+            # Validation / generation errors raised above: report through the
+            # log output instead of propagating to Gradio.
+            error_msg = f"❌ Input Error: {str(e)}"
+            print(error_msg)
+            return None, error_msg
+        except Exception as e:
+            # Top-level boundary: log the traceback and keep the same
+            # (None, message) return shape as the gr.Error path.
+            error_msg = f"❌ An unexpected error occurred: {str(e)}"
+            print(error_msg)
+            import traceback
+            traceback.print_exc()
+            return None, error_msg
+        finally:
+            self.is_generating = False