|
|
| """
|
| Simple example of using the pre-quantized VibeVoice model
|
| No need for on-the-fly quantization - loads much faster!
|
| """
|
|
|
| import os
|
| import torch
|
| from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
|
| from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
|
|
|
def main():
    """Load a pre-quantized 4-bit VibeVoice model and synthesize a two-speaker clip.

    Paths can be overridden with environment variables; the fall-back values
    are the original hard-coded defaults, so existing usage is unchanged:

      VIBEVOICE_MODEL_PATH  directory containing the pre-quantized model
      VIBEVOICE_VOICES_DIR  directory containing the speaker ``.wav`` samples

    Writes the generated audio to ``quantized_output.wav`` in the current
    working directory.

    Raises:
        FileNotFoundError: if a required voice sample file is missing.
    """
    model_path = os.environ.get(
        "VIBEVOICE_MODEL_PATH",
        "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
    )

    print("Loading pre-quantized VibeVoice 4-bit model...")

    # Processor handles text tokenization and audio pre/post-processing.
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Weights are already quantized on disk, so no on-the-fly quantization
    # happens here; bfloat16 is the compute dtype for non-quantized tensors.
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    memory_gb = torch.cuda.memory_allocated() / 1e9
    print(f"✅ Model loaded! Memory usage: {memory_gb:.1f} GB")

    text = "Speaker 1: Welcome to our podcast! Speaker 2: Thanks for having me!"

    voices_dir = os.environ.get(
        "VIBEVOICE_VOICES_DIR",
        "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices",
    )
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav"),
    ]
    # Fail fast with a clear error instead of a deep stack trace from
    # inside the processor when a sample file is absent.
    missing = [p for p in speaker_voices if not os.path.isfile(p)]
    if missing:
        raise FileNotFoundError(f"Missing voice sample(s): {missing}")

    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    print(f"\nGenerating: '{text}'")
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,  # let the model decide when to stop
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={'do_sample': False},  # greedy / deterministic
        )

    output_path = "quantized_output.wav"
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"✅ Audio saved to: {output_path}")
|
|
|
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()