| |
| """ |
| VoxCPM Command Line Interface |
| |
| Unified CLI for voice cloning, direct TTS synthesis, and batch processing. |
| """ |
|
|
| import argparse |
| import os |
| import sys |
| from pathlib import Path |
| import soundfile as sf |
|
|
| from voxcpm.core import VoxCPM |
|
|
|
|
| |
| |
| |
|
|
| def validate_file_exists(file_path: str, file_type: str = "file") -> Path: |
| path = Path(file_path) |
| if not path.exists(): |
| raise FileNotFoundError(f"{file_type} '{file_path}' does not exist") |
| return path |
|
|
|
|
| def validate_output_path(output_path: str) -> Path: |
| path = Path(output_path) |
| path.parent.mkdir(parents=True, exist_ok=True) |
| return path |
|
|
|
|
| def validate_ranges(args, parser): |
| """Validate numeric argument ranges.""" |
| if not (0.1 <= args.cfg_value <= 10.0): |
| parser.error("--cfg-value must be between 0.1 and 10.0") |
|
|
| if not (1 <= args.inference_timesteps <= 100): |
| parser.error("--inference-timesteps must be between 1 and 100") |
|
|
| if args.lora_r <= 0: |
| parser.error("--lora-r must be a positive integer") |
|
|
| if args.lora_alpha <= 0: |
| parser.error("--lora-alpha must be a positive integer") |
|
|
| if not (0.0 <= args.lora_dropout <= 1.0): |
| parser.error("--lora-dropout must be between 0.0 and 1.0") |
|
|
|
|
| |
| |
| |
|
|
| def load_model(args) -> VoxCPM: |
| print("Loading VoxCPM model...", file=sys.stderr) |
|
|
| zipenhancer_path = getattr(args, "zipenhancer_path", None) or os.environ.get( |
| "ZIPENHANCER_MODEL_PATH", None |
| ) |
|
|
| |
| lora_config = None |
| lora_weights_path = getattr(args, "lora_path", None) |
| if lora_weights_path: |
| from voxcpm.model.voxcpm import LoRAConfig |
|
|
| lora_config = LoRAConfig( |
| enable_lm=not args.lora_disable_lm, |
| enable_dit=not args.lora_disable_dit, |
| enable_proj=args.lora_enable_proj, |
| r=args.lora_r, |
| alpha=args.lora_alpha, |
| dropout=args.lora_dropout, |
| ) |
|
|
| print( |
| f"LoRA config: r={lora_config.r}, alpha={lora_config.alpha}, " |
| f"lm={lora_config.enable_lm}, dit={lora_config.enable_dit}, proj={lora_config.enable_proj}", |
| file=sys.stderr, |
| ) |
|
|
| |
| if args.model_path: |
| try: |
| model = VoxCPM( |
| voxcpm_model_path=args.model_path, |
| zipenhancer_model_path=zipenhancer_path, |
| enable_denoiser=not args.no_denoiser, |
| lora_config=lora_config, |
| lora_weights_path=lora_weights_path, |
| ) |
| print("Model loaded (local).", file=sys.stderr) |
| return model |
| except Exception as e: |
| print(f"Failed to load model (local): {e}", file=sys.stderr) |
| sys.exit(1) |
|
|
| |
| try: |
| model = VoxCPM.from_pretrained( |
| hf_model_id=args.hf_model_id, |
| load_denoiser=not args.no_denoiser, |
| zipenhancer_model_id=zipenhancer_path, |
| cache_dir=args.cache_dir, |
| local_files_only=args.local_files_only, |
| lora_config=lora_config, |
| lora_weights_path=lora_weights_path, |
| ) |
| print("Model loaded (from_pretrained).", file=sys.stderr) |
| return model |
| except Exception as e: |
| print(f"Failed to load model (from_pretrained): {e}", file=sys.stderr) |
| sys.exit(1) |
|
|
|
|
| |
| |
| |
|
|
| def cmd_clone(args): |
| if not args.text: |
| sys.exit("Error: Please provide --text for synthesis") |
|
|
| if not args.prompt_audio or not args.prompt_text: |
| sys.exit("Error: Voice cloning requires both --prompt-audio and --prompt-text") |
|
|
| prompt_audio_path = validate_file_exists(args.prompt_audio, "reference audio file") |
| output_path = validate_output_path(args.output) |
|
|
| model = load_model(args) |
|
|
| audio_array = model.generate( |
| text=args.text, |
| prompt_wav_path=str(prompt_audio_path), |
| prompt_text=args.prompt_text, |
| cfg_value=args.cfg_value, |
| inference_timesteps=args.inference_timesteps, |
| normalize=args.normalize, |
| denoise=args.denoise, |
| ) |
|
|
| sf.write(str(output_path), audio_array, model.tts_model.sample_rate) |
|
|
| duration = len(audio_array) / model.tts_model.sample_rate |
| print(f"Saved audio to: {output_path} ({duration:.2f}s)", file=sys.stderr) |
|
|
|
|
| def cmd_synthesize(args): |
| if not args.text: |
| sys.exit("Error: Please provide --text for synthesis") |
|
|
| output_path = validate_output_path(args.output) |
| model = load_model(args) |
|
|
| audio_array = model.generate( |
| text=args.text, |
| prompt_wav_path=None, |
| prompt_text=None, |
| cfg_value=args.cfg_value, |
| inference_timesteps=args.inference_timesteps, |
| normalize=args.normalize, |
| denoise=False, |
| ) |
|
|
| sf.write(str(output_path), audio_array, model.tts_model.sample_rate) |
|
|
| duration = len(audio_array) / model.tts_model.sample_rate |
| print(f"Saved audio to: {output_path} ({duration:.2f}s)", file=sys.stderr) |
|
|
|
|
| def cmd_batch(args): |
| input_file = validate_file_exists(args.input, "input file") |
| output_dir = Path(args.output_dir) |
| output_dir.mkdir(parents=True, exist_ok=True) |
|
|
| with open(input_file, "r", encoding="utf-8") as f: |
| texts = [line.strip() for line in f if line.strip()] |
|
|
| if not texts: |
| sys.exit("Error: Input file is empty") |
|
|
| model = load_model(args) |
|
|
| prompt_audio_path = None |
| if args.prompt_audio: |
| prompt_audio_path = str(validate_file_exists(args.prompt_audio, "reference audio file")) |
|
|
| success_count = 0 |
|
|
| for i, text in enumerate(texts, 1): |
| try: |
| audio_array = model.generate( |
| text=text, |
| prompt_wav_path=prompt_audio_path, |
| prompt_text=args.prompt_text, |
| cfg_value=args.cfg_value, |
| inference_timesteps=args.inference_timesteps, |
| normalize=args.normalize, |
| denoise=args.denoise and prompt_audio_path is not None, |
| ) |
|
|
| output_file = output_dir / f"output_{i:03d}.wav" |
| sf.write(str(output_file), audio_array, model.tts_model.sample_rate) |
|
|
| duration = len(audio_array) / model.tts_model.sample_rate |
| print(f"Saved: {output_file} ({duration:.2f}s)", file=sys.stderr) |
| success_count += 1 |
|
|
| except Exception as e: |
| print(f"Failed on line {i}: {e}", file=sys.stderr) |
|
|
| print(f"\nBatch finished: {success_count}/{len(texts)} succeeded", file=sys.stderr) |
|
|
|
|
| |
| |
| |
|
|
| def _build_unified_parser(): |
| parser = argparse.ArgumentParser( |
| description="VoxCPM CLI - voice cloning, direct TTS, and batch processing", |
| formatter_class=argparse.RawDescriptionHelpFormatter, |
| epilog=""" |
| Examples: |
| voxcpm --text "Hello world" --output out.wav |
| voxcpm --text "Hello" --prompt-audio ref.wav --prompt-text "hi" --output out.wav --denoise |
| voxcpm --input texts.txt --output-dir ./outs |
| """, |
| ) |
|
|
| |
| parser.add_argument("--input", "-i", help="Input text file (batch mode only)") |
| parser.add_argument("--output-dir", "-od", help="Output directory (batch mode only)") |
| parser.add_argument("--text", "-t", help="Text to synthesize (single or clone mode)") |
| parser.add_argument("--output", "-o", help="Output audio file path (single or clone mode)") |
|
|
| |
| parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path (clone mode)") |
| parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio") |
| parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement") |
|
|
| |
| parser.add_argument("--cfg-value", type=float, default=2.0, |
| help="CFG guidance scale (float, recommended 0.5–5.0, default: 2.0)") |
| parser.add_argument("--inference-timesteps", type=int, default=10, |
| help="Inference steps (int, 1–100, default: 10)") |
| parser.add_argument("--normalize", action="store_true", help="Enable text normalization") |
|
|
| |
| parser.add_argument("--model-path", type=str, help="Local VoxCPM model path") |
| parser.add_argument("--hf-model-id", type=str, default="openbmb/VoxCPM1.5", |
| help="Hugging Face repo id (default: openbmb/VoxCPM1.5)") |
| parser.add_argument("--cache-dir", type=str, help="Cache directory for Hub downloads") |
| parser.add_argument("--local-files-only", action="store_true", help="Disable network access") |
| parser.add_argument("--no-denoiser", action="store_true", help="Disable denoiser model loading") |
| parser.add_argument("--zipenhancer-path", type=str, |
| help="ZipEnhancer model id or local path (or env ZIPENHANCER_MODEL_PATH)") |
|
|
| |
| parser.add_argument("--lora-path", type=str, help="Path to LoRA weights") |
| parser.add_argument("--lora-r", type=int, default=32, help="LoRA rank (positive int, default: 32)") |
| parser.add_argument("--lora-alpha", type=int, default=16, help="LoRA alpha (positive int, default: 16)") |
| parser.add_argument("--lora-dropout", type=float, default=0.0, |
| help="LoRA dropout rate (0.0–1.0, default: 0.0)") |
| parser.add_argument("--lora-disable-lm", action="store_true", help="Disable LoRA on LM layers") |
| parser.add_argument("--lora-disable-dit", action="store_true", help="Disable LoRA on DiT layers") |
| parser.add_argument("--lora-enable-proj", action="store_true", help="Enable LoRA on projection layers") |
|
|
| return parser |
|
|
|
|
| |
| |
| |
|
|
| def main(): |
| parser = _build_unified_parser() |
| args = parser.parse_args() |
|
|
| |
| validate_ranges(args, parser) |
|
|
| |
| if args.input and args.text: |
| parser.error("Use either batch mode (--input) or single mode (--text), not both.") |
|
|
| |
| if args.input: |
| if not args.output_dir: |
| parser.error("Batch mode requires --output-dir") |
| return cmd_batch(args) |
|
|
| |
| if not args.text or not args.output: |
| parser.error("Single-sample mode requires --text and --output") |
|
|
| |
| if args.prompt_audio or args.prompt_text: |
| return cmd_clone(args) |
|
|
| |
| return cmd_synthesize(args) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|