"""Inference script for VoxCPM with LoRA weights: loads the pretrained base
model, applies a LoRA checkpoint, and synthesizes speech from text."""

import argparse
import os
import re
from pathlib import Path

import soundfile as sf
import torch

from voxcpm.model import VoxCPMModel
from voxcpm.model.voxcpm import LoRAConfig
from voxcpm.training.config import load_yaml_config
from voxcpm.utils.text_normalize import TextNormalizer
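
# Example invocation (a sketch: the script filename and all paths below are
# placeholders, not taken from the repository):
#
#   python infer_lora.py \
#       --lora_ckpt checkpoints/lora/step_2000 \
#       --lora_config_path configs/lora_finetune.yaml \
#       --text "Hello from the fine-tuned voice."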


def main():
    parser = argparse.ArgumentParser(description="VoxCPM inference with LoRA weights")
    parser.add_argument("--lora_ckpt", type=str, required=True, help="Path to the LoRA checkpoint")
    parser.add_argument("--lora_config_path", type=str, required=True, help="YAML config used for LoRA training")
    parser.add_argument("--text", type=str, help="A single text to synthesize")
    parser.add_argument("--text_file", type=str, help="File with one '<wav_id>||<text>' entry per line")
    parser.add_argument("--output_dir", type=str, default="outputs")
    parser.add_argument("--cfg_value", type=float, default=2.0, help="Classifier-free guidance strength")
    parser.add_argument("--inference_timesteps", type=int, default=10)
    args = parser.parse_args()

    if not (args.text or args.text_file):
        parser.error("Please provide either --text or --text_file")

    # Read the training config to recover the base-model path and LoRA hyperparameters.
    cfg = load_yaml_config(args.lora_config_path)
    pretrained_path = cfg["pretrained_path"]
    lora_cfg_dict = cfg.get("lora", {}) or {}
    lora_cfg = LoRAConfig(**lora_cfg_dict) if lora_cfg_dict else None
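
    # Sketch of the YAML layout this script expects. Only "pretrained_path" and
    # the optional "lora" mapping are read above; the sub-keys under "lora" are
    # illustrative assumptions and must match LoRAConfig's actual fields:
    #
    #   pretrained_path: /path/to/voxcpm_base
    #   lora:
    #     r: 16
    #     lora_alpha: 32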

    print(f"[1/3] Loading base model: {pretrained_path}")
    model = VoxCPMModel.from_local(
        pretrained_path,
        optimize=True,
        training=False,  # inference only; LoRA weights are loaded separately below
        lora_config=lora_cfg,
    )

    # Used below to normalize input text before generation.
    text_normalizer = TextNormalizer()

    ckpt_dir = Path(args.lora_ckpt)
    if not ckpt_dir.exists():
        raise FileNotFoundError(f"LoRA checkpoint not found: {ckpt_dir}")

    print(f"[2/3] Loading LoRA weights: {ckpt_dir}")
    loaded, skipped = model.load_lora_weights(str(ckpt_dir))
    print(f"  Loaded {len(loaded)} parameters")
    if skipped:
        print(f"[WARNING] Skipped {len(skipped)} parameters")
        print(f"  Skipped keys (first 5): {skipped[:5]}")

    print("\n[3/3] Running inference...")

    def synthesize(raw_text: str, out_path: str) -> None:
        """Normalize the text, run the model, and write the result to out_path."""
        with torch.inference_mode():
            target_text = re.sub(r"\s+", " ", raw_text.replace("\n", " "))
            target_text = text_normalizer.normalize(target_text)
            wav = model.generate(
                target_text=target_text,
                cfg_value=args.cfg_value,
                inference_timesteps=args.inference_timesteps,
                retry_badcase=True,
                retry_badcase_max_times=3,
                retry_badcase_ratio_threshold=6.0,
            )
        # model.generate may return (1, T) or (T,); squeeze to mono samples.
        audio_np = wav.squeeze(0).cpu().numpy() if wav.dim() > 1 else wav.cpu().numpy()
        os.makedirs(args.output_dir, exist_ok=True)
        sf.write(out_path, audio_np, 16000)  # 16 kHz, as in the original script
        print(f"saved: {out_path}")

    if args.text:
        synthesize(args.text, f"{args.output_dir}/output_lora.wav")
    elif args.text_file:
        # Each line of text_file is "<wav_id>||<text>"; everything after the
        # first "||" is joined back together as the text to synthesize.
        texts = []
        with open(args.text_file, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # skip blank lines
                parts = line.split("||")
                texts.append((parts[0], " ".join(parts[1:])))
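
        # A minimal text_file might look like this (IDs and sentences are
        # placeholders):
        #   utt_0001||Hello there.
        #   utt_0002||This is the second utterance.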

        for wav_id, text in texts:
            synthesize(text, f"{args.output_dir}/{wav_id}.wav")


if __name__ == "__main__":
    main()