Ollama_TTS_RVC / infer_rvc.py
import torch
import librosa
import numpy as np
import soundfile as sf
import argparse
import os


def load_rvc_model(model_path):
    """Load the RVC voice conversion model from a serialized checkpoint."""
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found: {model_path}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Assumes the .pth file is a pickled nn.Module; a plain state-dict
    # checkpoint would need to be loaded into its model class before .eval().
    model = torch.load(model_path, map_location=device)
    model.eval()
    return model


def convert_voice(input_audio, output_audio, model_path, index_path, pitch_shift=0):
    """Convert input audio to the target voice using the RVC model."""
    model = load_rvc_model(model_path)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    audio, sr = librosa.load(input_audio, sr=44100)

    # Apply the requested pitch shift (in semitones) as a simple pre-processing
    # step. Note: index_path is accepted for CLI compatibility but is not used
    # by this simplified conversion call.
    if pitch_shift != 0:
        audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=pitch_shift)

    audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        converted_audio = model(audio)  # Run the RVC model on the waveform

    # Save output (move back to CPU before converting to NumPy)
    sf.write(output_audio, converted_audio.cpu().numpy().squeeze(), sr)
    print(f"Converted voice saved to {output_audio}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Input TTS audio file")
    parser.add_argument("--output", required=True, help="Output converted audio file")
    parser.add_argument("--model", required=True, help="Path to RVC model (e.g., zeldabotw.pth)")
    parser.add_argument("--index", required=True, help="Path to RVC index file (e.g., zeldabotw.index)")
    parser.add_argument("--pitch_shift", type=int, default=0, help="Pitch shift in semitones")
    args = parser.parse_args()

    convert_voice(args.input, args.output, args.model, args.index, args.pitch_shift)
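
# Example invocation (a sketch; the model/index filenames are the sample names
# from the argument help text above, and the .wav paths are placeholders):
#   python infer_rvc.py --input tts_output.wav --output converted.wav \
#       --model zeldabotw.pth --index zeldabotw.index --pitch_shift 2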