Vyvo-Multilingual-v0.1
Multilingual (English + Japanese) voice-cloning TTS. Give a short reference clip, the transcript of that clip, and a target text; the model speaks the target text in the reference voice.
- Backbone: Qwen3-0.6B
- Audio codec: kyutai/mimi (32 codebooks, 24 kHz)
- Use a reference clip for best results.
Usage
import soundfile as sf
import torch
import torchaudio
from transformers import (AutoModelForCausalLM, AutoTokenizer,
AutoFeatureExtractor, MimiModel)
REPO = "Vyvo/Vyvo-Multilingual-v0.1"
# Prefer the GPU, but fall back to the CPU so the example still runs on a
# machine without CUDA instead of failing when the models are moved.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Token layout (must match training)
BASE = 151669  # Qwen3 base vocab; audio ids start above it
NUM_CODEBOOKS = 32  # codebooks interleaved within each audio frame
CODEBOOK_SIZE = 2048  # width of the id range reserved for each codebook
AUDIO_OFFSET = 10  # audio code ids begin at BASE + AUDIO_OFFSET
# Special tokens, expressed as offsets above BASE.
SOS, EOS, SOH, EOH, SOA = 1, 2, 3, 4, 5
# Text side: the tokenizer and the causal LM (bfloat16 weights), both from REPO.
tokenizer = AutoTokenizer.from_pretrained(REPO)
model = AutoModelForCausalLM.from_pretrained(REPO, dtype=torch.bfloat16)
model = model.to(DEVICE).eval()

# Audio side: the Mimi codec and the feature extractor that prepares its input.
mimi = MimiModel.from_pretrained("kyutai/mimi")
mimi = mimi.to(DEVICE).eval()
feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
def encode_reference(path):
    """Load an audio file and encode it to Mimi audio tokens (as LM token ids).

    Args:
        path: Audio file readable by ``soundfile``.

    Returns:
        A flat list of LM token ids laid out frame by frame, with the
        ``NUM_CODEBOOKS`` codes of each frame in codebook order.
    """
    wav, sr = sf.read(path, dtype="float32", always_2d=False)
    if wav.ndim > 1:
        # Multi-channel input: average the channels down to mono.
        wav = wav.mean(axis=1)

    # The codec expects the feature extractor's sampling rate; resample if needed.
    target_sr = feature_extractor.sampling_rate
    if sr != target_sr:
        wav = torchaudio.functional.resample(
            torch.from_numpy(wav).unsqueeze(0), sr, target_sr
        ).squeeze(0).numpy()

    inputs = feature_extractor(
        raw_audio=wav, sampling_rate=target_sr, return_tensors="pt"
    )
    # Inference only: skip autograd bookkeeping while the codec runs.
    with torch.no_grad():
        codes = mimi.encode(
            inputs["input_values"].to(DEVICE), num_quantizers=NUM_CODEBOOKS
        ).audio_codes[0].cpu()  # (codebooks, frames)

    # Flatten frame-major so position i belongs to codebook i % NUM_CODEBOOKS,
    # then shift each code into that codebook's own id range above BASE.
    frame_interleaved = codes.transpose(0, 1).reshape(-1).tolist()
    return [
        code + AUDIO_OFFSET + (i % NUM_CODEBOOKS) * CODEBOOK_SIZE + BASE
        for i, code in enumerate(frame_interleaved)
    ]
def build_prompt(reference_tokens, reference_text, target_text):
    """Assemble the LM prompt ids for voice cloning.

    Layout:
        [SOH] ref_text + target_text [eot] [EOH] [SOA] [SOS] <reference audio>

    The reference and target texts are joined with a single space and
    tokenized without special tokens; ``tokenizer.eos_token_id`` is used as
    the end-of-text marker.
    """
    transcript = reference_text + " " + target_text
    text_ids = tokenizer(transcript, add_special_tokens=False).input_ids

    opening = [BASE + SOH]
    closing = [tokenizer.eos_token_id, BASE + EOH, BASE + SOA, BASE + SOS]
    return opening + text_ids + closing + reference_tokens
def decode_audio(generated_ids):
    """Turn generated LM token ids back into a waveform.

    Ids are read in order; position ``i`` is expected to hold a code for
    codebook ``i % NUM_CODEBOOKS``. Reading stops at the first id that is not
    a valid code for its slot, and a trailing partial frame is dropped.

    Args:
        generated_ids: List of generated token ids, starting at the first
            codebook of a frame.

    Returns:
        The decoded audio as a float32 NumPy array with singleton dimensions
        squeezed out.

    Raises:
        ValueError: If ``generated_ids`` does not contain one complete frame.
    """
    codes = []
    for i, token in enumerate(generated_ids):
        value = token - BASE - AUDIO_OFFSET - (i % NUM_CODEBOOKS) * CODEBOOK_SIZE
        if not 0 <= value < CODEBOOK_SIZE:
            break
        codes.append(value)

    frames = len(codes) // NUM_CODEBOOKS
    if frames == 0:
        # An empty tensor would otherwise fail obscurely inside the codec.
        raise ValueError("generated_ids contains no complete audio frame to decode")

    codes = torch.tensor(
        codes[:frames * NUM_CODEBOOKS], dtype=torch.long
    ).view(frames, NUM_CODEBOOKS)
    codes = codes.t().unsqueeze(0).to(DEVICE)  # (1, codebooks, frames)

    # Decode without autograd: .numpy() raises on a tensor that requires grad.
    with torch.no_grad():
        audio = mimi.decode(codes).audio_values
    return audio.squeeze().cpu().float().numpy()
# --- Voice cloning ---
reference_wav = "reference.wav"
reference_text = "text spoken in the reference clip"
target_text = "Hello, this is a test of the text to speech model."

# Build the prompt: text header followed by the encoded reference audio.
reference_tokens = encode_reference(reference_wav)
prompt = build_prompt(reference_tokens, reference_text, target_text)
input_ids = torch.tensor([prompt], device=DEVICE)
prompt_length = input_ids.shape[1]

# Sampling settings. The token budgets are whole frames of NUM_CODEBOOKS (32)
# tokens: 9600 = 300 frames at most, 960 = 30 frames at least.
generation_kwargs = {
    "max_new_tokens": 9600,
    "min_new_tokens": 960,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "eos_token_id": BASE + EOS,
    "pad_token_id": tokenizer.eos_token_id,
}
output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    **generation_kwargs,
)

# Keep only the newly generated ids (everything after the prompt) and decode.
generated_ids = output[0, prompt_length:].tolist()
audio = decode_audio(generated_ids)
sf.write("output.wav", audio, mimi.config.sampling_rate)
- Downloads last month: 232