# kotoba-speech/fam/llm/fast_inference.py
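"""Fast two-stage text-to-speech inference for the Kotoba-Speech model.

A first-stage LLM generates audio tokens from text and a speaker embedding; a
second-stage model (LLM + multi-band diffusion) turns them into a waveform, which
is then enhanced with DeepFilterNet before being written to disk.
"""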
import argparse
import os
import shutil
import tempfile
import time
from pathlib import Path
from typing import Optional
import librosa
import torch
from huggingface_hub import snapshot_download
from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
EncodecDecoder,
InferenceConfig,
Model,
TiltedEncodec,
TrainedBPETokeniser,
get_cached_embedding,
get_cached_file,
get_enhancer,
)
from fam.llm.utils import (
check_audio_file,
get_default_dtype,
get_device,
normalize_text,
)
class TTS:
    def __init__(
        self,
        model_name: str = "kotoba-tech/kotoba-speech-v0.1",
        *,
        seed: int = 1337,
        output_dir: str = "outputs",
        first_model_path: Optional[str] = None,
    ):
"""
model_name (str): refers to the model identifier from the Hugging Face Model Hub (https://huggingface.co/kotoba-tech/)
"""
# NOTE: this needs to come first so that we don't change global state when we want to use
# the torch.compiled-model.
self._dtype = get_default_dtype()
self._device = get_device()
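        # Download the pretrained checkpoints (first_stage.pt, second_stage.pt,
        # speaker_encoder.pt) from the Hugging Face Hub.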
self._model_dir = snapshot_download(repo_id=model_name)
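        # Adapter that splits the first-stage token stream into text ids and EnCodec audio ids.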
self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
self.output_dir = output_dir
os.makedirs(self.output_dir, exist_ok=True)
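        # Configuration for the second-stage model, resumed from second_stage.pt.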
second_stage_ckpt_path = f"{self._model_dir}/second_stage.pt"
config_second_stage = InferenceConfig(
ckpt_path=second_stage_ckpt_path,
num_samples=1,
seed=seed,
device=self._device,
dtype=self._dtype,
compile=False,
init_from="resume",
output_dir=self.output_dir,
)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
self.llm_second_stage = Model(
config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
)
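        # DeepFilterNet ("df") enhancer used to clean up the synthesised audio.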
self.enhancer = get_enhancer("df")
self.precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[self._dtype]
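        # Build (and torch.compile) the first-stage model and the speaker-embedding model.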
self.model, self.tokenizer, self.smodel, self.model_size = build_model(
precision=self.precision,
checkpoint_path=Path(f"{self._model_dir}/first_stage.pt"),
spk_emb_ckpt_path=Path(f"{self._model_dir}/speaker_encoder.pt"),
device=self._device,
compile=True,
compile_prefill=True,
first_model_path=first_model_path,
)
def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.0, temperature=1.0) -> str:
"""
        text: Text to speak.
        spk_ref_path: Path to the speaker reference file; a minimum of 30 s of audio is required. Supports both local paths and public URIs. Audio formats: wav, flac & mp3.
        top_p: Top-p for sampling, applied to the first-stage model. Values in the range [0.9, 1.0] work well. This controls speech stability and improves text following for challenging speakers.
        guidance_scale: Guidance scale for sampling, in the range [1.0, 3.0]. This controls speaker similarity, i.e. how closely the output matches the reference speaker's identity and speech style.
        temperature: Temperature for sampling, applied to both LLMs (first & second stage).
        returns: path to the generated speech .wav file
"""
text = normalize_text(text)
spk_ref_path = get_cached_file(spk_ref_path)
check_audio_file(spk_ref_path)
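        # Compute (and cache) the speaker embedding from the reference audio.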
spk_emb = get_cached_embedding(
spk_ref_path,
self.smodel,
).to(device=self._device, dtype=self.precision)
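        # Start timing so the real-time factor can be reported at the end.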
start = time.time()
# first stage LLM
tokens = main(
model=self.model,
tokenizer=self.tokenizer,
model_size=self.model_size,
prompt=text,
spk_emb=spk_emb,
top_p=torch.tensor(top_p, device=self._device, dtype=self.precision),
guidance_scale=torch.tensor(guidance_scale, device=self._device, dtype=self.precision),
temperature=torch.tensor(temperature, device=self._device, dtype=self.precision),
)
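        # Split the first-stage output into text ids and extracted EnCodec audio ids.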
text_ids, extracted_audio_ids = self.first_stage_adapter.decode([tokens])
b_speaker_embs = spk_emb.unsqueeze(0)
# second stage LLM + multi-band diffusion model
wav_files = self.llm_second_stage(
texts=[text],
encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=self._device).unsqueeze(0)],
speaker_embs=b_speaker_embs,
batch_size=1,
guidance_scale=None,
top_p=None,
top_k=200,
temperature=1.0,
max_new_tokens=None,
)
# enhance using deepfilternet
wav_file = wav_files[0]
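        # Enhance into a temporary file, then overwrite the original wav in place.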
with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
self.enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")
print(f"\nSaved audio to {wav_file}.wav")
# calculating real-time factor (RTF)
time_to_synth_s = time.time() - start
audio, sr = librosa.load(str(wav_file) + ".wav")
duration_s = librosa.get_duration(y=audio, sr=sr)
print(f"\nTotal time to synth (s): {time_to_synth_s}")
print(f"Real-time factor: {time_to_synth_s / duration_s:.2f}")
return str(wav_file) + ".wav"
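
# Minimal CLI entry point for a one-shot synthesis demo.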
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Kotoba-Speech text-to-speech demo")
parser.add_argument("--text", type=str, default="This is a demo for text-to-speech.", help="input text")
parser.add_argument("--spk_ref_path", type=str, default="assets/bria.mp3", help="speaker path")
parser.add_argument("--first_model_path", type=str, default=None, help="path to the first model")
args = parser.parse_args()
tts = TTS(first_model_path=args.first_model_path)
tts.synthesise(text=args.text, spk_ref_path=args.spk_ref_path)