# Text-to-Speech / inference.py
# NOTE(review): the following lines are HuggingFace Space page artifacts
# ("zyingt's picture", "support timbre confusion", commit "8d06e90 verified"),
# not Python code — commented out so the module parses.
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from argparse import ArgumentParser
import os
from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
from models.tts.vits.vits_inference import VitsInference
from models.tts.valle.valle_inference import VALLEInference
from utils.util import load_config
import torch
def build_inference(args, cfg):
    """Instantiate the inference object matching ``cfg.model_type``.

    Args:
        args: Parsed command-line arguments, forwarded to the inference class.
        cfg: Loaded configuration; ``cfg.model_type`` selects the model.

    Returns:
        An inference instance for the configured TTS model.

    Raises:
        KeyError: If ``cfg.model_type`` is not one of the supported types.
    """
    # Dispatch table from model-type identifier to inference implementation.
    factory = {
        "FastSpeech2": FastSpeech2Inference,
        "VITS": VitsInference,
        "VALLE": VALLEInference,
    }
    inference_cls = factory[cfg.model_type]
    return inference_cls(args, cfg)
def cuda_relevant(deterministic=False):
    """Configure global CUDA/cuDNN settings before running inference.

    Args:
        deterministic: When True, force deterministic algorithms and
            disable cuDNN autotuning; when False (default), favor speed.
    """
    # Drop any cached GPU allocations left over from earlier work.
    torch.cuda.empty_cache()
    # Allow TF32 math on Ampere-and-newer GPUs for faster matmul/conv.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Trade reproducibility for speed (or vice versa) as requested.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
def build_parser():
    """Build the command-line argument parser for TTS inference.

    Returns:
        argparse.ArgumentParser: Parser covering config/checkpoint paths,
        batch-vs-single synthesis mode, speaker mixing (timbre confusion),
        and prosody controls (pitch/energy/duration).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        help="convert from the source data",
        default=None,
    )
    parser.add_argument(
        "--testing_set",
        type=str,
        help="train, test, golden_test",
        default="test",
    )
    parser.add_argument(
        "--test_list_file",
        type=str,
        help="convert from the test list file",
        default=None,
    )
    parser.add_argument(
        "--speaker_name_1",
        type=str,
        default=None,
        help="first speaker name for multi-speaker synthesis, for single-sentence mode only",
    )
    parser.add_argument(
        "--speaker_name_2",
        type=str,
        default=None,
        help="second speaker name for multi-speaker synthesis, for single-sentence mode only",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.5,
        help="confusion degree, takes values within [0,1]",
    )
    parser.add_argument(
        "--text",
        help="Text to be synthesized.",
        type=str,
        default="",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        default=None,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        default=None,
        help="Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    # Fixed: help text was a copy-paste of --acoustics_dir's and misdescribed
    # this option as the acoustic model checkpoint directory.
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Path of the checkpoint to load for inference. If a directory is "
        "given, search for the latest checkpoint in it. If a specific "
        "checkpoint is given, directly load it.",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["batch", "single"],
        required=True,
        help="Synthesize a whole dataset or a single sentence",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    parser.add_argument(
        "--pitch_control",
        type=float,
        default=1.0,
        help="control the pitch of the whole utterance, larger value for higher pitch",
    )
    parser.add_argument(
        "--energy_control",
        type=float,
        default=1.0,
        help="control the energy of the whole utterance, larger value for larger volume",
    )
    parser.add_argument(
        "--duration_control",
        type=float,
        default=1.0,
        help="control the speed of the whole utterance, larger value for slower speaking rate",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    return parser
def main(args_list=None):
    """Entry point: parse arguments, configure CUDA, and run TTS inference.

    Args:
        args_list: Optional list of command-line tokens. When None, argparse
            falls back to ``sys.argv`` — this fixes the previous crash where
            the ``__main__`` guard called ``main()`` with no argument even
            though ``args_list`` was required.

    Returns:
        The audio produced by the inference model's ``inference()`` call.
    """
    # Parse arguments and load the JSON/YAML configuration.
    args = build_parser().parse_args(args_list)
    cfg = load_config(args.config)
    # Apply global CUDA/cuDNN settings before building the model.
    cuda_relevant()
    # Select and instantiate the inference class for cfg.model_type.
    inferencer = build_inference(args, cfg)
    # Synthesize and return the output audio.
    return inferencer.inference()


if __name__ == "__main__":
    main()