Spaces:

amphion
/

singing_voice_conversion

Running on A10G

App Files Files Community

singing_voice_conversion / config /valle.json

RMSnow

add backend inference and interface output

0883aa1 8 months ago

raw

history blame contribute delete

No virus

2.49 kB

	{
	"base_config": "config/tts.json",
	"model_type": "VALLE",
	"task_type": "tts",
	"dataset": [
	"libritts"
	],
	"preprocess": {
	"extract_phone": true,
	"phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
	"extract_acoustic_token": true,
	"acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
	"acoustic_token_dir": "acoutic_tokens",
	"use_text": false,
	"use_phone": true,
	"use_acoustic_token": true,
	"symbols_dict": "symbols.dict",
	"min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
	"max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration.
	"sampling_rate": 24000,
	},
	"model": {
	"text_token_num": 512,
	"audio_token_num": 1024,
	"decoder_dim": 1024, // embedding dimension of the decoder model
	"nhead": 16, // number of attention heads in the decoder layers
	"num_decoder_layers": 12, // number of decoder layers
	"norm_first": true, // pre or post Normalization.
	"add_prenet": false, // whether add PreNet after Inputs
	"prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
	"share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
	"nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
	"prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
	"num_quantizers": 8, // numbert of the audio quantization layers
	// "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
	},
	"train": {
	"ddp": false,
	"train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
	"max_epoch": 20,
	"optimizer": "ScaledAdam",
	"scheduler": "Eden",
	"warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
	"base_lr": 0.05, // base learning rate."
	"valid_interval": 1000,
	"log_epoch_step": 1000,
	"save_checkpoint_stride": [
	1,
	1
	]
	}
	}