Spaces:

amphion
/

Text-to-Speech

Running

Text-to-Speech / config /vits.json

Upload 685 files

0d80816 11 months ago

2.32 kB

	{
	"base_config": "config/tts.json",
	"model_type": "VITS",
	"task_type": "tts",
	"preprocess": {
	"extract_phone": true,
	"extract_mel": true,
	"n_mel": 80,
	"fmin": 0,
	"fmax": null,
	"extract_linear_spec": true,
	"extract_audio": true,
	"use_linear": true,
	"use_mel": true,
	"use_audio": true,
	"use_text": false,
	"use_phone": true,
	"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
	"n_fft": 1024,
	"win_size": 1024,
	"hop_size": 256,
	"segment_size": 8192,
	"text_cleaners": [
	"english_cleaners"
	]
	},
	"model": {
	"text_token_num": 512,
	"inter_channels": 192,
	"hidden_channels": 192,
	"filter_channels": 768,
	"n_heads": 2,
	"n_layers": 6,
	"kernel_size": 3,
	"p_dropout": 0.1,
	"resblock": "1",
	"resblock_kernel_sizes": [
	3,
	7,
	11
	],
	"resblock_dilation_sizes": [
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	]
	],
	"upsample_rates": [
	8,
	8,
	2,
	2
	],
	"upsample_initial_channel": 512,
	"upsample_kernel_sizes": [
	16,
	16,
	4,
	4
	],
	"n_layers_q": 3,
	"use_spectral_norm": false,
	"n_speakers": 10, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true
	"gin_channels": 256,
	"use_sdp": true
	},
	"train": {
	"fp16_run": true,
	"learning_rate": 2e-4,
	"betas": [
	0.8,
	0.99
	],
	"eps": 1e-9,
	"batch_size": 16,
	"lr_decay": 0.999875,
	// "segment_size": 8192,
	"init_lr_ratio": 1,
	"warmup_epochs": 0,
	"c_mel": 45,
	"c_kl": 1.0,
	"AdamW": {
	"betas": [
	0.8,
	0.99
	],
	"eps": 1e-9,
	}
	}
	}