{
  "audio_embedding_dim": 1024,
  "audio_embedding_dropout": 0.0,
  "audio_max_length": 16.0,
  "audio_min_length": 1.0,
  "audio_pad_token": 2050,
  "audio_positional_embedding_dropout": 0.0,
  "audio_vocab_size": 2048,
  "batch_size": 100,
  "clipping_update_period": 1000,
  "codebook_weight": "[2,1,1,1]",
  "d_model": 1024,
  "dataset": "gigaspeech",
  "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
  "drop_long": 1,
  "dynamic_batching": 1,
  "early_stop_step": 3200,
  "early_stop_threshold": -1.0,
  "empty_token": 2048,
  "encodec_folder_name": "encodec_16khz_4codebooks",
  "encodec_sr": 50,
  "eog": 2049,
  "eos": 2051,
  "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
  "gradient_accumulation_steps": 24,
  "gradient_clip_val": 1.0,
  "load_model_from": "./pretrained_models/giga330M.pth",
  "lr": 1e-05,
  "manifest_name": "manifest_large16khz_lessambi",
  "mask_len_max": 600,
  "mask_len_min": 1,
  "mask_sample_dist": "poisson1",
  "max_mask_portion": 0.9,
  "max_n_spans": 3,
  "max_num_tokens": 20000,
  "min_gap": 5,
  "n_codebooks": 4,
  "n_special": 4,
  "nhead": 16,
  "num_buckets": 10,
  "num_decoder_layers": 24,
  "num_epochs": 10,
  "num_steps": 500000,
  "num_workers": 8,
  "optimizer_name": "AdamW",
  "pad_x": 0,
  "phn2num": {
    "!": 17,
    "\"": 97,
    ",": 64,
    ".": 77,
    "1": 80,
    ":": 93,
    ";": 81,
    "<MUSIC>": 39,
    "<NOISE>": 52,
    "<OTHER>": 60,
    "<SIL>": 53,
    "?": 78,
    "_": 15,
    "a\u026a": 48,
    "a\u026a\u0259": 56,
    "a\u026a\u025a": 2,
    "a\u028a": 36,
    "b": 20,
    "d": 72,
    "d\u0292": 57,
    "e": 85,
    "e\u026a": 6,
    "f": 69,
    "h": 14,
    "i": 27,
    "i\u0259": 42,
    "i\u02d0": 68,
    "i\u02d0\u02d0": 51,
    "j": 67,
    "k": 41,
    "kh": 84,
    "l": 63,
    "m": 9,
    "n": 23,
    "n\u02b2": 8,
    "o": 86,
    "o\u028a": 25,
    "o\u02d0": 74,
    "o\u02d0\u0279": 40,
    "p": 34,
    "q": 96,
    "r": 79,
    "s": 66,
    "t": 73,
    "t\u0255": 87,
    "t\u0283": 75,
    "t\u02b0": 94,
    "u": 1,
    "u\u02d0": 47,
    "v": 31,
    "w": 19,
    "x": 4,
    "z": 22,
    "\u00a1": 98,
    "\u00ab": 88,
    "\u00bb": 89,
    "\u00bf": 95,
    "\u00e6": 32,
    "\u00e6\u00e6": 50,
    "\u00e7": 10,
    "\u00f0": 7,
    "\u014b": 58,
    "\u0250": 70,
    "\u0250\u0250": 71,
    "\u0251": 61,
    "\u0251\u02d0": 0,
    "\u0251\u02d0\u0279": 44,
    "\u0252": 83,
    "\u0254": 3,
    "\u0254\u026a": 13,
    "\u0254\u02d0": 29,
    "\u0254\u02d0\u0279": 33,
    "\u0259": 54,
    "\u0259l": 16,
    "\u0259\u028a": 90,
    "\u025a": 35,
    "\u025b": 18,
    "\u025b\u0279": 11,
    "\u025b\u02d0": 82,
    "\u025c\u02d0": 21,
    "\u0261": 49,
    "\u0261\u02b2": 37,
    "\u026a": 65,
    "\u026a\u0279": 76,
    "\u026a\u02d0": 100,
    "\u026c": 46,
    "\u026f": 91,
    "\u0279": 5,
    "\u027e": 24,
    "\u0283": 26,
    "\u028a": 43,
    "\u028a\u0279": 28,
    "\u028c": 38,
    "\u0292": 55,
    "\u0294": 59,
    "\u0303": 45,
    "\u0329": 12,
    "\u03b8": 30,
    "\u1d7b": 62,
    "\u2014": 99,
    "\u2026": 92
  },
  "phn_folder_name": "phonemes",
  "precision": "float16",
  "print_every_n_steps": 400,
  "pseudo_epoch_size": 3000,
  "reduce_lr_start_epoch": 4,
  "reduce_lr_start_step": 3000,
  "reduced_eog": 1,
  "resume": false,
  "seed": 1,
  "shuffle_mask_embedding": 0,
  "special_first": 0,
  "tb_write_every_n_steps": 100,
  "text_embedding_dropout": 0.0,
  "text_max_length": 400,
  "text_min_length": 10.0,
  "text_pad_token": 120,
  "text_positional_embedding_dropout": 0.0,
  "text_vocab_size": 120,
  "trm_dropout": 0.0,
  "val_every_n_steps": 1600,
  "val_max_num_tokens": 6000,
  "warmup_fraction": 0.1,
  "weight_decay": 0.0
}