{ "args": null, "audio_embedding_dim": 2048, "audio_embedding_dropout": 0.0, "audio_max_length": 20.0, "audio_min_length": 2.0, "audio_pad_token": 2050, "audio_positional_embedding_dropout": 0.0, "audio_vocab_size": "2048", "batch_size": 100, "clipping_update_period": 1000, "codebook_weight": "[2,1,1,1]", "d_model": 2048, "dataset": "gigaspeech", "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl", "drop_long": 1, "dynamic_batching": 1, "early_stop_step": 3200, "early_stop_threshold": -1.0, "empty_token": 2048, "encodec_folder_name": "encodec_16khz_4codebooks", "encodec_sr": 50, "eog": 2049, "eos": 2051, "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_830M", "gradient_accumulation_steps": 24, "gradient_clip_val": 1.0, "load_model_from": null, "lr": 1e-05, "manifest_name": "manifest_large16khz_lessambi", "mask_len_max": 600, "mask_len_min": 1, "mask_sample_dist": "poisson1", "max_mask_portion": 0.9, "max_n_spans": 3, "max_num_tokens": 50000, "min_gap": 5, "n_codebooks": 4, "n_special": 4, "nhead": 16, "num_buckets": 10, "num_decoder_layers": 16, "num_epochs": 10, "num_steps": 500000, "num_workers": 8, "optimizer_name": "AdamW", "pad_x": 0, "phn2num": { "!": 17, "\"": 97, ",": 64, ".": 77, "1": 80, ":": 93, ";": 81, "<MUSIC>": 39, "<NOISE>": 52, "<OTHER>": 60, "<SIL>": 53, "?": 78, "_": 15, "a\u026a": 48, "a\u026a\u0259": 56, "a\u026a\u025a": 2, "a\u028a": 36, "b": 20, "d": 72, "d\u0292": 57, "e": 85, "e\u026a": 6, "f": 69, "h": 14, "i": 27, "i\u0259": 42, "i\u02d0": 68, "i\u02d0\u02d0": 51, "j": 67, "k": 41, "kh": 84, "l": 63, "m": 9, "n": 23, "n\u02b2": 8, "o": 86, "o\u028a": 25, "o\u02d0": 74, "o\u02d0\u0279": 40, "p": 34, "q": 96, "r": 79, "s": 66, "t": 73, "t\u0255": 87, "t\u0283": 75, "t\u02b0": 94, "u": 1, "u\u02d0": 47, "v": 31, "w": 19, "x": 4, "z": 22, "\u00a1": 98, "\u00ab": 88, "\u00bb": 89, "\u00bf": 95, "\u00e6": 32, "\u00e6\u00e6": 50, "\u00e7": 10, "\u00f0": 7, "\u014b": 58, "\u0250": 70, "\u0250\u0250": 71, "\u0251": 61, 
"\u0251\u02d0": 0, "\u0251\u02d0\u0279": 44, "\u0252": 83, "\u0254": 3, "\u0254\u026a": 13, "\u0254\u02d0": 29, "\u0254\u02d0\u0279": 33, "\u0259": 54, "\u0259l": 16, "\u0259\u028a": 90, "\u025a": 35, "\u025b": 18, "\u025b\u0279": 11, "\u025b\u02d0": 82, "\u025c\u02d0": 21, "\u0261": 49, "\u0261\u02b2": 37, "\u026a": 65, "\u026a\u0279": 76, "\u026a\u02d0": 100, "\u026c": 46, "\u026f": 91, "\u0279": 5, "\u027e": 24, "\u0283": 26, "\u028a": 43, "\u028a\u0279": 28, "\u028c": 38, "\u0292": 55, "\u0294": 59, "\u0303": 45, "\u0329": 12, "\u03b8": 30, "\u1d7b": 62, "\u2014": 99, "\u2026": 92 }, "phn_folder_name": "phonemes", "precision": "float16", "print_every_n_steps": 800, "pseudo_epoch_size": 3000, "reduce_lr_start_epoch": 4, "reduce_lr_start_step": 3000, "reduced_eog": 1, "resume": false, "seed": 1, "shuffle_mask_embedding": 0, "special_first": 0, "tb_write_every_n_steps": 100, "text_embedding_dropout": 0.0, "text_max_length": 400, "text_min_length": 10.0, "text_pad_token": 120, "text_positional_embedding_dropout": 0.0, "text_vocab_size": 120, "trm_dropout": 0.0, "val_every_n_steps": 3200, "val_max_num_tokens": 6000, "warmup_fraction": 0.1, "weight_decay": 0.0 }