File size: 1,938 Bytes
9958d8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
{
  "seed": 1,
  "precision": "float16",
  "num_workers": 8,
  "resume": false,
  "tb_write_every_n_steps": 100,
  "print_every_n_steps": 400,
  "val_every_n_steps": 1600,
  "lr": 1e-05,
  "batch_size": 100,
  "weight_decay": 0.0,
  "warmup_fraction": 0.1,
  "num_epochs": 10,
  "num_steps": 500000,
  "gradient_accumulation_steps": 24,
  "gradient_clip_val": 1.0,
  "early_stop_step": 3200,
  "early_stop_threshold": -1.0,
  "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
  "dataset": "gigaspeech",
  "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
  "pseudo_epoch_size": 3000,
  "phn_folder_name": "phonemes",
  "encodec_folder_name": "encodec_16khz_4codebooks",
  "manifest_name": "manifest_large16khz_lessambi",
  "pad_x": 0,
  "max_num_tokens": 20000,
  "val_max_num_tokens": 6000,
  "num_buckets": 10,
  "dynamic_batching": 1,
  "audio_max_length": 16.0,
  "audio_min_length": 1.0,
  "text_max_length": 400,
  "text_min_length": 10.0,
  "encodec_sr": 50,
  "mask_len_min": 1,
  "mask_len_max": 600,
  "drop_long": 1,
  "eos": 2051,
  "reduced_eog": 1,
  "special_first": 0,
  "n_special": 4,
  "codebook_weight": "[2,1,1,1]",
  "empty_token": 2048,
  "optimizer_name": "AdamW",
  "reduce_lr_start_step": 3000,
  "reduce_lr_start_epoch": 4,
  "clipping_update_period": 1000,
  "max_mask_portion": 0.9,
  "max_n_spans": 3,
  "shuffle_mask_embedding": 0,
  "mask_sample_dist": "poisson1",
  "min_gap": 5,
  "n_codebooks": 4,
  "text_vocab_size": 120,
  "text_pad_token": 120,
  "audio_vocab_size": 2048,
  "eog": 2049,
  "audio_pad_token": 2050,
  "d_model": 1024,
  "audio_embedding_dim": 1024,
  "text_embedding_dropout": 0.0,
  "audio_embedding_dropout": 0.0,
  "text_positional_embedding_dropout": 0.0,
  "audio_positional_embedding_dropout": 0.0,
  "trm_dropout": 0.0,
  "nhead": 16,
  "num_decoder_layers": 24,
  "load_model_from": "./pretrained_models/giga330M.pth"
}