File size: 1,422 Bytes
9893813
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
{
    // Experiment configuration for the "autoencoderkl" model on the AudioCaps dataset.
    // "base_config" points at a parent configuration file.
    // NOTE(review): presumably the keys below override the parent's values; confirm against the config loader.
    "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
    // Datasets used by this experiment (only AudioCaps is listed).
    "dataset": [
      "AudioCaps"
    ],
    "preprocess": {
      // Output root path where the processed data is saved.
      "processed_dir": "data",

      // Generic feature switches; every one of them is disabled in this configuration.
      "use_spkid": false,
      "use_uv": false,
      "use_frame_pitch": false,
      "use_phone_pitch": false,
      "use_frame_energy": false,
      "use_phone_energy": false,
      "use_mel": false,
      "use_audio": false,
      "use_label": false,
      "use_one_hot": false,
      // Text-to-audio feature switches: "use_caption" and "use_melspec" are enabled, "use_wav" is not.
      "use_caption": true,
      "use_melspec": true,
      "use_wav": false,
      // Directory names for the mel-spectrogram ("melspec_dir") and waveform ("wav_dir") features.
      // NOTE(review): presumably resolved relative to "processed_dir"; confirm against the preprocessing code.
      "melspec_dir": "mel",
      "wav_dir": "wav"
    },
    // Output root path where model checkpoints and logs are saved.
    "log_dir": "ckpts/tta",

    "model": {
      // Hyperparameters of the "autoencoderkl" model.
      // NOTE(review): the meaning of each field is not visible in this file; the hedged notes below are
      // inferred from the key names only and must be verified against the model implementation.
      "autoencoderkl": {
        "ch": 128,
        // Four entries; presumably per-level multipliers applied to "ch" (TODO confirm).
        "ch_mult": [1,2,2,4],
        "num_res_blocks": 2,
        // "in_channels" and "out_ch" are both 1; presumably single-channel input and output (TODO confirm).
        "in_channels": 1,
        "z_channels": 4,
        "out_ch": 1,
        // NOTE(review): presumably doubles the latent channels produced by the encoder; confirm.
        "double_z": true
      }
    },
    // Training settings: optimizer, seed, batch size, schedule limits and logging/checkpoint intervals.
    "train": {
      // Optimizer settings (the key name suggests Adam); "lr" is the learning rate.
      "adam": {
        "lr": 4.0e-5
      },
      "ddp": false,
      "random_seed": 12345,
      "batch_size": 12,
      // NOTE(review): "epochs", "max_steps" and "total_training_steps" are all set, and the two step
      // counts differ (1000000 vs 800000). Which limit actually ends training is not visible here; confirm.
      "epochs": 50000,
      "max_steps": 1000000,
      "total_training_steps": 800000,
      // Intervals below are presumably measured in training steps (TODO confirm for "valid_interval").
      "save_summary_steps": 1000,
      "save_checkpoints_steps": 5000,
      "valid_interval": 5000,
      "keep_checkpoint_max": 100
    }
  }