File size: 2,410 Bytes
9893813
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
{
    "base_config": "egs/tta/audioldm/exp_config_base.json",
    "dataset": [
      "AudioCaps"
    ],
    "preprocess": {
      // Output root directory where the processed data is saved
      "processed_dir": "data",
      // Example "processed_dir" value: "/home/TTADataset/processed_data"

      // Feature flags
      "use_spkid": false,
      "use_uv": false,
      "use_frame_pitch": false,
      "use_phone_pitch": false,
      "use_frame_energy": false,
      "use_phone_energy": false,
      "use_mel": false,
      "use_audio": false,
      "use_label": false,
      "use_one_hot": false,
      // Features for text-to-audio
      "use_caption": true,
      "use_melspec": true,
      "use_wav": false,
      // Feature directories
      "melspec_dir": "mel",
      "wav_dir": "wav"
    },
    // Output root directory where model checkpoints and logs are saved
    "log_dir": "ckpts/tta",
    // Example "log_dir" value: "/home/TTADataset/processed_data/logs"

    // Model configuration
    "model": {
      "audioldm": {
        "image_size": 32,
        "in_channels": 4,
        "out_channels": 4,
        "model_channels": 256,
        "attention_resolutions": [4, 2, 1],
        "num_res_blocks": 2,
        "channel_mult": [1, 2, 4],
        "num_heads": 8,
        "use_spatial_transformer": true,
        "transformer_depth": 1,
        "context_dim": 768,
        "use_checkpoint": true,
        "legacy": false
      },
      "autoencoderkl": {
        "ch": 128,
        "ch_mult": [1,1,2,2,4],
        "num_res_blocks": 2,
        "in_channels": 1,
        "z_channels": 4,
        "out_ch": 1,
        "double_z": true
      },
      "noise_scheduler": {
        "num_train_timesteps": 1000,
        "beta_start": 0.00085,
        "beta_end": 0.012,
        "beta_schedule": "scaled_linear",
        "clip_sample": false,
        "steps_offset": 1,
        "set_alpha_to_one": false,
        "skip_prk_steps": true,
        "prediction_type": "epsilon"
      },
      "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
    },

    // Training configuration
    "train": {
      "adam": {
        "lr": 5.0e-5
      },
      "ddp": false,
      "random_seed": 12345,
      "batch_size": 12,
      "epochs": 50000,
      "max_steps": 1000000,
      "total_training_steps": 800000,
      "save_summary_steps": 1000,
      "save_checkpoints_steps": 5000,
      "valid_interval": 5000,
      "keep_checkpoint_max": 100
    }
  }