Upload 3 files

Browse files

Files changed (3) hide show

configs/audiolcm.yaml +130 -0
configs/autoencoder1d.yaml +74 -0
configs/teacher.yaml +121 -0

configs/audiolcm.yaml ADDED Viewed

	@@ -0,0 +1,130 @@

+model:
+  base_learning_rate: 3.0e-06
+  target: ldm.models.diffusion.lcm_audio.LCM_audio
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    mel_dim: 20
+    mel_length: 312
+    channels: 0
+    cond_stage_trainable: False
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_by_std: true
+    use_lcm: True
+    num_ddim_timesteps: 50
+    w_min: 4
+    w_max: 12
+    ckpt_path: ./ckpt/maa2.ckpt
+    use_ema: false
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps:
+        - 10000
+        cycle_lengths:
+        - 10000000000000
+        f_start:
+        - 1.0e-06
+        f_max:
+        - 1.0
+        f_min:
+        - 1.0
+    unet_config:
+      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
+      params:
+        in_channels: 20
+        context_dim: 1024
+        hidden_size: 576
+        num_heads: 8
+        depth: 4
+        max_len: 1000
+    first_stage_config:
+      target: ldm.models.autoencoder1d.AutoencoderKL
+      params:
+        embed_dim: 20
+        monitor: val/rec_loss
+        ckpt_path: ./model/AutoencoderKL/epoch=000032.ckpt
+        ddconfig:
+          double_z: true
+          in_channels: 80
+          out_ch: 80
+          z_channels: 20
+          kernel_size: 5
+          ch: 384
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_layers:
+          - 3
+          down_layers:
+          - 0
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
+      params:
+        weights_path: ./model/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
+lightning:
+  callbacks:
+    image_logger:
+      target: main.AudioLogger
+      params:
+        sample_rate: 16000
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: ./vocoder/logs/bigvnat16k93.5w
+  trainer:
+    benchmark: True
+    gradient_clip_val: 1.0
+    replace_sampler_ddp: false
+    max_epochs: 100
+  modelcheckpoint:
+    params:
+      monitor: epoch
+      mode: max
+      # every_n_train_steps: 2000
+      save_top_k: 100
+      every_n_epochs: 3
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 8
+    num_workers: 32
+    spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
+    mel_num: 80
+    train:
+      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
+      params:
+        specs_dataset_cfg:
+    validation:
+      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
+      params:
+        specs_dataset_cfg:
+test_dataset:
+  target: ldm.data.tsvdataset.TSVDatasetStruct
+  params:
+    tsv_path: audiocaps_test_16000_struct.tsv
+    spec_crop_len: 624

configs/autoencoder1d.yaml ADDED Viewed

	@@ -0,0 +1,74 @@

+model:
+  base_learning_rate: 4.5e-06
+  target: ldm.models.autoencoder1d.AutoencoderKL
+  params:
+    embed_dim: 20
+    monitor: val/rec_loss
+    ddconfig:
+      double_z: true
+      in_channels: 80
+      out_ch: 80
+      z_channels: 20
+      kernel_size: 5
+      ch: 384
+      ch_mult:
+      - 1
+      - 2
+      - 4
+      num_res_blocks: 2
+      attn_layers:
+      - 3
+      down_layers:
+      - 0
+      dropout: 0.0
+    lossconfig:
+      target: ldm.modules.losses_audio.contperceptual.LPAPSWithDiscriminator
+      params:
+        disc_start: 80001
+        perceptual_weight: 0.0
+        kl_weight: 1.0e-06
+        disc_weight: 0.5
+        disc_in_channels: 1
+        disc_loss: mse
+        disc_factor: 2
+        disc_conditional: false
+        r1_reg_weight: 3
+lightning:
+  callbacks:
+    image_logger:
+      target: main.AudioLogger
+      params:
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        rescale: false
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: vocoder/logs/bigvnat16k93.5w
+  trainer:
+    sync_batchnorm: false # not working with r1_regularization
+    strategy: ddp
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 4
+    num_workers: 16
+    spec_dir_path: ldm/data/tsv_dirs/full_data/V1_new
+    mel_num: 80
+    spec_len: 624
+    spec_crop_len: 624
+    train:
+      target: ldm.data.joinaudiodataset_624.JoinSpecsTrain
+      params:
+        specs_dataset_cfg: null
+    validation:
+      target: ldm.data.joinaudiodataset_624.JoinSpecsValidation
+      params:
+        specs_dataset_cfg: null

configs/teacher.yaml ADDED Viewed

	@@ -0,0 +1,121 @@

+model:
+  base_learning_rate: 3.0e-06
+  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    mel_dim: 20
+    mel_length: 312
+    channels: 0
+    cond_stage_trainable: True
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_by_std: true
+    use_ema: false
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps:
+        - 10000
+        cycle_lengths:
+        - 10000000000000
+        f_start:
+        - 1.0e-06
+        f_max:
+        - 1.0
+        f_min:
+        - 1.0
+    unet_config:
+      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
+      params:
+        in_channels: 20
+        context_dim: 1024
+        hidden_size: 576
+        num_heads: 8
+        depth: 4
+        max_len: 1000
+    first_stage_config:
+      target: ldm.models.autoencoder1d.AutoencoderKL
+      params:
+        embed_dim: 20
+        monitor: val/rec_loss
+        ckpt_path: logs/trainae/ckpt/epoch=000032.ckpt
+        ddconfig:
+          double_z: true
+          in_channels: 80
+          out_ch: 80
+          z_channels: 20
+          kernel_size: 5
+          ch: 384
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_layers:
+          - 3
+          down_layers:
+          - 0
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
+      params:
+        weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth
+lightning:
+  callbacks:
+    image_logger:
+      target: main.AudioLogger
+      params:
+        sample_rate: 16000
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: vocoder/logs/bigvnat16k93.5w
+  trainer:
+    benchmark: True
+    gradient_clip_val: 1.0
+    replace_sampler_ddp: false
+  modelcheckpoint:
+    params:
+      monitor: epoch
+      mode: max
+      save_top_k: 10
+      every_n_epochs: 5
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 4
+    num_workers: 32
+    main_spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
+    other_spec_dir_path: 'ldm/data/tsv_dirs/full_data/V2'
+    mel_num: 80
+    train:
+      target: ldm.data.joinaudiodataset_struct_sample_anylen.JoinSpecsTrain
+      params:
+        specs_dataset_cfg:
+    validation:
+      target: ldm.data.joinaudiodataset_struct_sample_anylen.JoinSpecsValidation
+      params:
+        specs_dataset_cfg:
+test_dataset:
+  target: ldm.data.tsvdataset.TSVDatasetStruct
+  params:
+    tsv_path: musiccap.tsv
+    spec_crop_len: 624