# Source: tortoise-filtering-models / test_noisy_audio_clips_classifier.yml
# Initial commit 55cc85f by jbetker
#### general settings
# Evaluation/inference run for the noisy-audio-clip classifier
# (ExtensibleTrainer-style config — confirm against the consuming trainer).
name: test_noisy_audio_clips_classifier
use_tb_logger: true  # write TensorBoard logs
model: extensibletrainer
# NOTE(review): `distortion`/`scale` look like legacy super-resolution keys;
# presumably ignored by the classifier pipeline — confirm before removing.
distortion: sr
scale: 1
gpu_ids: [0]  # single-GPU run
start_step: 0
checkpointing_enabled: true  # presumably activation checkpointing to save memory — TODO confirm
fp16: false  # run in full fp32 precision
wandb: true  # also log to Weights & Biases
datasets:
  # Single evaluation split: raw, unlabeled audio clips to be classified.
  test:
    name: clips_val
    n_workers: 1
    batch_size: 16
    mode: unsupervised_audio
    # Windows paths are single-quoted so every backslash stays literal and
    # the values survive YAML linting/reformatting unchanged (parse-identical
    # to the previous plain scalars).
    path: ['Z:\split\garbage-2\podcast_dump0_garbage']
    cache_path: 'Z:\split\garbage-2\podcast_dump0_garbage_cache.pth'
    sampling_rate: 22050  # Hz
    do_augmentation: false  # no noise/augmentation at eval time
    pad_to_samples: 65536  # fixed clip length in samples — TODO confirm pad-vs-crop behavior
    extra_samples: 0
networks:
  # The classifier being evaluated; registered under the `generator` slot
  # of the trainer's network registry.
  classifier:
    type: generator
    which_model_G: mini_audio_encoder_classifier
    kwargs:
      classes: 5  # should match the 5 entries of eval.output_labels — confirm ordering
      spec_dim: 80  # presumably mel-spectrogram bins — TODO confirm
      embedding_dim: 1024
      base_channels: 128
      depth: 3
      resnet_blocks: 2
      attn_blocks: 8
      num_attn_heads: 4
      dropout: 0.1  # canonical float form (was `.1`; identical parsed value, lint-clean)
#### path
path:
  # Pretrained weights loaded into the `classifier` network defined above.
  pretrain_model_classifier: noisy_audio_clips_classifier.pth
  strict_load: true  # presumably fails on missing/unexpected checkpoint keys — confirm
  #resume_state: ../experiments/train_noisy_audio_clips_classifier/training_state/51000.state
steps:
  classifier:
    training: classifier  # network (from `networks:`) this step optimizes
    optimizer: adamw
    optimizer_params:
      lr: !!float 3e-4  # explicit !!float tag — bare `3e-4` resolves to a string under YAML 1.1 loaders
      weight_decay: !!float 1e-5
      beta1: 0.9
      beta2: 0.9999
    clip_grad_eps: 1.0  # gradient clipping threshold — TODO confirm norm- vs value-clipping
    # Data-flow pipeline: each injector reads `in` from trainer state and
    # writes `out` back into it; later injectors consume earlier outputs.
    injectors:
      to_mel:
        type: mel_spectrogram
        in: clip  # raw audio produced by the dataset
        out: actual_mel
      pad:
        type: pad
        multiple: 16  # presumably matches the encoder's downsampling factor — confirm
        in: actual_mel
        out: inp_mel
      gen_inj_train:
        type: generator
        generator: classifier  # forward pass through the classifier network
        in: inp_mel
        out: logits
    losses:
      classification_loss:
        type: crossentropy
        weight: 1.0
        logits: logits  # state key holding predicted logits
        labels: label  # state key holding ground-truth class ids
train:
  niter: 500000  # total training iterations
  warmup_iter: -1  # -1 presumably disables LR warmup — confirm
  mega_batch_factor: 1  # no gradient accumulation
  val_freq: 2000  # validate every 2000 steps

  # Default LR scheduler options
  default_lr_scheme: MultiStepLR
  gen_lr_steps: [ 20000, 40000, 60000 ]  # LR decay milestones (iterations)
  lr_gamma: 0.2  # LR multiplier applied at each milestone
eval:
  path_key: path  # state key giving each clip's source file path
  classifier_logits_key: logits  # state key to read predictions from
  output_dir: D:\tmp\podcasts_split
  # Derived from audio_with_noise_dataset
  output_labels: [fine, env_noise, music, two_voices, reverb]  # index -> class name; count must match networks.classifier.kwargs.classes
logger:
  print_freq: 30  # console log every 30 steps
  save_checkpoint_freq: 1000  # checkpoint every 1000 steps
  visuals: []  # no image visuals configured
  is_mel_spectrogram: true  # render any visuals as mel spectrograms (list above is empty)
  visual_debug_rate: 500