jbetker committed on
Commit
55cc85f
1 Parent(s): 05a53d0

Initial commit

Browse files
noisy_audio_clips_classifier.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6beb3ef287b2944f302be39821d13ca638f4a43278139707fea8e0a594f1b03
3
+ size 169130277
test_noisy_audio_clips_classifier.yml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### general settings
2
+ name: test_noisy_audio_clips_classifier
3
+ use_tb_logger: true
4
+ model: extensibletrainer
5
+ distortion: sr
6
+ scale: 1
7
+ gpu_ids: [0]
8
+ start_step: 0
9
+ checkpointing_enabled: true
10
+ fp16: false
11
+ wandb: true
12
+
13
+ datasets:
14
+ test:
15
+ name: clips_val
16
+ n_workers: 1
17
+ batch_size: 16
18
+ mode: unsupervised_audio
19
+ path: [Z:\split\garbage-2\podcast_dump0_garbage]
20
+ cache_path: Z:\split\garbage-2\podcast_dump0_garbage_cache.pth
21
+ sampling_rate: 22050
22
+ do_augmentation: false
23
+ pad_to_samples: 65536
24
+ extra_samples: 0
25
+
26
+ networks:
27
+ classifier:
28
+ type: generator
29
+ which_model_G: mini_audio_encoder_classifier
30
+ kwargs:
31
+ classes: 5
32
+ spec_dim: 80
33
+ embedding_dim: 1024
34
+ base_channels: 128
35
+ depth: 3
36
+ resnet_blocks: 2
37
+ attn_blocks: 8
38
+ num_attn_heads: 4
39
+ dropout: .1
40
+
41
+ #### path
42
+ path:
43
+ pretrain_model_classifier: noisy_audio_clips_classifier.pth
44
+ strict_load: true
45
+ #resume_state: ../experiments/train_noisy_audio_clips_classifier/training_state/51000.state
46
+
47
+ steps:
48
+ classifier:
49
+ training: classifier
50
+
51
+ optimizer: adamw
52
+ optimizer_params:
53
+ lr: !!float 3e-4
54
+ weight_decay: !!float 1e-5
55
+ beta1: 0.9
56
+ beta2: 0.9999
57
+
58
+ clip_grad_eps: 1.0
59
+
60
+ injectors:
61
+ to_mel:
62
+ type: mel_spectrogram
63
+ in: clip
64
+ out: actual_mel
65
+ pad:
66
+ type: pad
67
+ multiple: 16
68
+ in: actual_mel
69
+ out: inp_mel
70
+ gen_inj_train:
71
+ type: generator
72
+ generator: classifier
73
+ in: inp_mel
74
+ out: logits
75
+ losses:
76
+ classification_loss:
77
+ type: crossentropy
78
+ weight: 1.0
79
+ logits: logits
80
+ labels: label
81
+
82
+ train:
83
+ niter: 500000
84
+ warmup_iter: -1
85
+ mega_batch_factor: 1
86
+ val_freq: 2000
87
+
88
+ # Default LR scheduler options
89
+ default_lr_scheme: MultiStepLR
90
+ gen_lr_steps: [ 20000, 40000, 60000 ]
91
+ lr_gamma: 0.2
92
+
93
+ eval:
94
+ path_key: path
95
+ classifier_logits_key: logits
96
+ output_dir: D:\tmp\podcasts_split
97
+ # Derived from audio_with_noise_dataset
98
+ output_labels: [fine, env_noise, music, two_voices, reverb]
99
+
100
+ logger:
101
+ print_freq: 30
102
+ save_checkpoint_freq: 1000
103
+ visuals: []
104
+ is_mel_spectrogram: true
105
+ visual_debug_rate: 500
train_voice_voice_clip.yml ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### general settings
2
+ name: train_voice_voice_clip
3
+ use_tb_logger: true
4
+ gpu_ids: [0]
5
+ start_step: 0
6
+ fp16: false
7
+ checkpointing_enabled: true
8
+ wandb: false
9
+
10
+ datasets:
11
+ train:
12
+ name: clips
13
+ n_workers: 4
14
+ batch_size: 512
15
+ mode: unsupervised_audio
16
+ path: [/y/clips,
17
+ /y/bigasr_dataset/libritts/train-clean-100, /y/bigasr_dataset/libritts/train-clean-360,
18
+ /y/bigasr_dataset/libritts/train-other-500, /y/bigasr_dataset/ljspeech/wavs]
19
+ exclusions: [/y/clips/books1-hifreq.txt, /y/clips/podcasts-0-hifreq.txt,
20
+ /y/clips/books2-hifreq.txt, /y/bigasr_dataset/libritts-hifreq.txt]
21
+ cache_path: /y/clips-cache-hifreq.pth
22
+ sampling_rate: 22050
23
+ do_augmentation: false
24
+ pad_to_samples: 80000
25
+ resample_clip: false
26
+ min_length: 40000
27
+ debug_loading_failures: false
28
+ val:
29
+ name: clips_val
30
+ n_workers: 1
31
+ batch_size: 512
32
+ mode: unsupervised_audio
33
+ path: [/h/libritts/test-clean]
34
+ cache_path: /h/libritts/test-clean/cache.pth
35
+ sampling_rate: 22050
36
+ do_augmentation: false
37
+ pad_to_samples: 80000
38
+ resample_clip: false
39
+ min_length: 40000
40
+ debug_loading_failures: false
41
+
42
+ networks:
43
+ clip:
44
+ type: generator
45
+ which_model_G: voice_to_voice_clip
46
+ kwargs:
47
+ encoder_output: 512
48
+
49
+ #### path
50
+ path:
51
+ strict_load: true
52
+ #resume_state: ../experiments/train_voice_voice_clip/training_state/56000.state
53
+ pretrain_model_clip: voice_voice_clip.pth
54
+
55
+ steps:
56
+ clip_train:
57
+ training: clip
58
+ loss_log_buffer: 250
59
+
60
+ # Generally follows the recipe from the DALL-E paper.
61
+ optimizer: adamw
62
+ optimizer_params:
63
+ lr: !!float 1e-4
64
+ weight_decay: 0
65
+ beta1: 0.9
66
+ beta2: 0.99
67
+ clip_grad_eps: 4 # TODO: remove clipping after warmup steps.
68
+
69
+ injectors:
70
+ # Speech only
71
+ speech_to_mel:
72
+ type: torch_mel_spectrogram
73
+ mel_norm_file: ../experiments/clips_mel_norms.pth
74
+ in: clip
75
+ out: speech_mel
76
+ forward:
77
+ type: generator
78
+ generator: clip
79
+ in: [speech_mel, clip_lengths]
80
+ out: clip_loss
81
+ losses:
82
+ clip_loss_ce:
83
+ type: direct
84
+ weight: 1
85
+ key: clip_loss
86
+
87
+
88
+ train:
89
+ niter: 500000
90
+ warmup_iter: -1
91
+ mega_batch_factor: 1
92
+ ema_rate: .999
93
+ val_freq: 500
94
+
95
+ default_lr_scheme: MultiStepLR
96
+ gen_lr_steps: [ 20000, 40000, 60000 ]
97
+ lr_gamma: 0.2
98
+ warmup_steps: 1000
99
+ #force_lr: !!float 4e-5
100
+
101
+ eval:
102
+ pure: true
103
+
104
+ logger:
105
+ print_freq: 10
106
+ save_checkpoint_freq: 500
107
+ visuals: []
108
+ is_mel_spectrogram: true
109
+ visual_debug_rate: 100
voice_voice_clip.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19792b3f75db1d8ad2c2553bbe82074396fd74d020266f30a45364c540b20b72
3
+ size 26307225