Eddycrack864 committed on
Commit
4a3768d
1 Parent(s): ce7683c

Upload 14 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model_bandit_plus_dnr_sdr_11.47.chpt filter=lfs diff=lfs merge=lfs -text
config_dnr_bandit_bsrnn_multi_mus64.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "MultiMaskMultiSourceBandSplitRNN"
2
+ audio:
3
+ chunk_size: 264600
4
+ num_channels: 2
5
+ sample_rate: 44100
6
+ min_mean_abs: 0.001
7
+
8
+ model:
9
+ in_channel: 1
10
+ stems: ['speech', 'music', 'effects']
11
+ band_specs: "musical"
12
+ n_bands: 64
13
+ fs: 44100
14
+ require_no_overlap: false
15
+ require_no_gap: true
16
+ normalize_channel_independently: false
17
+ treat_channel_as_feature: true
18
+ n_sqm_modules: 8
19
+ emb_dim: 128
20
+ rnn_dim: 256
21
+ bidirectional: true
22
+ rnn_type: "GRU"
23
+ mlp_dim: 512
24
+ hidden_activation: "Tanh"
25
+ hidden_activation_kwargs: null
26
+ complex_mask: true
27
+ n_fft: 2048
28
+ win_length: 2048
29
+ hop_length: 512
30
+ window_fn: "hann_window"
31
+ wkwargs: null
32
+ power: null
33
+ center: true
34
+ normalized: true
35
+ pad_mode: "constant"
36
+ onesided: true
37
+
38
+ training:
39
+ batch_size: 4
40
+ gradient_accumulation_steps: 4
41
+ grad_clip: 0
42
+ instruments:
43
+ - speech
44
+ - music
45
+ - effects
46
+ lr: 9.0e-05
47
+ patience: 2
48
+ reduce_factor: 0.95
49
+ target_instrument: null
50
+ num_epochs: 1000
51
+ num_steps: 1000
52
+ augmentation: false # enable augmentations by audiomentations and pedalboard
53
+ augmentation_type: simple1
54
+ use_mp3_compress: false # Deprecated
55
+ augmentation_mix: true # Mix several stems of the same type with some probability
56
+ augmentation_loudness: true # randomly change loudness of each stem
57
+ augmentation_loudness_type: 1 # Type 1 or 2
58
+ augmentation_loudness_min: 0.5
59
+ augmentation_loudness_max: 1.5
60
+ q: 0.95
61
+ coarse_loss_clip: true
62
+ ema_momentum: 0.999
63
+ optimizer: adam
64
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
65
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
66
+
67
+ inference:
68
+ batch_size: 1
69
+ dim_t: 256
70
+ num_overlap: 4
config_vocals_segm_models.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 8
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 0
22
+ instruments:
23
+ - vocals
24
+ - other
25
+ lr: 5.0e-05
26
+ patience: 2
27
+ reduce_factor: 0.95
28
+ target_instrument: null
29
+ num_epochs: 1000
30
+ num_steps: 2000
31
+ augmentation: false # enable augmentations by audiomentations and pedalboard
32
+ augmentation_type: simple1
33
+ use_mp3_compress: false # Deprecated
34
+ augmentation_mix: true # Mix several stems of the same type with some probability
35
+ augmentation_loudness: true # randomly change loudness of each stem
36
+ augmentation_loudness_type: 1 # Type 1 or 2
37
+ augmentation_loudness_min: 0.5
38
+ augmentation_loudness_max: 1.5
39
+ q: 0.95
40
+ coarse_loss_clip: true
41
+ ema_momentum: 0.999
42
+ optimizer: adamw
43
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
44
+
45
+ inference:
46
+ batch_size: 1
47
+ dim_t: 512
48
+ num_overlap: 4
mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca8799531fe51c94172cc047226209ed48bf7d8c02e04671795a15d2a1c318af
3
+ size 913096801
model_bandit_plus_dnr_sdr_11.47.chpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c48284779f7d1258a6527d3aaa18a532d45c1f506e2dcc25d5ab179a8c5e2573
3
+ size 148891175
model_bs_roformer_ep_317_sdr_12.9755.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b84f37e8d444c8cb30c79d77f613a41c05868ff9c9ac6c7049c00aefae115aa
3
+ size 639331213
model_bs_roformer_ep_317_sdr_12.9755.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # doesn't work here (used in the model)
5
+ hop_length: 441 # doesn't work here (used in the model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+
102
+ training:
103
+ batch_size: 16
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - Vocals
108
+ - Instrumental
109
+ lr: 5.0e-05
110
+ patience: 2
111
+ reduce_factor: 0.95
112
+ target_instrument: Vocals
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ augmentation: false # enable augmentations by audiomentations and pedalboard
116
+ augmentation_type: simple1
117
+ use_mp3_compress: false # Deprecated
118
+ augmentation_mix: true # Mix several stems of the same type with some probability
119
+ augmentation_loudness: true # randomly change loudness of each stem
120
+ augmentation_loudness_type: 1 # Type 1 or 2
121
+ augmentation_loudness_min: 0.5
122
+ augmentation_loudness_max: 1.5
123
+ q: 0.95
124
+ coarse_loss_clip: true
125
+ ema_momentum: 0.999
126
+ optimizer: adam
127
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
128
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
129
+
130
+ inference:
131
+ batch_size: 1
132
+ dim_t: 801
133
+ num_overlap: 4
model_bs_roformer_ep_368_sdr_12.9628.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c94864adfb73bbb0ca58ec14d58dd0b364549e9fb61433ae51916f3e2f8d0b
3
+ size 639317465
model_bs_roformer_ep_368_sdr_12.9628.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # doesn't work here (used in the model)
5
+ hop_length: 441 # doesn't work here (used in the model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+
102
+ training:
103
+ batch_size: 16
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - Vocals
108
+ - Instrumental
109
+ lr: 5.0e-05
110
+ patience: 2
111
+ reduce_factor: 0.95
112
+ target_instrument: Vocals
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ augmentation: false # enable augmentations by audiomentations and pedalboard
116
+ augmentation_type: simple1
117
+ use_mp3_compress: false # Deprecated
118
+ augmentation_mix: true # Mix several stems of the same type with some probability
119
+ augmentation_loudness: true # randomly change loudness of each stem
120
+ augmentation_loudness_type: 1 # Type 1 or 2
121
+ augmentation_loudness_min: 0.5
122
+ augmentation_loudness_max: 1.5
123
+ q: 0.95
124
+ coarse_loss_clip: true
125
+ ema_momentum: 0.999
126
+ optimizer: adam
127
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
128
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
129
+
130
+ inference:
131
+ batch_size: 1
132
+ dim_t: 801
133
+ num_overlap: 4
model_bs_roformer_ep_937_sdr_10.5309.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e825a03bc908cb04dbd88eddeefbf5147dd1cf1f95cebf453d9dbfabec494b
3
+ size 393068365
model_bs_roformer_ep_937_sdr_10.5309.yaml ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 4
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - Vocals
109
+ - Instrumental
110
+ lr: 5.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: Vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ augmentations:
124
+ enable: true # enable or disable all augmentations (to quickly turn them off if needed)
125
+ loudness: true # randomly change loudness of each stem in the range (loudness_min; loudness_max)
126
+ loudness_min: 0.5
127
+ loudness_max: 1.5
128
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
+ - 0.2
131
+ - 0.02
132
+ mixup_loudness_min: 0.5
133
+ mixup_loudness_max: 1.5
134
+
135
+ inference:
136
+ batch_size: 1
137
+ dim_t: 512
138
+ num_overlap: 4
model_mel_band_roformer_crowd.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: True
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: False
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: False
40
+
41
+ training:
42
+ batch_size: 2
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - crowd
47
+ - other
48
+ lr: 1.0e-05
49
+ patience: 8
50
+ reduce_factor: 0.95
51
+ target_instrument: crowd
52
+ num_epochs: 1000
53
+ num_steps: 4032
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type: null
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: false
64
+ ema_momentum: 0.999
65
+ optimizer: adam
66
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
+
68
+ inference:
69
+ batch_size: 1
70
+ dim_t: 256
71
+ num_overlap: 4
model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21b9d0958e35b8ebfbe2afe69bbd5444e5ffe2f5d80ae0d583b833d2f3c0d139
3
+ size 1007816988
model_mel_band_roformer_ep_3005_sdr_11.4360.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # doesn't work here (used in the model)
5
+ hop_length: 441 # doesn't work here (used in the model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0.1
22
+ ff_dropout: 0.1
23
+ flash_attn: True
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: False
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: False
40
+
41
+ training:
42
+ batch_size: 9
43
+ gradient_accumulation_steps: 8
44
+ grad_clip: 0
45
+ instruments:
46
+ - Vocals
47
+ - Instrumental
48
+ lr: 4.0e-05
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: Vocals
52
+ num_epochs: 1000
53
+ num_steps: 1000
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type: simple1
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: true # Mix several stems of the same type with some probability
58
+ augmentation_loudness: true # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0.5
61
+ augmentation_loudness_max: 1.5
62
+ q: 0.95
63
+ coarse_loss_clip: true
64
+ ema_momentum: 0.999
65
+ optimizer: adam
66
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ inference:
70
+ batch_size: 1
71
+ dim_t: 801
72
+ num_overlap: 4
model_vocals_segm_models_sdr_9.77.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cb6e969309f96602318fcf5970a6973899db86e5fd9d8f9cf8f15bacdd299bb
3
+ size 863683537