swdq
/

Music-Source-Separation-Training_models

Model card Files Files and versions Community

swdq committed 26 days ago

Commit

f55242c

•

1 Parent(s): 02afbd6

Upload 11 files

Browse files

Files changed (12) hide show

.gitattributes +1 -0
data/241030_083041.wav +0 -0
data/241030_083112.wav +0 -0
data/BS Roformer/config.yaml +3 -0
data/BS Roformer/model/model_bs_roformer_ep_937_sdr_10.5309.ckpt +3 -0
data/BS Roformer/model/model_bs_roformer_ep_937_sdr_10.5309.yaml +138 -0
data/HTDemucs4 FT Vocals/config.yaml +3 -0
data/HTDemucs4 FT Vocals/model/04573f0d-f3cf25b2.th +3 -0
data/HTDemucs4 FT Vocals/model/config_musdb18_htdemucs.yaml +119 -0
data/MelBand Roformer (anvuew edition)/config.yaml +3 -0
data/MelBand Roformer (anvuew edition)/model/dereverb_mel_band_roformer_anvuew.yaml +76 -0
data/MelBand Roformer (anvuew edition)/model/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/HTDemucs4[[:space:]]FT[[:space:]]Vocals/model/04573f0d-f3cf25b2.th filter=lfs diff=lfs merge=lfs -text

data/241030_083041.wav ADDED Viewed

Binary file (269 kB). View file

data/241030_083112.wav ADDED Viewed

Binary file (372 kB). View file

data/BS Roformer/config.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Selects the model type and points at the model's YAML config and checkpoint.
+# NOTE(review): both paths are absolute Windows paths under
+# C:\Users\user\Downloads, so they only resolve on a machine with that exact
+# layout - confirm whether the consumer accepts repository-relative paths.
+model_type: 'bs_roformer'
+config_path: 'C:\Users\user\Downloads\Music-Source-Separation-Training\data\BS Roformer\model\model_bs_roformer_ep_937_sdr_10.5309.yaml'
+start_checkpoint: 'C:\Users\user\Downloads\Music-Source-Separation-Training\data\BS Roformer\model\model_bs_roformer_ep_937_sdr_10.5309.ckpt'

data/BS Roformer/model/model_bs_roformer_ep_937_sdr_10.5309.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2e825a03bc908cb04dbd88eddeefbf5147dd1cf1f95cebf453d9dbfabec494b
+size 393068365

data/BS Roformer/model/model_bs_roformer_ep_937_sdr_10.5309.yaml ADDED Viewed

	@@ -0,0 +1,138 @@

+# Audio chunking / STFT parameters.
+# NOTE(review): chunk_size equals hop_length * (dim_t + 1) = 512 * 257;
+# confirm that relationship is intended before changing any of the three.
+audio:
+  chunk_size: 131584
+  dim_f: 1024
+  dim_t: 256
+  hop_length: 512
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.001
+# Model hyper-parameters.
+# The !!python/tuple tags are PyYAML-specific: safe loaders reject them, so this
+# file must be read with a loader that supports Python tags.
+model:
+  dim: 384
+  depth: 12
+  stereo: true
+  num_stems: 1
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  linear_transformer_depth: 0
+  # 62 band widths; they sum to 1025, which matches dim_freqs_in below.
+  freqs_per_bands: !!python/tuple
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 128
+    - 129
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0.1
+  ff_dropout: 0.1
+  flash_attn: true
+  # Equals stft_n_fft / 2 + 1 (2048 / 2 + 1).
+  dim_freqs_in: 1025
+  stft_n_fft: 2048
+  stft_hop_length: 512
+  stft_win_length: 2048
+  stft_normalized: false
+  mask_estimator_depth: 2
+  multi_stft_resolution_loss_weight: 1.0
+  # Sequence indented like freqs_per_bands above so the section uses one style.
+  multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+  multi_stft_hop_size: 147
+  # Canonical lowercase boolean, consistent with the other flags in this section.
+  multi_stft_normalized: false
+# Training settings. target_instrument ("other") is one of the two entries
+# listed under instruments.
+training:
+  batch_size: 4
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments:
+  - vocals
+  - other
+  lr: 5.0e-05
+  patience: 2
+  reduce_factor: 0.95
+  target_instrument: other
+  num_epochs: 1000
+  num_steps: 1000
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  optimizer: adam
+  other_fix: false # needed when checking on the multisong dataset whether "other" is actually the instrumental
+  use_amp: true # enable or disable mixed precision (float16) - usually this must be true
+# Training-time augmentation switches and their ranges.
+augmentations:
+  enable: true # master switch for all augmentations (quick way to disable them)
+  loudness: true # randomly change the loudness of each stem within (loudness_min; loudness_max)
+  loudness_min: 0.5
+  loudness_max: 1.5
+  mixup: true # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
+  mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+    - 0.2
+    - 0.02
+  mixup_loudness_min: 0.5
+  mixup_loudness_max: 1.5
+# Inference settings.
+# NOTE(review): dim_t here (512) differs from audio.dim_t (256) - confirm
+# which one the inference code actually reads.
+inference:
+  batch_size: 8
+  dim_t: 512
+  num_overlap: 2

data/HTDemucs4 FT Vocals/config.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Selects the model type and points at the model's YAML config and checkpoint.
+# NOTE(review): both paths are absolute Windows paths under
+# C:\Users\user\Downloads, so they only resolve on a machine with that exact
+# layout - confirm whether the consumer accepts repository-relative paths.
+model_type: 'htdemucs'
+config_path: 'C:\Users\user\Downloads\Music-Source-Separation-Training\data\HTDemucs4 FT Vocals\model\config_musdb18_htdemucs.yaml'
+start_checkpoint: 'C:\Users\user\Downloads\Music-Source-Separation-Training\data\HTDemucs4 FT Vocals\model\04573f0d-f3cf25b2.th'

data/HTDemucs4 FT Vocals/model/04573f0d-f3cf25b2.th ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3cf25b222c4eed7cd49dd8b2c9597d50c18bd154090f7b919cfa5f93cf22c49
+size 84141271

data/HTDemucs4 FT Vocals/model/config_musdb18_htdemucs.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+# Audio chunking parameters.
+audio:
+  chunk_size: 485100 # training.samplerate * training.segment (44100 * 11)
+  min_mean_abs: 0.001
+  hop_length: 1024
+# Training settings. Four instruments are listed and target_instrument is
+# null, i.e. no single target stem is selected here.
+# samplerate * segment (44100 * 11) gives audio.chunk_size.
+training:
+  batch_size: 8
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  segment: 11
+  shift: 1
+  samplerate: 44100
+  channels: 2
+  normalize: true
+  instruments: ['drums', 'bass', 'other', 'vocals']
+  target_instrument: null
+  num_epochs: 1000
+  num_steps: 1000
+  optimizer: adam
+  lr: 9.0e-05
+  patience: 2
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  other_fix: false # needed when checking on the multisong dataset whether "other" is actually the instrumental
+  use_amp: true # enable or disable mixed precision (float16) - usually this must be true
+# Training-time augmentation switches and their ranges.
+augmentations:
+  enable: true # master switch for all augmentations (quick way to disable them)
+  loudness: true # randomly change the loudness of each stem within (loudness_min; loudness_max)
+  loudness_min: 0.5
+  loudness_max: 1.5
+# Inference settings.
+inference:
+  num_overlap: 4
+  batch_size: 8
+# NOTE(review): presumably names the section below that holds the model's
+# constructor arguments - confirm against the loader.
+model: htdemucs
+# Values are written canonically so every YAML loader agrees on their types:
+# explicit null instead of a bare empty value, lowercase true/false, and a
+# float mantissa with a decimal point.
+htdemucs:  # see demucs/htdemucs.py for a detailed description
+  # Channels
+  channels: 48
+  channels_time: null
+  growth: 2
+  # STFT
+  num_subbands: 1
+  nfft: 4096
+  wiener_iters: 0
+  end_iters: 0
+  wiener_residual: false
+  cac: true
+  # Main structure
+  depth: 4
+  rewrite: true
+  # Frequency Branch
+  multi_freqs: []
+  multi_freqs_depth: 3
+  freq_emb: 0.2
+  emb_scale: 10
+  emb_smooth: true
+  # Convolutions
+  kernel_size: 8
+  stride: 4
+  time_stride: 2
+  context: 1
+  context_enc: 0
+  # normalization
+  norm_starts: 4
+  norm_groups: 4
+  # DConv residual branch
+  dconv_mode: 3
+  dconv_depth: 2
+  dconv_comp: 8
+  # Written with a decimal point: YAML 1.1 loaders (e.g. PyYAML) read a bare
+  # "1e-3" as a string, not a float.
+  dconv_init: 1.0e-3
+  # Before the Transformer
+  bottom_channels: 512
+  # CrossTransformer
+  # ------ Common to all
+  # Regular parameters
+  t_layers: 5
+  t_hidden_scale: 4.0
+  t_heads: 8
+  t_dropout: 0.0
+  t_layer_scale: true
+  t_gelu: true
+  # ------------- Positional Embedding
+  t_emb: sin
+  t_max_positions: 10000  # for the scaled embedding
+  t_max_period: 10000.0
+  t_weight_pos_embed: 1.0
+  t_cape_mean_normalize: true
+  t_cape_augment: true
+  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
+  t_sin_random_shift: 0
+  # ------------- norm before a transformer encoder
+  t_norm_in: true
+  t_norm_in_group: false
+  # ------------- norm inside the encoder
+  t_group_norm: false
+  t_norm_first: true
+  t_norm_out: true
+  # ------------- optim
+  t_weight_decay: 0.0
+  t_lr: null
+  # ------------- sparsity
+  t_sparse_self_attn: false
+  t_sparse_cross_attn: false
+  t_mask_type: diag
+  t_mask_random_seed: 42
+  t_sparse_attn_window: 400
+  t_global_window: 100
+  t_sparsity: 0.95
+  t_auto_sparsity: false
+  # Cross Encoder First (false)
+  t_cross_first: false
+  # Weight init
+  rescale: 0.1

data/MelBand Roformer (anvuew edition)/config.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Selects the model type and points at the model's YAML config and checkpoint.
+# NOTE(review): both paths are absolute Windows paths under
+# C:\Users\user\Downloads, so they only resolve on a machine with that exact
+# layout - confirm whether the consumer accepts repository-relative paths.
+model_type: 'mel_band_roformer'
+config_path: 'C:\Users\user\Downloads\Music-Source-Separation-Training\data\MelBand Roformer (anvuew edition)\model\dereverb_mel_band_roformer_anvuew.yaml'
+start_checkpoint: 'C:\Users\user\Downloads\Music-Source-Separation-Training\data\MelBand Roformer (anvuew edition)\model\dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt'

data/MelBand Roformer (anvuew edition)/model/dereverb_mel_band_roformer_anvuew.yaml ADDED Viewed

	@@ -0,0 +1,76 @@

+# Audio chunking / STFT parameters.
+# chunk_size equals 8 * sample_rate (8 * 44100) and also hop_length * 800
+# (441 * 800).
+audio:
+  chunk_size: 352800
+  dim_f: 1024
+  dim_t: 256
+  hop_length: 441
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.000
+# Model hyper-parameters.
+# Booleans are written as lowercase true/false throughout (the canonical YAML
+# form), matching `stereo` and the rest of this file.
+# The !!python/tuple tag is PyYAML-specific: safe loaders reject it, so this
+# file must be read with a loader that supports Python tags.
+model:
+  dim: 384
+  depth: 6
+  stereo: true
+  num_stems: 1
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  num_bands: 60
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0
+  ff_dropout: 0
+  flash_attn: true
+  # Equals stft_n_fft / 2 + 1 (2048 / 2 + 1).
+  dim_freqs_in: 1025
+  sample_rate: 44100  # needed for mel filter bank from librosa
+  stft_n_fft: 2048
+  stft_hop_length: 441
+  stft_win_length: 2048
+  stft_normalized: false
+  mask_estimator_depth: 2
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple
+  - 4096
+  - 2048
+  - 1024
+  - 512
+  - 256
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+# Training settings. target_instrument ("noreverb") is one of the two entries
+# listed under instruments.
+training:
+  batch_size: 3
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments:
+  - noreverb
+  - reverb
+  lr: 5.0e-05
+  patience: 2
+  reduce_factor: 0.95
+  target_instrument: noreverb
+  num_epochs: 1000
+  num_steps: 4000
+  q: 0.95
+  coarse_loss_clip: false
+  ema_momentum: 0.999
+  optimizer: adamw
+  other_fix: true # needed when checking on the multisong dataset whether "other" is actually the instrumental
+  use_amp: true # enable or disable mixed precision (float16) - usually this must be true
+# Training-time augmentation switches and their ranges.
+# NOTE(review): mixup is false here, so the mixup_* values below are presumably
+# unused - confirm against the training code.
+augmentations:
+  enable: true # master switch for all augmentations (quick way to disable them)
+  loudness: true # randomly change the loudness of each stem within (loudness_min; loudness_max)
+  loudness_min: 0.1
+  loudness_max: 1.0
+  mixup: false # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
+  mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+    - 0.2
+    - 0.02
+  mixup_loudness_min: 0.5
+  mixup_loudness_max: 1.5
+# Inference settings.
+# audio.hop_length * (dim_t - 1) = 441 * 800 = 352800, which is audio.chunk_size.
+inference:
+  batch_size: 1
+  dim_t: 801
+  num_overlap: 2

data/MelBand Roformer (anvuew edition)/model/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9262877b87e9ebb0fb808a456b0a411fa677f5df31c8383c1254af531c078970
+size 913107578