novateur committed on
Commit
ad2ccba
·
verified ·
1 Parent(s): 26ed1a4

Update wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml

Browse files
wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml CHANGED
@@ -1,24 +1,24 @@
1
  seed_everything: 3407
2
 
3
  data:
4
- class_path: vocos.dataset.VocosDataModule
5
  init_args:
6
  train_params:
7
- filelist_path: /cpfs_speech/jishengpeng/Code/dataprocess/path/WavTokenizer/medium_train_audio_music
8
  sampling_rate: 24000
9
  num_samples: 72000
10
  batch_size: 39 #18
11
  num_workers: 8
12
 
13
  val_params:
14
- filelist_path: /cpfs_speech/jishengpeng/Code/dataprocess/path/WavTokenizer/medium_test_audio_music
15
  sampling_rate: 24000
16
  num_samples: 72000
17
  batch_size: 2 # 10
18
  num_workers: 8
19
 
20
  model:
21
- class_path: vocos.experiment.VocosEncodecExp
22
  init_args:
23
  sample_rate: 24000
24
  initial_learning_rate: 2e-4
@@ -32,12 +32,12 @@ model:
32
  evaluate_pesq: true
33
  evaluate_periodicty: true
34
 
35
- resume: true
36
  resume_config: /cpfs_speech/jishengpeng/Code/WavTokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn.yaml
37
- resume_model: /cpfs_speech/jishengpeng/Code/WavTokenizer/result/train/wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn/lightning_logs/version_2/checkpoints/vocos_checkpoint_epoch=1_step=45240_val_loss=8.8358.ckpt
38
 
39
  feature_extractor:
40
- class_path: vocos.feature_extractors.EncodecFeatures
41
  init_args:
42
  encodec_model: encodec_24khz
43
  bandwidths: [6.6, 6.6, 6.6, 6.6]
@@ -48,7 +48,7 @@ model:
48
  vq_kmeans: 200
49
 
50
  backbone:
51
- class_path: vocos.models.VocosBackbone
52
  init_args:
53
  input_channels: 512
54
  dim: 768
@@ -57,18 +57,18 @@ model:
57
  adanorm_num_embeddings: 4 # len(bandwidths)
58
 
59
  head:
60
- class_path: vocos.heads.ISTFTHead
61
  init_args:
62
  dim: 768
63
- n_fft: 1280 #4*hop_length
64
- hop_length: 320 # 8*5*4*2
65
  padding: same
66
 
67
  trainer:
68
  logger:
69
  class_path: pytorch_lightning.loggers.TensorBoardLogger
70
  init_args:
71
- save_dir: /cpfs_speech/jishengpeng/Code/WavTokenizer/result/train/wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn/
72
  callbacks:
73
  - class_path: pytorch_lightning.callbacks.LearningRateMonitor
74
  - class_path: pytorch_lightning.callbacks.ModelSummary
@@ -77,10 +77,10 @@ trainer:
77
  - class_path: pytorch_lightning.callbacks.ModelCheckpoint
78
  init_args:
79
  monitor: val_loss
80
- filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
81
  save_top_k: 10
82
  save_last: true
83
- - class_path: vocos.helpers.GradNormCallback
84
 
85
  # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
86
  # This equals to 1M steps per generator and 1M per discriminator
 
1
  seed_everything: 3407
2
 
3
  data:
4
+ class_path: decoder.dataset.VocosDataModule
5
  init_args:
6
  train_params:
7
+ filelist_path: ./WavTokenizer/medium_train_audio_music
8
  sampling_rate: 24000
9
  num_samples: 72000
10
  batch_size: 39 #18
11
  num_workers: 8
12
 
13
  val_params:
14
+ filelist_path: ./WavTokenizer/medium_test_audio_music
15
  sampling_rate: 24000
16
  num_samples: 72000
17
  batch_size: 2 # 10
18
  num_workers: 8
19
 
20
  model:
21
+ class_path: decoder.experiment.VocosEncodecExp
22
  init_args:
23
  sample_rate: 24000
24
  initial_learning_rate: 2e-4
 
32
  evaluate_pesq: true
33
  evaluate_periodicty: true
34
 
35
+ resume: false
36
  resume_config: /cpfs_speech/jishengpeng/Code/WavTokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn.yaml
37
+ resume_model: /cpfs_speech/jishengpeng/Code/WavTokenizer/result/train/wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn/lightning_logs/version_2/checkpoints/example.ckpt
38
 
39
  feature_extractor:
40
+ class_path: decoder.feature_extractors.EncodecFeatures
41
  init_args:
42
  encodec_model: encodec_24khz
43
  bandwidths: [6.6, 6.6, 6.6, 6.6]
 
48
  vq_kmeans: 200
49
 
50
  backbone:
51
+ class_path: decoder.models.VocosBackbone
52
  init_args:
53
  input_channels: 512
54
  dim: 768
 
57
  adanorm_num_embeddings: 4 # len(bandwidths)
58
 
59
  head:
60
+ class_path: decoder.heads.ISTFTHead
61
  init_args:
62
  dim: 768
63
+ n_fft: 1280
64
+ hop_length: 320
65
  padding: same
66
 
67
  trainer:
68
  logger:
69
  class_path: pytorch_lightning.loggers.TensorBoardLogger
70
  init_args:
71
+ save_dir: ./WavTokenizer/result/train/wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn/
72
  callbacks:
73
  - class_path: pytorch_lightning.callbacks.LearningRateMonitor
74
  - class_path: pytorch_lightning.callbacks.ModelSummary
 
77
  - class_path: pytorch_lightning.callbacks.ModelCheckpoint
78
  init_args:
79
  monitor: val_loss
80
+ filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
81
  save_top_k: 10
82
  save_last: true
83
+ - class_path: decoder.helpers.GradNormCallback
84
 
85
  # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
86
  # This equals to 1M steps per generator and 1M per discriminator