cemsubakan committed on
Commit
c858225
1 Parent(s): da0772f

adding the .ckpt files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ psi_model.ckpt filter=lfs diff=lfs merge=lfs -text
37
+ classifier.ckpt filter=lfs diff=lfs merge=lfs -text
38
+ embedding_model.ckpt filter=lfs diff=lfs merge=lfs -text
39
+ embedding_modelft.ckpt filter=lfs diff=lfs merge=lfs -text
.hyperparams.yaml.swp ADDED
Binary file (16.4 kB). View file
 
classifier.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763e786dac8c54c37de3266842591291482f93f94ccdae461ed531b479cd4b7b
3
+ size 242797
embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69e2ff27d9df0e462b15e39ae9dbbdc413e6e768a89f871e6c197dffedc00ecf
3
+ size 15264103
embedding_modelft.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47e0c45521b584990126d77c3cae3cb7031070beda400628aad8ca0ba227f162
3
+ size 15264903
hyperparams.yaml ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2023-07-14 from:
2
+ # /data2/cloned_repos/speechbrain-clone/recipes/ESC50/interpret/hparams/piq.yaml
3
+ # yamllint disable
4
+ # #################################
5
+ # The recipe for training PIQ on the ESC50 dataset.
6
+ #
7
+ # Author:
8
+ # * Cem Subakan 2022, 2023
9
+ # * Francesco Paissan 2022, 2023
10
+ # (based on the SpeechBrain UrbanSound8k recipe)
11
+ # #################################
12
+
13
+ # Seed needs to be set at top of yaml, before objects with parameters are made
14
+ seed: 1234
15
+ __set_seed: !!python/object/apply:torch.manual_seed [1234]
16
+
17
+ # Set up folders for reading from and writing to
18
+ # Dataset must already exist at `audio_data_folder`
19
+ data_folder: /data2/ESC-50-master
20
+ # e.g., /localscratch/ESC-50-master
21
+ audio_data_folder: /data2/ESC-50-master/audio
22
+
23
+ experiment_name: piq
24
+ output_folder: ./results/piq/1234
25
+ save_folder: ./results/piq/1234/save
26
+ train_log: ./results/piq/1234/train_log.txt
27
+
28
+ test_only: false
29
+ save_interpretations: true
30
+ interpret_period: 10
31
+
32
+ # Tensorboard logs
33
+ use_tensorboard: false
34
+ tensorboard_logs_folder: ./results/piq/1234/tb_logs/
35
+
36
+ # Path where data manifest files will be stored
37
+ train_annotation: /data2/ESC-50-master/manifest/train.json
38
+ valid_annotation: /data2/ESC-50-master/manifest/valid.json
39
+ test_annotation: /data2/ESC-50-master/manifest/test.json
40
+
41
+ # To standardize results, ESC-50 has pre-separated samples into
42
+ # 5 folds for multi-fold validation
43
+ train_fold_nums: [1, 2, 3]
44
+ valid_fold_nums: [4]
45
+ test_fold_nums: [5]
46
+ skip_manifest_creation: false
47
+
48
+ ckpt_interval_minutes: 15 # save checkpoint every N min
49
+
50
+ # Training parameters
51
+ number_of_epochs: 200
52
+ batch_size: 16
53
+ lr: 0.0002
54
+ sample_rate: 16000
55
+ use_vq: true
56
+ rec_loss_coef: 1
57
+ use_mask_output: true
58
+ mask_th: 0.35
59
+
60
+ device: cuda
61
+
62
+ # Feature parameters
63
+ n_mels: 80
64
+
65
+ # Number of classes
66
+ out_n_neurons: 50
67
+
68
+ shuffle: true
69
+ dataloader_options:
70
+ batch_size: 16
71
+ shuffle: true
72
+ num_workers: 0
73
+
74
+ epoch_counter: &id001 !new:speechbrain.utils.epoch_loop.EpochCounter
75
+
76
+ limit: 200
77
+
78
+ opt_class: !name:torch.optim.Adam
79
+ lr: 0.0002
80
+ weight_decay: 0.000002
81
+
82
+ lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
83
+ factor: 0.5
84
+ patience: 3
85
+ dont_halve_until_epoch: 100
86
+
87
+ # Logging + checkpoints
88
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
89
+ save_file: ./results/piq/1234/train_log.txt
90
+
91
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
92
+ checkpoints_dir: ./results/piq/1234/save
93
+ recoverables:
94
+ psi_model: &id004 !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
95
+ dim: 256
96
+ K: 1024
97
+ shared_keys: 0
98
+ activate_class_partitioning: true
99
+ use_adapter: true
100
+ adapter_reduce_dim: true
101
+
102
+ counter: *id001
103
+ use_pretrained: true
104
+
105
+ # embedding_model: !new:custom_models.Conv2dEncoder_v2
106
+ embedding_model: &id002 !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
107
+ dim: 256
108
+
109
+ classifier: &id003 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
110
+ input_size: 256
111
+ out_neurons: 50
112
+ lin_blocks: 1
113
+
114
+
115
+ # Interpretation hyperparams
116
+ K: 1024
117
+
118
+ # pre-processing
119
+ n_fft: 1024
120
+ spec_mag_power: 0.5
121
+ hop_length: 11.6099
122
+ win_length: 23.2199
123
+ compute_stft: &id005 !new:speechbrain.processing.features.STFT
124
+ n_fft: 1024
125
+ hop_length: 11.6099
126
+ win_length: 23.2199
127
+ sample_rate: 16000
128
+
129
+ compute_fbank: &id006 !new:speechbrain.processing.features.Filterbank
130
+ n_mels: 80
131
+ n_fft: 1024
132
+ sample_rate: 16000
133
+
134
+ compute_istft: &id007 !new:speechbrain.processing.features.ISTFT
135
+ sample_rate: 16000
136
+ hop_length: 11.6099
137
+ win_length: 23.2199
138
+
139
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
140
+ psi_model: *id004
141
+ modules:
142
+ compute_stft: *id005
143
+ compute_fbank: *id006
144
+ compute_istft: *id007
145
+ psi: *id004
146
+ embedding_model: !ref <embedding_model>
147
+ classifier: !ref <classifier>
148
+
149
+ embedding_model_path: fpaissan/conv2d_us8k/embedding_modelft.ckpt
150
+ classifier_model_path: fpaissan/conv2d_us8k/classifier.ckpt
151
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
152
+ loadables:
153
+ embedding_model: !ref <embedding_model>
154
+ classifier: !ref <classifier>
155
+ psi: !ref <psi_model>
156
+ label_encoder: !ref <label_encoder>
157
+ paths:
158
+ embedding_model: fpaissan/conv2d_us8k/embedding_modelft.ckpt
159
+ classifier: fpaissan/conv2d_us8k/classifier.ckpt
160
+ psi: /data2/PIQ-ESC50/psi_model.ckpt
161
+ label_encoder: speechbrain/cnn14-esc50/label_encoder.txt
mix.wav ADDED
Binary file (320 kB). View file
 
mix_cry.wav ADDED
Binary file (320 kB). View file
 
mix_dog.wav ADDED
Binary file (320 kB). View file
 
psi_model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8e6a454b68906362735de1dc285e9c11bee729d30d81e8195506e5901bcce9
3
+ size 27196920