# Experiment configuration, expressed as plain module-level constants.
# NOTE(review): the identifiers suggest an HTS-AT audio-classification setup;
# how each value is consumed is not visible in this file -- confirm against
# the code that imports this module.

# --- Experiment identity and filesystem locations ----------------------------
# Label for this experiment (presumably used to name outputs -- TODO confirm).
exp_name = "exp_htsat_2048d"

# Absolute, machine-specific locations; adjust them for the local environment.
workspace = "/home/kechen/Research/HTSAT"
dataset_path = "/home/Research/audioset"
desed_folder = "/home/Research/DESED"

# --- Dataset, loss and run-mode selection -------------------------------------
# Dataset selector string (the other accepted values are defined by the
# consuming code and are not visible here).
dataset_type = "audioset"

# Loss selector string (accepted values are likewise defined elsewhere).
loss_type = "clip_bce"

# Boolean switch; presumably enables class-balanced sampling -- TODO confirm.
balanced_data = True

# Machine-specific checkpoint path; presumably loaded to resume or initialise
# the model -- TODO confirm.
resume_checkpoint = "/home/kechen/Research/Latent_ASP/model_backup/htsat_audioset_2048d.ckpt"

# Fold index; presumably only relevant for an ESC-style dataset -- TODO confirm.
esc_fold = 0

# Debug switch, off by default.
debug = False

# --- Training hyperparameters --------------------------------------------------
random_seed = 970131

# Evaluates to 128.
# NOTE(review): the 32 * 4 factorisation presumably means
# "per-device batch * number of devices" -- confirm before changing one factor.
batch_size = 32 * 4

learning_rate = 1e-3
max_epoch = 100
num_workers = 3

# Two parallel three-element lists; presumably epoch milestones and the
# learning-rate factor applied at each one -- TODO confirm with the scheduler.
lr_scheduler_epoch = [10, 20, 30]
lr_rate = [0.02, 0.05, 0.1]

# --- Data-preparation options ---------------------------------------------------
# All boolean switches in this group are off.
enable_token_label = False

# Relative path to a NumPy file (resolved against the caller's working
# directory unless the consuming code joins it with another path).
class_map_path = "class_hier_map.npy"

# None here; presumably means "no class filtering" -- TODO confirm.
class_filter = None

# 100 integer indices. The value 8260 appears twice; the duplicate is kept
# exactly as found because removing it would change the list's length.
# NOTE(review): confirm whether the duplicate is intentional.
retrieval_index = [
    15382, 9202, 130, 17618, 17157, 17516, 16356, 6165, 13992, 9238,
    5550, 5733, 1914, 1600, 3450, 13735, 11108, 3762, 9840, 11318,
    8131, 4429, 16748, 4992, 16783, 12691, 4945, 8779, 2805, 9418,
    2797, 14357, 5603, 212, 3852, 12666, 1338, 10269, 2388, 8260,
    4293, 14454, 7677, 11253, 5060, 14938, 8840, 4542, 2627, 16336,
    8992, 15496, 11140, 446, 6126, 10691, 8624, 10127, 9068, 16710,
    10155, 14358, 7567, 5695, 2354, 8057, 17635, 133, 16183, 14535,
    7248, 4560, 14429, 2463, 10773, 113, 2462, 9223, 4929, 14274,
    4716, 17307, 4617, 2132, 11083, 1039, 1403, 9621, 13936, 2229,
    2875, 17840, 9359, 13311, 9790, 13288, 4750, 17052, 8260, 14900,
]

# Two-element [low, high] pair; presumably a range used together with
# enable_token_label -- TODO confirm.
token_label_range = [0.2, 0.6]

enable_time_shift = False
enable_label_enhance = False
enable_repeat_mode = False

# --- Model switch ----------------------------------------------------------------
# On by default. NOTE(review): what "tscam" toggles is defined by the model
# code, not visible here.
enable_tscam = True

# --- Audio front-end constants -----------------------------------------------------
sample_rate = 32000

# Derived: 10 * sample_rate = 320000 samples (10 seconds if sample_rate is in
# samples per second -- TODO confirm the unit).
clip_samples = sample_rate * 10

# Presumably spectrogram parameters (window and hop in samples, number of mel
# bins, frequency bounds) -- TODO confirm units against the feature extractor.
window_size = 1024
hop_size = 320
mel_bins = 64
fmin = 50
fmax = 14000

# Derived: half of clip_samples, truncated to an int (160000).
shift_max = int(clip_samples * 0.5)

# --- Classification head / input geometry ------------------------------------------
# Number of output classes; presumably must match the dataset chosen in
# dataset_type -- TODO confirm.
classes_num = 527

# Two-element tuple. NOTE(review): axis order is not visible here.
patch_size = (25, 4)

# None here; presumably means "no cropping" -- TODO confirm.
crop_size = None

# --- htsat_* model hyperparameters ---------------------------------------------------
htsat_window_size = 8
htsat_spec_size = 256
htsat_patch_size = 4
htsat_stride = (4, 4)

# Both lists have four entries; presumably one entry per model stage (heads and
# block count respectively) -- TODO confirm against the model definition.
htsat_num_head = [4, 8, 16, 32]
htsat_dim = 256
htsat_depth = [2, 2, 6, 2]

# None here; presumably a path to pretrained weights when set -- TODO confirm.
swin_pretrain_path = None

# --- Optional htsat_* output modes ---------------------------------------------------
# All three switches are off. NOTE(review): their exact effect is defined by
# the model code, not visible here.
htsat_attn_heatmap = False
htsat_hier_output = False
htsat_use_max = False

# --- Ensemble settings -----------------------------------------------------------------
# Both lists are empty here.
# NOTE(review): presumably parallel lists (one stride per checkpoint) -- confirm.
ensemble_checkpoints = []
ensemble_strides = []

# --- wa_* / esm_* checkpoint locations ---------------------------------------------------
# NOTE(review): "wa" presumably stands for weight averaging -- confirm.
# wa_folder ends with a trailing slash; wa_model_path is a bare file name.
wa_folder = "/home/version_0/checkpoints/"
wa_model_path = "HTSAT_AudioSet_Saved_x.ckpt"

# Six machine-specific checkpoint paths, numbered 1 through 6.
# The identifier's spelling ("pathes") is kept because other modules may
# reference it by this exact name.
esm_model_pathes = [
    "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_1.ckpt",
    "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_2.ckpt",
    "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_3.ckpt",
    "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_4.ckpt",
    "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_5.ckpt",
    "/home/Research/model_backup/AudioSet/HTSAT_AudioSet_Saved_6.ckpt",
]

# --- Heatmap / test output and fl_* settings ------------------------------------------------
heatmap_dir = "/home/Research/heatmap_output"
test_file = "htsat-test-ensemble"

# NOTE(review): "fl" presumably stands for framewise localization -- confirm.
fl_local = False
fl_dataset = "/home/Research/desed/desed_eval.npy"

# Despite the "_num" suffix this holds ten class-name strings, not a count.
fl_class_num = [
    "Speech", "Frying", "Dishes", "Running_water",
    "Blender", "Electric_shaver_toothbrush", "Alarm_bell_ringing",
    "Cat", "Dog", "Vacuum_cleaner",
]

# Ten lists of integer indices, the same length as fl_class_num; presumably
# entry i lists the AudioSet class indices that map onto fl_class_num[i]
# -- TODO confirm. Per-row name comments below rely on that assumption.
fl_audioset_mapping = [
    [0, 1, 2, 3, 4, 5, 6, 7],                                  # Speech
    [366, 367, 368],                                           # Frying
    [364],                                                     # Dishes
    [288, 289, 290, 291, 292, 293, 294, 295, 296, 297],        # Running_water
    [369],                                                     # Blender
    [382],                                                     # Electric_shaver_toothbrush
    [310, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402],  # Alarm_bell_ringing
    [81, 82, 83, 84, 85],                                      # Cat
    [74, 75, 76, 77, 78, 79],                                  # Dog
    [377],                                                     # Vacuum_cleaner
]