jukebox-dummy / config.json
ArthurZ's picture
ArthurZ HF staff
Update config.json
e03afff
{
"activation_function": "gelu_new",
"alignment_head": [
2,
null,
null
],
"alignment_layer": [
68,
null,
null
],
"architectures": [
"JukeboxModel"
],
"attn_dropout": 0.0,
"attn_init_scale": 1.0,
"attn_order": [
12,
2,
2
],
"blocks": 16,
"bos_token_id": 50256,
"c_res": 1,
"cond_c_res": [
0,
1,
1
],
"cond_depth": [
3,
16,
16
],
"cond_dilation_cycle": [
null,
8,
8
],
"cond_dilation_growth_rate": [
1,
3,
3
],
"cond_m_conv": 1,
"cond_res_scale": false,
"cond_width": [
128,
128,
64
],
"cond_zero_out": false,
"copy_input": false,
"depth": [
2,
2,
2
],
"downs_t": [
3,
2,
2
],
"emb_dropout": 0.1,
"eos_token_id": 50256,
"fp16_params": true,
"hop_length": 256,
"init_scale": [
0.7,
1,
1
],
"initializer_range": 0.02,
"l_bins": 128,
"labels": true,
"layer_norm_epsilon": 1e-05,
"m_attn": 0.25,
"max_bow_genre_size": 1,
"max_duration": 600.0,
"merged_decoder": [
true,
false,
false
],
"min_duration": 1,
"mlp_init_scale": 0.02,
"model_type": "jukebox",
"multispec_loss_hop_length": [
240,
120,
50
],
"multispec_loss_n_fft": [
2048,
1024,
512
],
"multispec_loss_window_size": [
1200,
600,
240
],
"multispectral": 1.0,
"n_ctx": [256,256,256],
"n_embd": 768,
"n_head": 12,
"n_heads": [
2,
1,
1
],
"n_inner": null,
"n_layer": 12,
"n_positions": 1024,
"n_tokens": [
512,
0,
0
],
"n_vocab": 79,
"name": "AudioSamples",
"nb_priors": 3,
"pos_init": false,
"prime_attn_dropout": 0.0,
"prime_attn_order": [
2,
0,
0
],
"prime_blocks": 32,
"prime_c_res": 1,
"prime_cond_c_res": [
0,
1,
1
],
"prime_depth": [
18,
3,
3
],
"prime_emb_dropout": 0.0,
"prime_heads": 4,
"prime_init_scale": [
0.1,
0.4,
0.4
],
"prime_loss_fraction": [
0.4,
0.0,
0.0
],
"prime_m_attn": 0.25,
"prime_m_mlp": 1.0,
"prime_pos_init": false,
"prime_res_scale": false,
"prime_resid_dropout": 0.0,
"prime_spread": null,
"prime_width": [
128,
128,
128
],
"prime_zero_out": false,
"priors_width": [
128,
64,
32
],
"reorder_and_upcast_attn": false,
"res_scale": false,
"resid_dropout": 0.0,
"sample_hop_length": 30000,
"sample_length": 44032,
"sample_length_in_seconds": 1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"single_enc_dec": [
true,
false,
false
],
"spectral": 0.0,
"spread": null,
"sr": 44100,
"strides_t": [
2,
2,
2
],
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"t_bins": 64,
"torch_dtype": "float32",
"transformers_version": "4.19.0.dev0",
"use_cache": true,
"use_nonrelative_specloss": true,
"use_tokens": [
true,
false,
false
],
"vocab_size": 50257,
"vq_vae_codebook_dimension": 128,
"vq_vae_commit": 0.02,
"vq_vae_conv_block_depth": 4,
"vq_vae_conv_block_width": 64,
"vq_vae_depth": 4,
"vq_vae_dilation_cycle": null,
"vq_vae_dilation_growth_rate": 3,
"vq_vae_downs_t": [
3,
2,
2
],
"vq_vae_emmbedding_width": 128,
"vq_vae_levels": 3,
"vq_vae_lmu": 0.99,
"vq_vae_m_conv": 1,
"vq_vae_multipliers": [
2,
1,
1
],
"vq_vae_reverse_decoder_dilation": 1,
"vq_vae_strides_t": [
2,
2,
2
],
"vq_vae_width": 64,
"vqvae_z_shapes": [
[
344
],
[
1376
],
[
5504
]
],
"width": [
128,
64,
32
],
"y_bins": [
[120,4111],[120,4111],[120,4111]
],
"zero_out": false
}