wav2vec2-large-xlsr-53 / config.json
1 {
2 "activation_dropout": 0.0,
3 "apply_spec_augment": true,
4 "architectures": [
5 "Wav2Vec2ForPreTraining"
6 ],
7 "attention_dropout": 0.1,
8 "bos_token_id": 1,
9 "codevector_dim": 768,
10 "contrastive_logits_temperature": 0.1,
11 "conv_bias": true,
12 "conv_dim": [
13 512,
14 512,
15 512,
16 512,
17 512,
18 512,
19 512
20 ],
21 "conv_kernel": [
22 10,
23 3,
24 3,
25 3,
26 3,
27 2,
28 2
29 ],
30 "conv_stride": [
31 5,
32 2,
33 2,
34 2,
35 2,
36 2,
37 2
38 ],
39 "ctc_loss_reduction": "sum",
40 "ctc_zero_infinity": false,
41 "diversity_loss_weight": 0.1,
42 "do_stable_layer_norm": true,
43 "eos_token_id": 2,
44 "feat_extract_activation": "gelu",
45 "feat_extract_dropout": 0.0,
46 "feat_extract_norm": "layer",
47 "feat_proj_dropout": 0.1,
48 "feat_quantizer_dropout": 0.0,
49 "final_dropout": 0.0,
50 "gradient_checkpointing": false,
51 "hidden_act": "gelu",
52 "hidden_dropout": 0.1,
53 "hidden_size": 1024,
54 "initializer_range": 0.02,
55 "intermediate_size": 4096,
56 "layer_norm_eps": 1e-05,
57 "layerdrop": 0.1,
58 "mask_channel_length": 10,
59 "mask_channel_min_space": 1,
60 "mask_channel_other": 0.0,
61 "mask_channel_prob": 0.0,
62 "mask_channel_selection": "static",
63 "mask_feature_length": 10,
64 "mask_feature_prob": 0.0,
65 "mask_time_length": 10,
66 "mask_time_min_space": 1,
67 "mask_time_other": 0.0,
68 "mask_time_prob": 0.075,
69 "mask_time_selection": "static",
70 "model_type": "wav2vec2",
71 "num_attention_heads": 16,
72 "num_codevector_groups": 2,
73 "num_codevectors_per_group": 320,
74 "num_conv_pos_embedding_groups": 16,
75 "num_conv_pos_embeddings": 128,
76 "num_feat_extract_layers": 7,
77 "num_hidden_layers": 24,
78 "num_negatives": 100,
79 "pad_token_id": 0,
80 "proj_codevector_dim": 768,
81 "transformers_version": "4.7.0.dev0",
82 "vocab_size": 32
83 }
84