{ "_name_or_path": "facebook/wav2vec2-xls-r-300m", "activation_dropout": 0.0, "adapter_kernel_size": 3, "adapter_stride": 2, "add_adapter": false, "apply_spec_augment": true, "architectures": [ "Wav2Vec2ForSequenceClassification" ], "attention_dropout": 0.0, "bos_token_id": 1, "classifier_proj_size": 256, "codevector_dim": 768, "contrastive_logits_temperature": 0.1, "conv_bias": true, "conv_dim": [ 512, 512, 512, 512, 512, 512, 512 ], "conv_kernel": [ 10, 3, 3, 3, 3, 2, 2 ], "conv_stride": [ 5, 2, 2, 2, 2, 2, 2 ], "ctc_loss_reduction": "mean", "ctc_zero_infinity": false, "diversity_loss_weight": 0.1, "do_stable_layer_norm": true, "eos_token_id": 2, "feat_extract_activation": "gelu", "feat_extract_dropout": 0.0, "feat_extract_norm": "layer", "feat_proj_dropout": 0.0, "feat_quantizer_dropout": 0.0, "final_dropout": 0.0, "hidden_act": "gelu", "hidden_dropout": 0.0, "hidden_size": 1024, "id2label": { "0": "af_za", "1": "am_et", "2": "ar_eg", "3": "as_in", "4": "ast_es", "5": "az_az", "6": "be_by", "7": "bn_in", "8": "bs_ba", "9": "ca_es", "10": "ceb_ph", "11": "cmn_hans_cn", "12": "cs_cz", "13": "cy_gb", "14": "da_dk", "15": "de_de", "16": "el_gr", "17": "en_us", "18": "es_419", "19": "et_ee", "20": "fa_ir", "21": "ff_sn", "22": "fi_fi", "23": "fil_ph", "24": "fr_fr", "25": "ga_ie", "26": "gl_es", "27": "gu_in", "28": "ha_ng", "29": "he_il", "30": "hi_in", "31": "hr_hr", "32": "hu_hu", "33": "hy_am", "34": "id_id", "35": "ig_ng", "36": "is_is", "37": "it_it", "38": "ja_jp", "39": "jv_id", "40": "ka_ge", "41": "kam_ke", "42": "kea_cv", "43": "kk_kz", "44": "km_kh", "45": "kn_in", "46": "ko_kr", "47": "ku_arab_iq", "48": "ky_kg", "49": "lb_lu", "50": "lg_ug", "51": "ln_cd", "52": "lo_la", "53": "lt_lt", "54": "luo_ke", "55": "lv_lv", "56": "mi_nz", "57": "mk_mk", "58": "ml_in", "59": "mn_mn", "60": "mr_in", "61": "ms_my", "62": "mt_mt", "63": "my_mm", "64": "nb_no", "65": "ne_np", "66": "nl_nl", "67": "nso_za", "68": "ny_mw", "69": "oci_fr", "70": "om_et", "71": "or_in", "72": "pa_in", "73": "pl_pl", "74": "ps_af", "75": "pt_br", "76": "ro_ro", "77": "ru_ru", "78": "rup_bg", "79": "sd_arab_in", "80": "sk_sk", "81": "sl_si", "82": "sn_zw", "83": "so_so", "84": "sr_rs", "85": "sv_se", "86": "sw_ke", "87": "ta_in", "88": "te_in", "89": "tg_tj", "90": "th_th", "91": "tr_tr", "92": "uk_ua", "93": "umb_ao", "94": "ur_pk", "95": "uz_uz", "96": "vi_vn", "97": "wo_sn", "98": "xh_za", "99": "yo_ng", "100": "yue_hant_hk", "101": "zu_za" }, "initializer_range": 0.02, "intermediate_size": 4096, "label2id": { "af_za": 0, "am_et": 1, "ar_eg": 2, "as_in": 3, "ast_es": 4, "az_az": 5, "be_by": 6, "bn_in": 7, "bs_ba": 8, "ca_es": 9, "ceb_ph": 10, "cmn_hans_cn": 11, "cs_cz": 12, "cy_gb": 13, "da_dk": 14, "de_de": 15, "el_gr": 16, "en_us": 17, "es_419": 18, "et_ee": 19, "fa_ir": 20, "ff_sn": 21, "fi_fi": 22, "fil_ph": 23, "fr_fr": 24, "ga_ie": 25, "gl_es": 26, "gu_in": 27, "ha_ng": 28, "he_il": 29, "hi_in": 30, "hr_hr": 31, "hu_hu": 32, "hy_am": 33, "id_id": 34, "ig_ng": 35, "is_is": 36, "it_it": 37, "ja_jp": 38, "jv_id": 39, "ka_ge": 40, "kam_ke": 41, "kea_cv": 42, "kk_kz": 43, "km_kh": 44, "kn_in": 45, "ko_kr": 46, "ku_arab_iq": 47, "ky_kg": 48, "lb_lu": 49, "lg_ug": 50, "ln_cd": 51, "lo_la": 52, "lt_lt": 53, "luo_ke": 54, "lv_lv": 55, "mi_nz": 56, "mk_mk": 57, "ml_in": 58, "mn_mn": 59, "mr_in": 60, "ms_my": 61, "mt_mt": 62, "my_mm": 63, "nb_no": 64, "ne_np": 65, "nl_nl": 66, "nso_za": 67, "ny_mw": 68, "oci_fr": 69, "om_et": 70, "or_in": 71, "pa_in": 72, "pl_pl": 73, "ps_af": 74, "pt_br": 75, "ro_ro": 76, "ru_ru": 77, "rup_bg": 78, "sd_arab_in": 79, "sk_sk": 80, "sl_si": 81, "sn_zw": 82, "so_so": 83, "sr_rs": 84, "sv_se": 85, "sw_ke": 86, "ta_in": 87, "te_in": 88, "tg_tj": 89, "th_th": 90, "tr_tr": 91, "uk_ua": 92, "umb_ao": 93, "ur_pk": 94, "uz_uz": 95, "vi_vn": 96, "wo_sn": 97, "xh_za": 98, "yo_ng": 99, "yue_hant_hk": 100, "zu_za": 101 }, "layer_norm_eps": 1e-05, "layerdrop": 0.0, "mask_feature_length": 10, "mask_feature_min_masks": 0, "mask_feature_prob": 0.0, "mask_time_length": 10, "mask_time_min_masks": 2, "mask_time_prob": 0.05, "model_type": "wav2vec2", "num_adapter_layers": 3, "num_attention_heads": 16, "num_codevector_groups": 2, "num_codevectors_per_group": 320, "num_conv_pos_embedding_groups": 16, "num_conv_pos_embeddings": 128, "num_feat_extract_layers": 7, "num_hidden_layers": 24, "num_negatives": 100, "output_hidden_size": 1024, "pad_token_id": 0, "proj_codevector_dim": 768, "tdnn_dilation": [ 1, 2, 3, 1, 1 ], "tdnn_dim": [ 512, 512, 512, 512, 1500 ], "tdnn_kernel": [ 5, 3, 3, 1, 1 ], "torch_dtype": "float16", "transformers_version": "4.18.0.dev0", "use_weighted_layer_sum": false, "vocab_size": 32, "xvector_output_dim": 512 }