---
# Image-to-text model: ViT encoder + GPT-2 decoder (VisionEncoderDecoder-style).
name: vit_gpt2_image2text
config_type: model

# Vision Transformer (ViT-Base/16) encoder configuration.
encoder:
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  intermediate_size: 3072
  hidden_act: gelu
  hidden_dropout_prob: 0.0
  attention_probs_dropout_prob: 0.0
  initializer_range: 0.02
  layer_norm_eps: 1.0e-12
  image_size: 224
  patch_size: 16
  num_channels: 3
  qkv_bias: true
  encoder_stride: 16

# GPT-2 decoder configuration; cross-attention enabled so the decoder
# attends to the encoder's image features.
decoder:
  add_cross_attention: true
  vocab_size: 42001
  attn_pdrop: 0.1
  bos_token_id: 5
  embd_pdrop: 0.1
  eos_token_id: 5
  gradient_checkpointing: false
  initializer_range: 0.02
  layer_norm_epsilon: 1.0e-05
  model_type: gpt2
  n_ctx: 1024
  n_embd: 768
  n_head: 12
  n_inner: null  # defaults to 4 * n_embd inside the model when null
  n_layer: 12
  n_positions: 1024
  resid_pdrop: 0.1
  summary_activation: false
  summary_first_dropout: 0.1
  use_cache: true

# Caption-generation (beam search) settings.
# NOTE(review): generation token ids (bos=0, eos=2, pad=1) differ from the
# decoder's bos/eos (5) — presumably intentional for this tokenizer; verify.
generation:
  bos_token_id: 0
  decoder_start_token_id: 0
  early_stopping: true
  eos_token_id: 2
  length_penalty: 2.0
  max_new_tokens: 24
  no_repeat_ngram_size: 3
  num_beams: 4
  pad_token_id: 1