# Page header captured along with the file (not configuration data):
#   arxyzan's picture
#   Hezar: Upload model_config.yaml
#   1c8a07a
# Top-level identification fields for this configuration.
# NOTE(review): presumably the consumer uses `name` to pick the model
# implementation and `config_type` to pick the config category — confirm
# against the loader.
name: vit_gpt2_image2text
config_type: model
# Encoder settings. The image_size / patch_size / num_channels keys show this
# is an image-patch encoder; presumably a ViT given the model name — confirm
# against the consumer.
# These keys must be nested under `encoder:`. Left at column 0 they make
# `encoder` null and collide with same-named keys of other sections
# (e.g. initializer_range).
encoder:
  hidden_size: 768  # same width as the decoder's n_embd (768)
  num_hidden_layers: 12
  num_attention_heads: 12
  intermediate_size: 3072  # 4 x hidden_size
  hidden_act: gelu
  hidden_dropout_prob: 0.0
  attention_probs_dropout_prob: 0.0
  initializer_range: 0.02
  layer_norm_eps: 1.0e-12
  image_size: 224
  patch_size: 16
  num_channels: 3
  qkv_bias: true
  encoder_stride: 16
# Decoder settings (model_type: gpt2), with cross-attention enabled.
# These keys must be nested under `decoder:`. Left at column 0 they make
# `decoder` null and collide with same-named keys of other sections
# (initializer_range, bos_token_id, eos_token_id).
decoder:
  add_cross_attention: true
  vocab_size: 42001
  attn_pdrop: 0.1
  # NOTE(review): bos and eos share id 5 here, while the generation section
  # of this file uses bos 0 / eos 2 — confirm which ids the tokenizer defines.
  bos_token_id: 5
  embd_pdrop: 0.1
  eos_token_id: 5
  gradient_checkpointing: false
  initializer_range: 0.02
  layer_norm_epsilon: 1.0e-05
  model_type: gpt2
  n_ctx: 1024
  n_embd: 768  # same width as the encoder's hidden_size (768)
  n_head: 12
  # NOTE(review): null presumably defers to the consumer's default inner
  # (feed-forward) size — confirm.
  n_inner: null
  n_layer: 12
  n_positions: 1024
  resid_pdrop: 0.1
  # NOTE(review): a boolean here looks unusual if the consumer expects an
  # activation name or null — confirm.
  summary_activation: false
  summary_first_dropout: 0.1
  use_cache: true
# Text-generation settings: 4-beam search with early stopping, at most 24 new
# tokens, and no repeated 3-grams.
# These keys must be nested under `generation:`. Left at column 0 they make
# `generation` null, and bos_token_id / eos_token_id duplicate the decoder's
# keys, so a last-wins parser silently replaces the decoder's values.
generation:
  # NOTE(review): these token ids (bos 0, eos 2, pad 1) differ from the
  # decoder section's bos/eos of 5 — confirm against the tokenizer.
  bos_token_id: 0
  decoder_start_token_id: 0
  early_stopping: true
  eos_token_id: 2
  length_penalty: 2.0
  max_new_tokens: 24
  no_repeat_ngram_size: 3
  num_beams: 4
  pad_token_id: 1