model:
  arch: shikra
  version: 'v1'

  # vit encoder
  cache_dir: None
  vit_model: "openai/clip-vit-large-patch14"
  freeze_vit: True

  # finetune config
  freeze_backbone: False
  tune_mm_mlp_adapter: False
  freeze_mm_mlp_adapter: False

  # model config
  mm_vision_select_layer: -2
  model_max_length: 2048

  # data process config
  image_token_len: 256
  mm_use_im_start_end: True

  # training config
  bf16: False
  fp16: True

preprocess:
  vis_processor:
    train:
      name: "clip_image_train"
      proc_type: "openai/clip-vit-large-patch14"
    eval:
      name: "clip_image_eval"
      proc_type: "openai/clip-vit-large-patch14"
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
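# ---------------------------------------------------------------------------
# Loading sketch (assumption: this file is read with a standard YAML loader
# such as PyYAML; the file path below is hypothetical, and the actual repo
# may use its own config system). One parsing quirk worth knowing: YAML 1.1
# loaders treat True/False as booleans, but the bare token `None` above is
# parsed as the STRING "None", not a null -- write `null` if a real null is
# intended.
#
#   import yaml
#   with open("shikra_v1.yaml") as f:   # hypothetical path
#       cfg = yaml.safe_load(f)
#   cfg["model"]["freeze_vit"]          # -> True (bool)
#   cfg["model"]["cache_dir"]           # -> "None" (string, not null)
#   cfg["preprocess"]["vis_processor"]["train"]["name"]  # -> "clip_image_train"
# ---------------------------------------------------------------------------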