IMAGENET1K_TRAIN = dict( type='ImageNet1kDatasetTrain', filename= '/mnt/lustre/share_data/taiyan/dataset/imagenet1k/train900_pairs.jsonl', image_folder='/mnt/lustre/share_data/taiyan/dataset/ImageNet-1K', template_file= '/mnt/cache/taiyan/unify_mllm/config/_base_/dataset/template/ICL.json') DEFAULT_TRAIN_IMAGENET1K_VARIANT = dict( imagenet1k_train=dict( type='ImageNet1kDatasetTrain', filename= '/mnt/lustre/share_data/taiyan/dataset/imagenet1k/train900_pairs.jsonl', image_folder='/mnt/lustre/share_data/taiyan/dataset/ImageNet-1K', template_file= '/mnt/cache/taiyan/unify_mllm/config/_base_/dataset/template/ICL.json') ) model_args = dict( type='llava', version='v1', cache_dir=None, model_name_or_path= '/mnt/lustre/share_data/xiechi/misc/to_weichen/llava_pretrain_final19/checkpoint-44000/', vision_tower= '/mnt/lustre/share_data/chenkeqin/VG/ckpt/openai/clip-vit-large-patch14', pretrain_mm_mlp_adapter=None, mm_vision_select_layer=-2, model_max_length=30000, freeze_backbone=False, tune_mm_mlp_adapter=False, freeze_mm_mlp_adapter=False, freeze_mm_projector=False, is_multimodal=True, sep_image_conv_front=False, image_token_len=256, mm_use_im_start_end=True, target_processor=dict(boxes=dict(type='PlainBoxFormatter')), process_func_args=dict( conv=dict(type='LLavaConvProcessV1'), target=dict(type='BoxFormatProcess'), text=dict(type='LlavaTextProcessV2'), image=dict(type='LlavaImageProcessorV1')), conv_args=dict( conv_template=[ 'hypnotized_v1.0', 'hypnotized_v1.1', 'hypnotized_ans_v1.0', 'vicuna_v1.1', 'causal_v1.0', 'final_v1.0' ], transforms=dict(type='Expand2square'), tokenize_kwargs=dict(truncation_size=2048)), gen_kwargs_set_pad_token_id=True, gen_kwargs_set_bos_token_id=True, gen_kwargs_set_eos_token_id=True) training_args = dict( output_dir='/mnt/cache/taiyan/unify_mllm/checkpoints/2way_weight', overwrite_output_dir=True, report_to='none', seed=42, remove_unused_columns=False, do_train=True, per_device_train_batch_size=1, gradient_accumulation_steps=1, num_train_epochs=50, learning_rate=2e-05, lr_scheduler_type='cosine', weight_decay=0.0, warmup_ratio=0.03, evaluation_strategy='no', tf32=False, bf16=False, gradient_checkpointing=True, fsdp='full_shard auto_wrap', fsdp_transformer_layer_cls_to_wrap='LlamaDecoderLayer', logging_steps=10, save_strategy='steps', save_steps=500, do_eval=False, do_predict=False, predict_with_generate=True, per_device_eval_batch_size=8, dataloader_num_workers=4, fp16=True) data_args = dict( train=dict( type='ImageNet1kDatasetTrain', filename= '/mnt/lustre/share_data/taiyan/dataset/imagenet1k/train900_pairs.jsonl', image_folder='/mnt/lustre/share_data/taiyan/dataset/ImageNet-1K', template_file= '/mnt/cache/taiyan/unify_mllm/config/_base_/dataset/template/ICL.json', policy='policy_2way_weight'), validation=None, test=None, compute_metric=None, collator_kwargs=dict(padding=True, max_length=1024), gen_kwargs=dict(max_new_tokens=1024, num_beams=1), use_icl=True, shot=8)