# Training config dump: in-context-learning (ICL) on ImageNet-1K with a LLaVA-v1 backbone.
# ImageNet-1K ICL training dataset (train900_pairs split).
IMAGENET1K_TRAIN = dict(
    type='ImageNet1kDatasetTrain',
    filename=
    '/mnt/lustre/share_data/taiyan/dataset/imagenet1k/train900_pairs.jsonl',
    image_folder='/mnt/lustre/share_data/taiyan/dataset/ImageNet-1K',
    template_file=
    '/mnt/cache/taiyan/unify_mllm/config/_base_/dataset/template/ICL.json')
# Same dataset registered as the default ImageNet-1K training variant.
DEFAULT_TRAIN_IMAGENET1K_VARIANT = dict(
    imagenet1k_train=dict(
        type='ImageNet1kDatasetTrain',
        filename=
        '/mnt/lustre/share_data/taiyan/dataset/imagenet1k/train900_pairs.jsonl',
        image_folder='/mnt/lustre/share_data/taiyan/dataset/ImageNet-1K',
        template_file=
        '/mnt/cache/taiyan/unify_mllm/config/_base_/dataset/template/ICL.json')
)
# Model: LLaVA v1 with a CLIP ViT-L/14 vision tower and a pretrained LLaVA checkpoint.
model_args = dict(
    type='llava',
    version='v1',
    cache_dir=None,
    model_name_or_path=
    '/mnt/lustre/share_data/xiechi/misc/to_weichen/llava_pretrain_final19/checkpoint-44000/',
    vision_tower=
    '/mnt/lustre/share_data/chenkeqin/VG/ckpt/openai/clip-vit-large-patch14',
    pretrain_mm_mlp_adapter=None,
    mm_vision_select_layer=-2,
    model_max_length=30000,
    freeze_backbone=False,
    tune_mm_mlp_adapter=False,
    freeze_mm_mlp_adapter=False,
    freeze_mm_projector=False,
    is_multimodal=True,
    sep_image_conv_front=False,
    image_token_len=256,
    mm_use_im_start_end=True,
    target_processor=dict(boxes=dict(type='PlainBoxFormatter')),
    process_func_args=dict(
        conv=dict(type='LLavaConvProcessV1'),
        target=dict(type='BoxFormatProcess'),
        text=dict(type='LlavaTextProcessV2'),
        image=dict(type='LlavaImageProcessorV1')),
    conv_args=dict(
        conv_template=[
            'hypnotized_v1.0', 'hypnotized_v1.1', 'hypnotized_ans_v1.0',
            'vicuna_v1.1', 'causal_v1.0', 'final_v1.0'
        ],
        transforms=dict(type='Expand2square'),
        tokenize_kwargs=dict(truncation_size=2048)),
    gen_kwargs_set_pad_token_id=True,
    gen_kwargs_set_bos_token_id=True,
    gen_kwargs_set_eos_token_id=True)
# HuggingFace Trainer-style arguments: FSDP full-shard, fp16, cosine LR schedule.
training_args = dict(
    output_dir='/mnt/cache/taiyan/unify_mllm/checkpoints/2way_weight',
    overwrite_output_dir=True,
    report_to='none',
    seed=42,
    remove_unused_columns=False,
    do_train=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=50,
    learning_rate=2e-05,
    lr_scheduler_type='cosine',
    weight_decay=0.0,
    warmup_ratio=0.03,
    evaluation_strategy='no',
    tf32=False,
    bf16=False,
    gradient_checkpointing=True,
    fsdp='full_shard auto_wrap',
    fsdp_transformer_layer_cls_to_wrap='LlamaDecoderLayer',
    logging_steps=10,
    save_strategy='steps',
    save_steps=500,
    do_eval=False,
    do_predict=False,
    predict_with_generate=True,
    per_device_eval_batch_size=8,
    dataloader_num_workers=4,
    fp16=True)
# Data pipeline: ICL training (8 shots) on ImageNet-1K with the 2-way weight policy.
data_args = dict(
    train=dict(
        type='ImageNet1kDatasetTrain',
        filename=
        '/mnt/lustre/share_data/taiyan/dataset/imagenet1k/train900_pairs.jsonl',
        image_folder='/mnt/lustre/share_data/taiyan/dataset/ImageNet-1K',
        template_file=
        '/mnt/cache/taiyan/unify_mllm/config/_base_/dataset/template/ICL.json',
        policy='policy_2way_weight'),
    validation=None,
    test=None,
    compute_metric=None,
    collator_kwargs=dict(padding=True, max_length=1024),
    gen_kwargs=dict(max_new_tokens=1024, num_beams=1),
    use_icl=True,
    shot=8)
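
# Usage sketch (assumption: this dict-style config is consumed by an
# mmengine-style loader; the actual entry point in unify_mllm may differ,
# and the config path below is hypothetical). Kept as comments so the
# config namespace is not polluted when the file is executed:
#
#     from mmengine.config import Config
#     cfg = Config.fromfile('config/icl_2way_weight.py')
#     print(cfg.data_args['train']['policy'])   # 'policy_2way_weight'
#     print(cfg.training_args['output_dir'])    # checkpoint directory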