File size: 3,895 Bytes
f239efc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from tasks.train.instruction_data import *

# ========================= data ==========================
# Name of the training corpus; it is the key substituted into the
# `train_file` placeholder below.
# Alternative value: "videochat2_instruction"
train_corpus = "videochat2_instruction_full"

# The "${...}" text is an ordinary string at import time; it is written this
# way for lazy evaluation (presumably expanded later by the config loader).
train_file = "${available_corpus[${train_corpus}]}"
test_file = {}
test_types = list()
num_workers = 8
save_steps = 10000
ckpt_steps = 1000
stop_key = None
deepspeed = False
highres = None
# ========================= input ==========================
# Scalar knobs; several of them are referenced from the nested settings
# below through "${name}" placeholders.
num_frames = 16
num_frames_test = 1
batch_size = 1
gradient_accumulation_steps = 16
max_txt_l = 512
max_train_steps = None
pre_text = False
# NOTE: `gradient_checkpointing` is assigned again (to True) further down in
# this file, and that later assignment is the one that takes effect.
gradient_checkpointing = False

# Input pipeline settings. Values of the form "${...}" are plain strings
# here (presumably resolved against the variables above at load time).
inputs = {
    "image_res": 336,
    "video_input": {
        "num_frames": "${num_frames}",
        "sample_type": "rand",
        "num_frames_test": "${num_frames_test}",
        "sample_type_test": "middle",
        "random_aug": False,
    },
    # Per-modality limits: image and video share the same placeholders.
    "max_txt_l": {"image": "${max_txt_l}", "video": "${max_txt_l}"},
    "batch_size": {"image": "${batch_size}", "video": "${batch_size}"},
    "batch_size_test": {"image": "${batch_size}", "video": "${batch_size}"},
}

model = dict(
    repo_id="llava-hf/llava-1.5-7b-hf",
    pretrained_path=None,
    load_from_origin=False,
    origin_vision="",
    origin_llm="",
    vision_encoder=dict(
        name="vit_l14", # somehow need this to tell the dataset the mean std of pretrained model
    ),
    torch_dtype='bfloat16',
    freeze_projector=False,
    freeze_lm=True,
    freeze_vision_tower=True,
    lora_target_modules=["q_proj", "v_proj"], # for llama/mistral/gemma
    use_lora=True,
    lora_r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    num_frames="${num_frames}",
    pooling_method='avg',
    use_pooling=True,
    frame_shape=(24,24),
    pooling_shape=(16,8,8),
)
# Prompt / sample preprocessing settings.
# NOTE(review): the roles below use "<|im_start|>"/"<|im_end|>" markers and
# `image_token_index` is 64002 - confirm both match the tokenizer of the
# checkpoint selected by `model.repo_id`.
preprocess = {
    "system": "",
    "mm_alone": True,
    "image_token_index": 64002,
    "random_shuffle": True,
    "add_second_msg": True,
    # Conversation template pieces: role prefixes and end-of-turn markers.
    "roles": ["<|im_start|>user\n", "<|im_start|>assistant\n"],
    "end_signal": ("<|im_end|>\n", "<|im_end|>\n"),
    "begin_signal": "",
    # Placeholder tags used for image / video positions in the dataset text.
    "dataset_image_placeholder": "<Image></Image>",
    "dataset_video_placeholder": "<Video></Video>",
    "max_txt_l": "${max_txt_l}",
    # -100 equals the default `ignore_index` of torch.nn.CrossEntropyLoss.
    "ignore_index": -100,
    "center_pad": False,
    "longest_edge": 762,
    "shortest_edge": 336,
    "clip_transform": False,
    "num_frames": "${num_frames}",
}


# Optimizer settings.
optimizer = {
    "opt": "adamW",
    "lr": 2e-5,
    "opt_betas": [0.9, 0.999],  # default
    "weight_decay": 0.02,
    # Requires a positive float; use -1 to disable.
    "max_grad_norm": -1,
    # Optional separate lr for some modules, e.g. a larger lr for new
    # modules. Disabled here.
    "different_lr": {"enable": False, "module_names": [], "lr": 1e-3},
}

# Learning-rate schedule settings.
# Earlier setting, kept for reference:
# scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.25, warmup_epochs=0.6)
scheduler = {
    "is_videochat2_custom": False,
    "sched": "cosine",
    "epochs": 2,
    "warmup_ratio": 0.2,
    "min_lr_multi": 0.25,
}

evaluate = False
deep_fusion = False

# Evaluation settings.
evaluation = {
    "eval_frame_ensemble": "concat",  # one of [concat, max, mean, lse]
    "eval_x_only": False,
    "k_test": 128,
    "eval_offload": True,  # offload gpu tensors to cpu to save memory.
}

# NOTE(review): fp16 is switched on here while the model section sets
# torch_dtype to 'bfloat16' - confirm this combination is intended.
fp16 = True
# This overrides the earlier `gradient_checkpointing=False` in the input
# section, so True is the value that takes effect.
gradient_checkpointing = True

# ========================= wandb ==========================
# Weights & Biases logging options (disabled here).
wandb = {
    "enable": False,
    # Username or team name to store the runs,
    # see https://docs.wandb.ai/ref/python/init
    "entity": "user",
    "project": "videochat2",  # setup in your command line
}
dist_url = "env://"
device = "cuda"
mode = "it"

# ========================= others ==========================
# Output dir; left unset (None) in this file.
output_dir = None
# If True, load optimizer and scheduler states as well.
resume = False
debug = False
log_freq = 5
# Window size for metric.
metric_window_size = 10
seed = 42
report_to = "tensorboard"
save_latest = True
auto_resume = True
# Path to pretrained model weights (original author's open question: is it
# used for resume only?).
pretrained_path = ""