Spaces:
Runtime error
Runtime error
File size: 3,895 Bytes
f239efc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
from tasks.train.instruction_data import *
# ========================= data ==========================
# train_corpus = "videochat2_instruction"
train_corpus = "videochat2_instruction_full"
train_file = "${available_corpus[${train_corpus}]}" # for lazy evaluation
test_file = dict()
test_types = []
num_workers = 8
save_steps=10000
ckpt_steps=1000
stop_key = None
deepspeed=False
highres=None
# ========================= input ==========================
num_frames = 16
num_frames_test = 1
batch_size = 1
gradient_accumulation_steps=16
max_txt_l = 512
max_train_steps=None
pre_text = False
gradient_checkpointing=False
inputs = dict(
image_res=336,
video_input=dict(
num_frames="${num_frames}",
sample_type="rand",
num_frames_test="${num_frames_test}",
sample_type_test="middle",
random_aug=False,
),
max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
batch_size=dict(image="${batch_size}", video="${batch_size}"),
batch_size_test=dict(image="${batch_size}", video="${batch_size}"),
)
model = dict(
repo_id="llava-hf/llava-1.5-7b-hf",
pretrained_path=None,
load_from_origin=False,
origin_vision="",
origin_llm="",
vision_encoder=dict(
name="vit_l14", # somehow need this to tell the dataset the mean std of pretrained model
),
torch_dtype='bfloat16',
freeze_projector=False,
freeze_lm=True,
freeze_vision_tower=True,
lora_target_modules=["q_proj", "v_proj"], # for llama/mistral/gemma
use_lora=True,
lora_r=128,
lora_alpha=32,
lora_dropout=0.05,
num_frames="${num_frames}",
pooling_method='avg',
use_pooling=True,
frame_shape=(24,24),
pooling_shape=(16,8,8),
)
preprocess = dict(
system="",
mm_alone=True,
image_token_index=64002,
random_shuffle=True,
add_second_msg=True,
roles=['<|im_start|>user\n', '<|im_start|>assistant\n'],
end_signal=('<|im_end|>\n', '<|im_end|>\n'),
begin_signal='',
dataset_image_placeholder='<Image></Image>',
dataset_video_placeholder='<Video></Video>',
max_txt_l = "${max_txt_l}",
ignore_index=-100, # same as torch softmax ignore index
center_pad=False,
longest_edge=762,
shortest_edge=336,
clip_transform=False,
num_frames="${num_frames}",
)
optimizer = dict(
opt="adamW",
lr=2e-5,
opt_betas=[0.9, 0.999], # default
weight_decay=0.02,
max_grad_norm=-1, # requires a positive float, use -1 to disable
# use a different lr for some modules, e.g., larger lr for new modules
different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
# scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.25, warmup_epochs=0.6)
# scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.25, warmup_epochs=0.6)
scheduler = dict(
is_videochat2_custom=False,
sched="cosine",
epochs=2,
warmup_ratio=0.2,
min_lr_multi=0.25)
evaluate = False
deep_fusion = False
evaluation = dict(
eval_frame_ensemble="concat", # [concat, max, mean, lse]
eval_x_only=False,
k_test=128,
eval_offload=True, # offload gpu tensors to cpu to save memory.
)
fp16 = True
gradient_checkpointing = True
# ========================= wandb ==========================
wandb = dict(
enable=False,
entity="user", # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
project="videochat2", # setup in your command line
)
dist_url = "env://"
device = "cuda"
mode = "it"
# ========================= others ==========================
output_dir = None # output dir
resume = False # if True, load optimizer and scheduler states as well
debug = False
log_freq = 5
metric_window_size=10 # window size for metric
seed = 42
report_to='tensorboard'
save_latest = True
auto_resume = True
pretrained_path = "" # path to pretrained model weights, for resume only?
|