# MiniGPT-v2 finetuning configuration
model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 1024
  image_size: 448
  end_sym: "</s>"
  llama_model: "/path/to/llama_checkpoint"
  ckpt: "/path/to/pretrained_checkpoint"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16
datasets:
  multitask_conversation:
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 50
  llava_conversation:
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30
  unnatural_instruction:
    batch_size: 1
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  refvg:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 40
  llava_detail:
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 20
  llava_reason:
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80
  flickr_grounded_caption:
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80
  flickr_CaptionToPhrase:
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80
  flickr_ObjectToPhrase:
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80
  coco_caption:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  textcaps_caption:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30
  refcoco:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 25
  refcocop:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 25
  refcocog:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 25
  invrefcoco:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  invrefcocop:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  invrefcocog:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  coco_vqa:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 15
  ok_vqa:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 8
  aok_vqa:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 12
  gqa:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 50
  ocrvqa:
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30
run:
  task: image_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 1e-6
  warmup_lr: 1e-6
  weight_decay: 0.05
  max_epoch: 50
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000
  seed: 42
  output_dir: "/path/to/save_checkpoint"
  amp: True
  resume_ckpt_path: null
  evaluate: False
  train_splits: ["train"]
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  wandb_log: True
  job_name: minigptv2_finetune