# BLIP-Diffusion configuration: subject-driven generation with a
# depth-conditioned ControlNet on top of Stable Diffusion v1.5.
model:
  # Vision backbone feeding the Q-Former (CLIP ViT-L).
  vit_model: "clip_L"

  # Q-Former settings: number of learnable query tokens and how often
  # cross-attention layers are inserted (every layer when freq is 1).
  qformer_num_query_token: 16
  qformer_cross_attention_freq: 1

  # Keep the Stable Diffusion text encoder frozen during training.
  sd_train_text_encoder: false
  # Base Stable Diffusion checkpoint (Hugging Face model id).
  sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"

  # Load the generic pretrained weights below rather than a
  # task-finetuned checkpoint.
  load_finetuned: false
  load_pretrained: true

  # Pretrained BLIP-Diffusion weights (OpenImages-trained archive).
  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"

  # ControlNet checkpoint providing depth-map conditioning.
  controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth"

preprocess:
  vis_processor:
    # Same image processor for train and eval.
    train:
      name: "blip_diffusion_inp_image_eval"
    eval:
      name: "blip_diffusion_inp_image_eval"
  text_processor:
    # Same caption processor for train and eval.
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"