# ========= system global ========== #
models:
  nextgpt:
    model_name: NextGPTModel
    agent_name: DeepSpeedAgent

seed: 13
max_length: 512  # max length of the user input prompt
logging_step: 5
num_clip_tokens: 77
gen_emb_dim: 768
pretrained_ckpt_path: ../ckpt/pretrained_ckpt/

# ========= LLM ========== #
vicuna_version: 7b_v0  # [7b_v0, ]

# ========= multimodal encoder ========== #
imagebind_version: huge

# ========= text-to-image alignment tuning ========== #
n_img_tokens: 4
text_emb_to_img_layers: [-1]
num_gen_img_tokens: 4
text_fc_to_img_mode: transformer  # [qformer, transformer]

# ========= text-to-video alignment tuning ========== #
n_video_tokens: 24
text_emb_to_video_layers: [-1]
num_gen_video_tokens: 24
text_fc_to_video_mode: transformer  # [qformer, transformer]

# ========= text-to-audio alignment tuning ========== #
n_audio_tokens: 8
text_emb_to_audio_layers: [-1]
num_gen_audio_tokens: 8
text_fc_to_audio_mode: transformer  # [qformer, transformer]

# ========= image diffusion model ========== #
image_diffusion: runwayml/stable-diffusion-v1-5  # [runwayml/stable-diffusion-v1-5, stabilityai/stable-diffusion-2]

# ========= video diffusion model ========== #
video_diffusion: cerspense/zeroscope_v2_576w

# ========= audio diffusion model ========== #
audio_diffusion: cvssp/audioldm-l-full  # [cvssp/audioldm-l-full, cvssp/audioldm-s-full-v2]
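
# ========= usage note (illustrative only) ========== #
# This file is plain YAML, so any YAML loader can read it into a nested dict.
# A minimal sketch, assuming PyYAML is installed; the file path below is a
# hypothetical example, not a path confirmed by this repo:
#
#   import yaml
#
#   with open("config/base.yaml", "r") as f:          # hypothetical path
#       cfg = yaml.safe_load(f)                       # parse YAML into a dict
#
#   print(cfg["models"]["nextgpt"]["model_name"])     # -> "NextGPTModel"
#   print(cfg["image_diffusion"])                     # -> "runwayml/stable-diffusion-v1-5"
#
# Keys such as vicuna_version, text_fc_to_*_mode, and the *_diffusion entries
# accept only the bracketed options listed in the inline comments above.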