vidxtend / config.json

Upload config.json with huggingface_hub

b427ef8 verified 9 months ago

5.93 kB

	{
	"vae": {
	"_class_name": "AutoencoderKL",
	"_diffusers_version": "0.26.1",
	"_name_or_path": "damo-vilab/text-to-video-ms-1.7b",
	"act_fn": "silu",
	"block_out_channels": [
	128,
	256,
	512,
	512
	],
	"down_block_types": [
	"DownEncoderBlock2D",
	"DownEncoderBlock2D",
	"DownEncoderBlock2D",
	"DownEncoderBlock2D"
	],
	"force_upcast": true,
	"in_channels": 3,
	"latent_channels": 4,
	"layers_per_block": 2,
	"norm_num_groups": 32,
	"out_channels": 3,
	"sample_size": 512,
	"scaling_factor": 0.18215,
	"up_block_types": [
	"UpDecoderBlock2D",
	"UpDecoderBlock2D",
	"UpDecoderBlock2D",
	"UpDecoderBlock2D"
	]
	},
	"unet": {
	"_class_name": "UNet3DConditionModel",
	"_diffusers_version": "0.26.1",
	"_name_or_path": "damo-vilab/text-to-video-ms-1.7b",
	"act_fn": "silu",
	"attention_head_dim": 64,
	"block_out_channels": [
	320,
	640,
	1280,
	1280
	],
	"concat": false,
	"cross_attention_dim": 1024,
	"down_block_types": [
	"CrossAttnDownBlock3D",
	"CrossAttnDownBlock3D",
	"CrossAttnDownBlock3D",
	"DownBlock3D"
	],
	"downsample_padding": 1,
	"in_channels": 4,
	"layers_per_block": 2,
	"merging_mode": "attention_cross_attention",
	"mid_block_scale_factor": 1,
	"norm_eps": 0.00001,
	"norm_num_groups": 32,
	"out_channels": 4,
	"sample_size": 32,
	"up_block_types": [
	"UpBlock3D",
	"CrossAttnUpBlock3D",
	"CrossAttnUpBlock3D",
	"CrossAttnUpBlock3D"
	],
	"use_channel_expansion": false,
	"use_fps_conditioning": false,
	"use_image_embedding": true,
	"use_image_tokens": true,
	"use_repeat_context_img": true
	},
	"resampler": {
	"_class_name": "ImageEmbeddingContextResampler",
	"_diffusers_version": "0.26.1",
	"cross_attention_dim": 1024,
	"expansion_factor": 16,
	"inner_dim": 1280
	},
	"controlnet": {
	"_class_name": "ControlNetModel",
	"_diffusers_version": "0.26.1",
	"act_fn": "silu",
	"attention_head_dim": 64,
	"block_out_channels": [
	320,
	640,
	1280,
	1280
	],
	"class_embed_type": null,
	"conditioning_embedding_out_channels": [
	32,
	96,
	256,
	512
	],
	"controlnet_conditioning_channel_order": "rgb",
	"cross_attention_dim": 1024,
	"down_block_types": [
	"CrossAttnDownBlock3D",
	"CrossAttnDownBlock3D",
	"CrossAttnDownBlock3D",
	"DownBlock3D"
	],
	"downsample_controlnet_cond": true,
	"downsample_padding": 1,
	"flip_sin_to_cos": true,
	"frame_expansion": "none",
	"freq_shift": 0,
	"global_pool_conditions": false,
	"in_channels": 4,
	"layers_per_block": 2,
	"merging_mode": "addition",
	"mid_block_scale_factor": 1,
	"norm_eps": 0.00001,
	"norm_num_groups": 32,
	"num_class_embeds": null,
	"num_frames": 8,
	"num_frames_conditioning": 8,
	"num_tranformers": 1,
	"only_cross_attention": false,
	"projection_class_embeddings_input_dim": null,
	"resnet_time_scale_shift": "default",
	"upcast_attention": false,
	"use_controlnet_mask": false,
	"use_image_embedding": false,
	"use_image_encoder_normalization": false,
	"use_image_tokens": false,
	"use_linear_projection": false,
	"use_repeat_context_img": true,
	"zero_conv_mode": "Identity"
	},
	"text_encoder": {
	"_name_or_path": "damo-vilab/text-to-video-ms-1.7b",
	"architectures": [
	"CLIPTextModel"
	],
	"attention_dropout": 0,
	"bos_token_id": 0,
	"dropout": 0,
	"eos_token_id": 2,
	"hidden_act": "gelu",
	"hidden_size": 1024,
	"initializer_factor": 1,
	"initializer_range": 0.02,
	"intermediate_size": 4096,
	"layer_norm_eps": 0.00001,
	"max_position_embeddings": 77,
	"model_type": "clip_text_model",
	"num_attention_heads": 16,
	"num_hidden_layers": 23,
	"pad_token_id": 1,
	"projection_dim": 512,
	"torch_dtype": "float32",
	"transformers_version": "4.39.0",
	"vocab_size": 49408
	},
	"tokenizer": {
	"model": "ali-vilab/text-to-video-ms-1.7b",
	"subfolder": "tokenizer"
	},
	"scheduler": {
	"_class_name": "DDIMScheduler",
	"_diffusers_version": "0.26.1",
	"beta_end": 0.012,
	"beta_schedule": "scaled_linear",
	"beta_start": 0.00085,
	"clip_sample": false,
	"clip_sample_range": 1,
	"dynamic_thresholding_ratio": 0.995,
	"num_train_timesteps": 1000,
	"prediction_type": "epsilon",
	"rescale_betas_zero_snr": false,
	"sample_max_value": 1,
	"set_alpha_to_one": false,
	"skip_prk_steps": true,
	"steps_offset": 1,
	"thresholding": false,
	"timestep_spacing": "leading",
	"trained_betas": null
	},
	"num_frames": 16,
	"num_frames_conditioning": 8,
	"temp_attend_on_uncond_include_past": false,
	"temp_attend_on_neighborhood_of_condition_frames": false,
	"temporal_self_attention_mask_included_itself": false,
	"temporal_self_attention_only_on_conditioning": false,
	"spatial_attend_on_condition_frames": false,
	"image_encoder_version": "laion2b_s32b_b79k"
	}