# Source: Hugging Face — Fudan-FUXI / VIDGEN-v1.0, transformer/config.yaml
# (Diffusers repo; scrape residue preserved: "Fudan-FUXI's picture",
#  "Upload 2 files", commit 69516ef verified)
---
# Transformer (denoiser) architecture and conditioning options.
model:
  type: PixArtVideo_XL_1x2x2
  space_scale: 0.5
  time_scale: 1.0
  mlp_type: "llama"
  # enable_rope: True
  # NOTE(review): "spaltial" looks like a typo for "spatial", but the key name
  # must match what the model-building code reads — left unchanged.
  position_embed_spaltial: "absolute"
  position_embed_temporal: "rope"
  norm_type: "llamarmsnorm"
  in_channels: 8  # to be consistent with videovae
  temp_window_size: [-1, 8, 8]  # window attn for temporal-attn; -1 presumably means "unwindowed" along that axis — confirm against code
  adain_with_text: true
  qk_norm: false
  prob_text_condition: 1.0
  prob_img_condition: 0
  prob_img_condition_attn: 0
  class_dropout_prob: 0.1
  grad_checkpointing: true
  enable_frames_embedder: false
  enable_tgt_size_embedder: false
  clip_image_encoder: "pretrain_models/openai/clip-vit-large-patch14"
# Video VAE used to encode/decode latents (z-dim must match model.in_channels).
vae:
  # NOTE(review): "Causual" looks like a typo for "Causal", but the string must
  # match the registered class/type name in the loader — left unchanged.
  type: "CausualVAEVideo"
  # z=8
  config: "configs/vae_config.yaml"
  # NOTE(review): other paths in this file use "pretrain_models" (plural) —
  # confirm which directory actually exists on disk.
  from_pretrained: "./pretrain_model/vidgen/vae/vae_pytorch_model.bin"
# Text encoder for prompt conditioning.
text_encoder:
  type: "t5"
  from_pretrained: "pretrain_models/"
  model_max_length: 200  # max prompt tokens
  shardformer: true  # presumably shards the encoder (e.g. ColossalAI Shardformer) — confirm against loader code
# Diffusion process / training objective settings.
diffusion:
  type: "IDDPM"
  snr: false  # SNR-based loss weighting disabled
  train_sampling_steps: 1000
  prob_self_condition: 0
  v_predict: false  # epsilon-prediction rather than v-prediction
# Optimizer hyperparameters.
optimizer:
  # NOTE(review): under a strict YAML 1.1 resolver (e.g. plain PyYAML) "1e-4"
  # parses as a *string*, not a float ("1.0e-4" would be a float). Most config
  # loaders (OmegaConf etc.) coerce it — confirm the consumer does; values left
  # unchanged to avoid altering the parsed type it may rely on.
  learning_rate: 1e-4
  weight_decay: 0
  eps: 1e-8
  min_lr_ratio: 0.95
  gradient_clip: 1.0
# --- top-level data / runtime settings ---
num_frames_video: 17  # base frames of one video slice
num_slice_for_long_video: -1  # how many 2s slices the long video is split into; -1 denotes dynamic
resolution_video: -1  # -1 presumably means "native/dynamic resolution" — confirm against dataloader
resolution_image: -1
mode_various_resolution: false
precision: "bf16"
seed: 42
workers: 4  # dataloader worker processes
# NOTE(review): model.grad_checkpointing above is enabled while this top-level
# grad_checkpoint is disabled — confirm which key the trainer actually reads.
grad_checkpoint: false
gradient_accumulation_steps: 4
logging_steps: 10