# Source: Hugging Face Space MykolaL/evp (running on A10G)
# File: evp/stable-diffusion/configs/latent-diffusion/cin-ldm-vq-f8.yaml
# Last commit: "init" by nick_93 (2a7f680, 11 months ago)
# Size: 2.36 kB

	model:
	base_learning_rate: 1.0e-06
	target: ldm.models.diffusion.ddpm.LatentDiffusion
	params:
	linear_start: 0.0015
	linear_end: 0.0195
	num_timesteps_cond: 1
	log_every_t: 200
	timesteps: 1000
	first_stage_key: image
	cond_stage_key: class_label
	image_size: 32
	channels: 4
	cond_stage_trainable: true
	conditioning_key: crossattn
	monitor: val/loss_simple_ema
	unet_config:
	target: ldm.modules.diffusionmodules.openaimodel.UNetModel
	params:
	image_size: 32
	in_channels: 4
	out_channels: 4
	model_channels: 256
	attention_resolutions:
	#note: this isn\t actually the resolution but
	# the downsampling factor, i.e. this corresnponds to
	# attention on spatial resolution 8,16,32, as the
	# spatial reolution of the latents is 32 for f8
	- 4
	- 2
	- 1
	num_res_blocks: 2
	channel_mult:
	- 1
	- 2
	- 4
	num_head_channels: 32
	use_spatial_transformer: true
	transformer_depth: 1
	context_dim: 512
	first_stage_config:
	target: ldm.models.autoencoder.VQModelInterface
	params:
	embed_dim: 4
	n_embed: 16384
	ckpt_path: configs/first_stage_models/vq-f8/model.yaml
	ddconfig:
	double_z: false
	z_channels: 4
	resolution: 256
	in_channels: 3
	out_ch: 3
	ch: 128
	ch_mult:
	- 1
	- 2
	- 2
	- 4
	num_res_blocks: 2
	attn_resolutions:
	- 32
	dropout: 0.0
	lossconfig:
	target: torch.nn.Identity
	cond_stage_config:
	target: ldm.modules.encoders.modules.ClassEmbedder
	params:
	embed_dim: 512
	key: class_label
	data:
	target: main.DataModuleFromConfig
	params:
	batch_size: 64
	num_workers: 12
	wrap: false
	train:
	target: ldm.data.imagenet.ImageNetTrain
	params:
	config:
	size: 256
	validation:
	target: ldm.data.imagenet.ImageNetValidation
	params:
	config:
	size: 256


	lightning:
	callbacks:
	image_logger:
	target: main.ImageLogger
	params:
	batch_frequency: 5000
	max_images: 8
	increase_log_steps: False

	trainer:
	benchmark: True