| model: | |
| network: | |
| dim: 512 | |
| num_timesteps: 1000 | |
| depth: 12 | |
| dim_head: 64 | |
| heads: 12 | |
| diffusion: | |
| image_embed_dim: ${model.network.dim} | |
| timesteps: ${model.network.num_timesteps} | |
| cond_drop_prob: 0.2 | |
| image_embed_scale: 1.0 | |
| text_embed_scale: 1.0 | |
| beta_schedule: cosine | |
| predict_x_start: true | |
| data: | |
| bs: 512 | |
| format: webdataset | |
| path: data/webdataset/sg2-ffhq-1024-clip/{00000..99}.tar | |
| embed_noise_scale: 1.0 | |
| sg_pkl: https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan2/versions/1/files/stylegan2-ffhq-1024x1024.pkl | |
| clip_variant: ViT-B/32 | |
| n_latents: 1 | |
| latent_dim: 512 | |
| latent_repeats: | |
| - 18 | |
| val_im_samples: 64 | |
| val_text_samples: data/text/face-val.txt | |
| val_samples_per_text: 4 | |
| wandb_project: clip2latent | |
| wandb_entity: null | |
| name: baseline_noise_1 | |
| device: cuda:0 | |
| train: | |
| znorm_embed: false | |
| znorm_latent: true | |
| max_it: 1000000 | |
| val_it: 10000 | |
| lr: 0.0001 | |
| weight_decay: 0.01 | |
| ema_update_every: 1 | |
| ema_beta: 0.99999 | |