exp_root_dir: "outputs"
name: "image-to-shape-diffusion/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6"
tag: "${rmspace:${system.shape_model_type}+n${data.n_samples}+noise${data.noise_sigma}+pfeat${system.shape_model.point_feats}+normemb${system.condition_model.normalize_embeds}+lr${system.optimizer.args.lr}+qkvbias${system.shape_model.qkv_bias}+nfreq${system.shape_model.num_freqs}+ln_post${system.shape_model.use_ln_post},_}"
seed: 0

data_type: "objaverse-datamodule"
data:
  root_dir: "data/objaverse_clean/cap3d_high_quality_170k_images"
  data_type: "occupancy"
  n_samples: 4096
  noise_sigma: 0.
  load_supervision: False
  supervision_type: "occupancy"
  n_supervision: 4096
  load_image: True                  # whether to load images
  image_data_path: data/objaverse_clean/raw_data/images/cap3d_high_quality_170k
  image_type: "mvrgb"               # rgb, normal, mvrgb, mvnormal
  idx: [0, 4, 8, 12, 16]
  n_views: 4
  load_caption: False               # whether to load captions
  rotate_points: False
  batch_size: 32
  num_workers: 16

system_type: "shape-diffusion-system"
system:
  val_samples_json: "val_data/mv_images/val_samples_rgb_mvimage.json"
  z_scale_factor: 1.0
  guidance_scale: 7.5
  num_inference_steps: 50
  eta: 0.0

  shape_model_type: "michelangelo-autoencoder"
  shape_model:
    # pretrained_model_name_or_path: ./ckpts/3DNativeGeneration/michelangelo-aligned-autoencoder-l256-e64-ne8-nd16.ckpt
    pretrained_model_name_or_path: "./outputs/image-to-shape-diffusion_bak/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6/michelangelo-autoencoder+n4096+noise0.0+pfeat3+normembFalse+lr5e-05+qkvbiasFalse+nfreq8+ln_postTrue/ckpts/last.ckpt"
    num_latents: 256
    embed_dim: 64
    point_feats: 3                  # xyz + normal
    out_dim: 1                      # only occupancy
    num_freqs: 8
    include_pi: false
    heads: 12
    width: 768
    num_encoder_layers: 8
    num_decoder_layers: 16
    use_ln_post: true
    init_scale: 0.25
    qkv_bias: false
    use_flash: true
    use_checkpoint: true

  condition_model_type: "clip-embedder"
  condition_model:
    pretrained_model_name_or_path: "./ckpts/pretrained_weights/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/8d052a0f05efbaefbc9e8786ba291cfdf93e5bff"
    encode_camera: true
    camera_embeds_dim: 32           # 16 * 2 [sin, cos]
    n_views: ${data.n_views}
    empty_embeds_ratio: 0.1
    normalize_embeds: false
    # zero_uncond_embeds: true
    zero_uncond_embeds: false

  denoiser_model_type: "simple-denoiser"
  denoiser_model:
    # pretrained_model_name_or_path: "./ckpts/CraftsMan/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6.pth"
    pretrained_model_name_or_path: "./ckpts/CraftsMan/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6-It500000.pth"
    input_channels: ${system.shape_model.embed_dim}
    output_channels: ${system.shape_model.embed_dim}
    n_ctx: ${system.shape_model.num_latents}
    width: 768
    layers: 6                       # 2 * 6 + 1 = 13
    heads: 12
    context_dim: 1024
    init_scale: 1.0
    skip_ln: true
    use_checkpoint: true

  noise_scheduler_type: "diffusers.schedulers.DDPMScheduler"
  noise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    variance_type: "fixed_small"
    clip_sample: false

  denoise_scheduler_type: "diffusers.schedulers.DDIMScheduler"
  denoise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false              # clip samples to [-1, 1]
    set_alpha_to_one: false
    steps_offset: 1

  loggers:
    wandb:
      enable: false
      project: "CraftsMan"
      name: image-to-shape-diffusion+${name}+${tag}

  loss:
    loss_type: "mse"
    lambda_diffusion: 1.
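
  # Optimization setup, as implied by the two blocks below: AdamW at a base LR
  # of 5e-5, with a LinearLR warmup ramping from 1e-6x to 1.0x of the base LR
  # over the first 5000 steps; SequentialLR then switches (milestones: [5000])
  # to a CosineAnnealingLR that decays the LR toward 0 over the next 5000
  # steps (T_max: 5000). Both sub-schedulers step per optimizer step
  # (interval: step).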
  optimizer:
    name: AdamW
    args:
      lr: 5.e-5
      betas: [0.9, 0.99]
      eps: 1.e-6

  scheduler:
    name: SequentialLR
    interval: step
    schedulers:
      - name: LinearLR
        interval: step
        args:
          start_factor: 1e-6
          end_factor: 1.0
          total_iters: 5000
      - name: CosineAnnealingLR
        interval: step
        args:
          T_max: 5000
          eta_min: 0.
    milestones: [5000]

trainer:
  num_nodes: 1
  max_epochs: 100000
  log_every_n_steps: 5
  num_sanity_val_steps: 1
  check_val_every_n_epoch: 3
  enable_progress_bar: true
  precision: 16-mixed
  strategy: 'ddp_find_unused_parameters_true'

checkpoint:
  save_last: true
  save_top_k: -1
  every_n_train_steps: 5000
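
# Usage sketch (hypothetical command; the actual entry point and flags are
# whatever the repository's README documents, they are not part of this file):
#   python launch.py --config path/to/this-config.yaml --train --gpu 0
# With save_top_k: -1 and every_n_train_steps: 5000, every 5000-step checkpoint
# is kept alongside last.ckpt; outputs land under ${exp_root_dir}/${name}/${tag}.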