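# Config: CraftsMan image-to-shape latent diffusion: a diffusion model
# conditioned on CLIP multi-view RGB embeddings over a pretrained
# Michelangelo-style shape autoencoder (l256-e64-ne8-nd16-nl6 in the name:
# 256 latents x 64 dims, 8 encoder / 16 decoder layers, 6 denoiser layers).
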
exp_root_dir: "outputs"
name: "image-to-shape-diffusion/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6"
tag: "${rmspace:${system.shape_model_type}+n${data.n_samples}+noise${data.noise_sigma}+pfeat${system.shape_model.point_feats}+normemb${system.condition_model.normalize_embeds}+lr${system.optimizer.args.lr}+qkvbias${system.shape_model.qkv_bias}+nfreq${system.shape_model.num_freqs}+ln_post${system.shape_model.use_ln_post},_}"
seed: 0
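
# NOTE (annotation): the `${...}` fields above are OmegaConf-style interpolations
# resolved at load time; `rmspace` is presumably a custom resolver registered by
# the training code that replaces whitespace in the composed tag with the
# character after the last comma (`_` here), keeping the run directory name
# filesystem-safe.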

data_type: "objaverse-datamodule"
data:
  root_dir: "data/objaverse_clean/cap3d_high_quality_170k_images"
  data_type: "occupancy" 
  n_samples: 4096
  noise_sigma: 0.
  
  load_supervision: false
  supervision_type: "occupancy"
  n_supervision: 4096

  load_image: true              # whether to load images
  image_data_path: data/objaverse_clean/raw_data/images/cap3d_high_quality_170k
  image_type: "mvrgb"           # rgb, normal, mvrgb, mvnormal
  idx: [0, 4, 8, 12, 16]
  n_views: 4
  load_caption: false           # whether to load captions
  rotate_points: false

  batch_size: 32
  num_workers: 16
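
  # NOTE (annotation): `image_type: "mvrgb"` selects multi-view RGB conditioning.
  # `idx` lists the candidate rendered-view indices on disk, from which `n_views`
  # views are presumably drawn per sample (5 candidates, 4 used). With
  # `load_supervision: false`, the occupancy supervision branch is skipped,
  # presumably because the frozen autoencoder only needs the input point cloud
  # (`n_samples` points) to produce latents for diffusion training.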

system_type: "shape-diffusion-system"
system:
  val_samples_json: "val_data/mv_images/val_samples_rgb_mvimage.json"
  z_scale_factor: 1.0
  guidance_scale: 7.5
  num_inference_steps: 50
  eta: 0.0
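
  # NOTE (annotation): sampling-time settings. `guidance_scale: 7.5` is the
  # classifier-free guidance weight, `num_inference_steps: 50` the number of
  # DDIM denoising steps, and `eta: 0.0` makes DDIM sampling deterministic
  # (eta > 0 reintroduces stochasticity).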

  shape_model_type: "michelangelo-autoencoder"
  shape_model:
    # pretrained_model_name_or_path: ./ckpts/3DNativeGeneration/michelangelo-aligned-autoencoder-l256-e64-ne8-nd16.ckpt
    pretrained_model_name_or_path: "./outputs/image-to-shape-diffusion_bak/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6/michelangelo-autoencoder+n4096+noise0.0+pfeat3+normembFalse+lr5e-05+qkvbiasFalse+nfreq8+ln_postTrue/ckpts/last.ckpt"
    num_latents: 256
    embed_dim: 64
    point_feats: 3   # extra per-point feature channels (normals) alongside xyz
    out_dim: 1       # occupancy only
    num_freqs: 8
    include_pi: false
    heads: 12
    width: 768
    num_encoder_layers: 8
    num_decoder_layers: 16
    use_ln_post: true
    init_scale: 0.25
    qkv_bias: false
    use_flash: true
    use_checkpoint: true
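
  # NOTE (annotation): the autoencoder is restored from
  # `pretrained_model_name_or_path` above and is presumably frozen during
  # diffusion training. It compresses each point cloud into
  # num_latents x embed_dim = 256 x 64 latent tokens, the space the denoiser
  # operates in. `use_flash` presumably enables FlashAttention and
  # `use_checkpoint` gradient checkpointing to save memory.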

  condition_model_type: "clip-embedder"
  condition_model:
    pretrained_model_name_or_path: "./ckpts/pretrained_weights/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/8d052a0f05efbaefbc9e8786ba291cfdf93e5bff"
    encode_camera: true
    camera_embeds_dim: 32 # 16 frequencies x 2 (sin, cos)
    n_views: ${data.n_views}
    empty_embeds_ratio: 0.1
    normalize_embeds: false
    # zero_uncond_embeds: true
    zero_uncond_embeds: false
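
  # NOTE (annotation): `empty_embeds_ratio: 0.1` presumably drops the image
  # condition for 10% of training samples to enable classifier-free guidance;
  # with `zero_uncond_embeds: false`, the unconditional branch uses an
  # empty-input embedding rather than an all-zeros one. `encode_camera`
  # presumably appends sinusoidal camera-pose embeddings (`camera_embeds_dim`
  # channels) to the CLIP features for each of the `n_views` views.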

  denoiser_model_type: "simple-denoiser"
  denoiser_model:
    # pretrained_model_name_or_path: "./ckpts/CraftsMan/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6.pth"
    pretrained_model_name_or_path: "./ckpts/CraftsMan/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6-It500000.pth"
    input_channels: ${system.shape_model.embed_dim}
    output_channels: ${system.shape_model.embed_dim}
    n_ctx: ${system.shape_model.num_latents}
    width: 768
    layers: 6   # skip-connected stack: 2 * 6 + 1 = 13 transformer blocks
    heads: 12
    context_dim: 1024
    init_scale: 1.0
    skip_ln: true
    use_checkpoint: true
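
  # NOTE (annotation): `context_dim: 1024` matches the hidden-state width of the
  # CLIP ViT-L/14 vision tower above, so the denoiser cross-attends to the raw
  # CLIP image tokens. `n_ctx` and the channel counts are interpolated from the
  # shape model so the denoiser operates directly on its 256 x 64 latent tokens.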

  noise_scheduler_type: "diffusers.schedulers.DDPMScheduler"
  noise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    variance_type: "fixed_small"
    clip_sample: false
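
  # NOTE (annotation): DDPM is used to add noise during training; the
  # scaled_linear schedule with beta 0.00085 -> 0.012 over 1000 steps is the
  # schedule popularized by Stable Diffusion.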

  denoise_scheduler_type: "diffusers.schedulers.DDIMScheduler"
  denoise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false   # if true, clip predicted samples to [-1, 1]
    set_alpha_to_one: false
    steps_offset: 1
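
  # NOTE (annotation): DDIM is the inference-time sampler; its beta schedule
  # must match the training (DDPM) scheduler above, and `num_inference_steps`
  # in the system block subsamples these 1000 training timesteps down to 50.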

  loggers:
    wandb:
      enable: false
      project: "CraftsMan"
      name: image-to-shape-diffusion+${name}+${tag}

  loss:
    loss_type: "mse"
    lambda_diffusion: 1.

  optimizer:
    name: AdamW
    args:
      lr: 5.e-5
      betas: [0.9, 0.99]
      eps: 1.e-6

  scheduler:
    name: SequentialLR
    interval: step
    schedulers:
      - name: LinearLR
        interval: step
        args:
          start_factor: 1e-6
          end_factor: 1.0
          total_iters: 5000
      - name: CosineAnnealingLR
        interval: step
        args:
          T_max: 5000
          eta_min: 0.
    milestones: [5000]
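
  # NOTE (annotation): SequentialLR runs a linear warmup from near zero
  # (start_factor 1e-6) up to the full lr over the first 5000 steps, then
  # switches at the 5000-step milestone to cosine annealing that decays the lr
  # toward 0 over the next T_max = 5000 steps.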

trainer:
  num_nodes: 1
  max_epochs: 100000
  log_every_n_steps: 5
  num_sanity_val_steps: 1
  check_val_every_n_epoch: 3
  enable_progress_bar: true
  precision: 16-mixed
  strategy: 'ddp_find_unused_parameters_true'
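
  # NOTE (annotation): Lightning trainer flags: `precision: 16-mixed` enables
  # fp16 mixed-precision training, and the DDP strategy variant tolerates
  # parameters that receive no gradient (e.g. a frozen shape autoencoder).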

checkpoint:
  save_last: true
  save_top_k: -1
  every_n_train_steps: 5000
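
# NOTE (annotation): `save_top_k: -1` keeps every checkpoint rather than
# pruning; combined with `every_n_train_steps: 5000`, a full checkpoint is
# written every 5000 optimizer steps, plus a rolling `last.ckpt`.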