# Scalar settings consumed by the flag strings and the torchrun launch of
# scripts/vit_triplane_diffusion_sample_objaverse.py further down this script.

# Trace every command so the fully expanded launch line shows up in the output.
set -x

# Forwarded as --lpips_lambda.
lpips_lambda=0.8

# Forwarded as --image_size and --image_size_encoder respectively.
image_size=128
image_size_encoder=256

# Forwarded as --patch_size.
patch_size=14

# Forwarded as --batch_size; --microbatch is kept equal to it.
batch_size=4
microbatch=${batch_size}

# Shell arithmetic is integer-only: 50 / 4 truncates to 12, so
# num_samples * batch_size is 48 rather than 50 with the values above.
num_samples=$((50 / batch_size))

# Forwarded as --cfg_dropout_prob.
cfg_dropout_prob=0.1

# Forwarded as --unconditional_guidance_scale and embedded in the log path.
unconditional_guidance_scale=6.5

# Forwarded as --num_workers.
num_workers=0

# Dataset locations. Both directory options carry the literal string "NONE";
# the shard list files are supplied via --shards_lst / --eval_shards_lst.
eval_data_dir="NONE"
data_dir="NONE"

shards_lst=/cpfs01/user/lanyushi.p/Repo/diffusion-3d/shell_scripts/baselines/reconstruction/sr/final_mv/diff_shards_lst_ani.txt
eval_shards_lst="/cpfs01/user/lanyushi.p/Repo/diffusion-3d/shell_scripts/baselines/reconstruction/sr/final_mv/shards_animals_lst.txt"

# Flat option string: the launch command expands it unquoted and relies on
# word splitting, so no value in here may contain whitespace.
# Backslash-newline inside double quotes is a line continuation, so the
# resulting string holds the options on a single line.
DATASET_FLAGS="
  --data_dir ${data_dir} \
  --eval_shards_lst ${eval_shards_lst} \
  --shards_lst ${shards_lst} \
"

# Learning rates and loss weights.
lr=2e-5
kl_lambda=0
vit_lr=1e-5    # used for both --encoder_lr and --vit_decoder_lr
ce_lambda=0.5  # passed as --ce_lambda and --negative_entropy_lambda at launch
conv_lr=5e-5   # used for both --triplane_decoder_lr and --super_resolution_lr
alpha_lambda=1
scale_clip_encoding=1

# Forwarded as --triplane_scaling_divider.
triplane_scaling_divider=0.88

# Text prompt forwarded as --prompt (quoted at the call site because it
# contains spaces).
prompt="A sailboat with mast."

# Flat option string, expanded unquoted by the launch command.
# NOTE(review): "--bg_lamdba" is kept exactly as spelled; it has to match the
# option name declared by the Python entry point -- confirm before renaming.
LR_FLAGS="--encoder_lr $vit_lr \
  --vit_decoder_lr $vit_lr \
  --lpips_lambda $lpips_lambda \
  --triplane_decoder_lr $conv_lr \
  --super_resolution_lr $conv_lr \
  --lr $lr \
  --kl_lambda ${kl_lambda} \
  --bg_lamdba 0.01 \
  --alpha_lambda ${alpha_lambda} \
"

# Model/trainer options as a flat option string (expanded unquoted by the
# launch command, so values must not contain whitespace).
# --iterations and --save_interval are given again on the launch command line
# after this string is expanded.
# The checkpoint path is machine-specific; adjust it for your environment.
TRAIN_FLAGS="--iterations 10001 --anneal_lr False \
  --batch_size $batch_size --save_interval 10000 \
  --microbatch ${microbatch} \
  --image_size_encoder $image_size_encoder \
  --image_size $image_size \
  --dino_version mv-sd-dit \
  --sr_training False \
  --encoder_cls_token False \
  --decoder_cls_token False \
  --cls_token False \
  --weight_decay 0.05 \
  --no_dim_up_mlp True \
  --uvit_skip_encoder True \
  --decoder_load_pretrained False \
  --fg_mse False \
  --vae_p 2 \
  --plucker_embedding True \
  --encoder_in_channels 9 \
  --arch_dit_decoder DiT2-B/2 \
  --sd_E_ch 64 \
  --sd_E_num_res_blocks 1 \
  --lrm_decoder False \
  --resume_checkpoint /home/yslan/Repo/open-source/data/model_joint_denoise_rec_model2310000.pt \
"

# Denoiser architecture options as a flat option string (expanded unquoted by
# the launch command).
# The attention resolutions are written without inner double quotes: inside an
# already double-quoted string those quotes would close and reopen the string
# rather than quote the value. The resulting text is the same: 4,2,1.
DDPM_MODEL_FLAGS="
  --learn_sigma False \
  --num_heads 8 \
  --num_res_blocks 2 \
  --num_channels 320 \
  --attention_resolutions 4,2,1 \
  --use_spatial_transformer True \
  --transformer_depth 1 \
  --context_dim 768 \
"

# Diffusion process / trainer options as a flat option string (expanded
# unquoted by the launch command, so values must not contain whitespace).
# "--train_vae False" used to be listed twice with the same value; it is now
# given once.
DIFFUSION_FLAGS="--diffusion_steps 1000 --noise_schedule linear \
  --use_kl False \
  --use_amp False \
  --triplane_scaling_divider ${triplane_scaling_divider} \
  --trainer_name vpsde_crossattn_objv \
  --mixed_prediction False \
  --train_vae False \
  --denoise_in_channels 4 \
  --denoise_out_channels 4 \
  --diffusion_input_size 32 \
  --diffusion_ce_anneal True \
  --create_controlnet False \
  --p_rendering_loss False \
  --pred_type v \
  --predict_v True \
  --create_dit False \
  --use_eos_feature False \
  --roll_out True \
"

# Sampler options as a flat option string (expanded unquoted by the launch
# command). The guidance scale comes from unconditional_guidance_scale.
DDIM_FLAGS="
  --timestep_respacing ddim250 \
  --use_ddim True \
  --unconditional_guidance_scale ${unconditional_guidance_scale} \
"

# Output directory for this run; the guidance scale is part of the path.
logdir=./logs/LSGM/inference/t23d/Objaverse/cfg=${unconditional_guidance_scale}/fixing-DDIM/231w/mast3

# Autoencoder options as a flat option string (expanded unquoted by the launch
# command, so values, including the log directory, must not contain
# whitespace).
SR_TRAIN_FLAGS_v1_2XC="
  --decoder_in_chans 32 \
  --out_chans 96 \
  --ae_classname vit.vit_triplane.RodinSR_256_fusionv6_ConvQuant_liteSR_dinoInit3DAttn_SD_B_3L_C_withrollout_withSD_D_ditDecoder \
  --logdir $logdir \
  --arch_encoder vits \
  --arch_decoder vitb \
  --vit_decoder_wd 0.001 \
  --encoder_weight_decay 0.001 \
  --color_criterion mse \
  --triplane_in_chans 32 \
  --decoder_output_dim 3 \
"

# Variant actually passed to the launch command.
SR_TRAIN_FLAGS="${SR_TRAIN_FLAGS_v1_2XC}"

# Number of processes torchrun starts on this node (--nproc_per_node).
NUM_GPUS=1

# Prepare the log directory: drop the previous "runs" subdirectory, make sure
# the directory exists, and keep a copy of this script next to the outputs.
# ${logdir:?} aborts instead of expanding to "/runs" if logdir is empty/unset.
rm -rf -- "${logdir:?logdir must be set before cleaning it}/runs"
mkdir -p -- "$logdir"/
cp -- "$0" "$logdir"/

# Environment for the launched processes.
export OMP_NUM_THREADS=12
export NCCL_ASYNC_ERROR_HANDLING=1
export OPENCV_IO_ENABLE_OPENEXR=1
export NCCL_IB_GID_INDEX=3

# Only GPU 0 is made visible, matching NUM_GPUS=1 above.
export CUDA_VISIBLE_DEVICES=0

# Launch the sampling script on a single node.
#
# The *_FLAGS variables are flat option strings and are expanded unquoted on
# purpose so that each option/value becomes its own argument; scalar values
# are quoted.
# --iterations and --save_interval (also in TRAIN_FLAGS) and --logdir (also in
# SR_TRAIN_FLAGS) are repeated here, after those strings are expanded.
# NOTE(review): presumably the later occurrence wins in the Python argument
# parser -- confirm against the entry point.
# The last visible line keeps its trailing line continuation as in the
# original script.
# shellcheck disable=SC2086
torchrun --nproc_per_node="$NUM_GPUS" \
  --nnodes 1 \
  --rdzv-endpoint=localhost:24369 \
  scripts/vit_triplane_diffusion_sample_objaverse.py \
  --num_workers "${num_workers}" \
  --eval_data_dir "$eval_data_dir" \
  --depth_lambda 0 \
  ${TRAIN_FLAGS} \
  ${SR_TRAIN_FLAGS} \
  ${DATASET_FLAGS} \
  ${DIFFUSION_FLAGS} \
  ${DDPM_MODEL_FLAGS} \
  ${DDIM_FLAGS} \
  --overfitting False \
  --load_pretrain_encoder False \
  --iterations 5000001 \
  --save_interval 10000 \
  --eval_interval 5000 \
  --decomposed True \
  --logdir "$logdir" \
  --cfg objverse_tuneray_aug_resolution_64_64_auto \
  --patch_size "${patch_size}" \
  --eval_batch_size 1 \
  ${LR_FLAGS} \
  --ce_lambda "${ce_lambda}" \
  --negative_entropy_lambda "${ce_lambda}" \
  --triplane_fg_bg False \
  --grad_clip True \
  --interval 5 \
  --normalize_clip_encoding True \
  --scale_clip_encoding "${scale_clip_encoding}" \
  --objv_dataset True \
  --cfg_dropout_prob "${cfg_dropout_prob}" \
  --cond_key caption \
  --enable_mixing_normal False \
  --use_lmdb_compressed False \
  --use_lmdb False \
  --load_wds_diff True \
  --mv_input True \
  --compile False \
  --prompt "$prompt" \
  --num_samples "${num_samples}" \
  --use_wds False \