File size: 3,770 Bytes
2cdb96e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tyro
from dataclasses import dataclass
from typing import Tuple, Literal, Dict, Optional
@dataclass
class Options:
### model
# Unet image input size
input_size: int = 256
# Unet definition
down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024, 1024)
down_attention: Tuple[bool, ...] = (False, False, False, True, True, True)
mid_attention: bool = True
up_channels: Tuple[int, ...] = (1024, 1024, 512, 256)
up_attention: Tuple[bool, ...] = (True, True, True, False)
# Unet output size, dependent on the input_size and U-Net structure!
splat_size: int = 64
# gaussian render size
output_size: int = 256
### dataset
# data mode (only support s3 now)
data_mode: str = '4d'
# fovy of the dataset
fovy: float = 49.1
# camera near plane
znear: float = 0.5
# camera far plane
zfar: float = 2.5
# number of all views (input + output)
num_views: int = 12
# number of views
num_input_views: int = 4
# camera radius
cam_radius: float = 1.5 # to better use [-1, 1]^3 space
# num workers
num_workers: int = 16
datalist: str=''
### training
# workspace
workspace: str = './workspace'
# resume
resume: Optional[str] = None
# batch size (per-GPU)
batch_size: int = 8
# gradient accumulation
gradient_accumulation_steps: int = 1
# training epochs
num_epochs: int = 30
# lpips loss weight
lambda_lpips: float = 1.0
# gradient clip
gradient_clip: float = 1.0
# mixed precision
mixed_precision: str = 'bf16'
# learning rate
lr: float = 4e-4
# augmentation prob for grid distortion
prob_grid_distortion: float = 0.5
# augmentation prob for camera jitter
prob_cam_jitter: float = 0.5
# number of gaussians per pixel
gaussian_perpixel: int = 1
### testing
# test image path
test_path: Optional[str] = None
### misc
# nvdiffrast backend setting
force_cuda_rast: bool = False
# render fancy video with gaussian scaling effect
fancy_video: bool = False
# 4D
num_frames: int = 8
use_temp_attn: bool = True
shuffle_input: bool = True
# s3
sample_by_anim: bool = True
# interp
interpresume: Optional[str] = None
interpolate_rate: int = 3
# all the default settings
config_defaults: Dict[str, Options] = {}
config_doc: Dict[str, str] = {}
config_doc['lrm'] = 'the default settings for LGM'
config_defaults['lrm'] = Options()
config_doc['big'] = 'big model with higher resolution Gaussians'
config_defaults['big'] = Options(
input_size=256,
up_channels=(1024, 1024, 512, 256, 128), # one more decoder
up_attention=(True, True, True, False, False),
splat_size=128,
output_size=512, # render & supervise Gaussians at a higher resolution.
batch_size=1,
num_views=8,
gradient_accumulation_steps=1,
mixed_precision='bf16',
resume='pretrained/model_fp16_fixrot.safetensors',
)
AllConfigs = tyro.extras.subcommand_type_from_defaults(config_defaults, config_doc)
|