# Image-to-3D checkpoint: ReLi3D / config.yaml
# Header text captured together with this file (not configuration data):
#   sai-dev - "Add ReLi3D model files with updated model card" - 4dc0fef
# Model configuration: every component is declared either as a `cls` +
# `kwargs` list entry or as a `<name>_cls` + `<name>` pair of sibling keys.
#
# NOTE(review): this file arrived with all leading whitespace stripped, which
# made it unparseable. The nesting below is reconstructed from key order, the
# cls/kwargs pairing and duplicate-key constraints. Placements that the text
# alone cannot settle are marked NOTE(review); confirm them against the code
# that consumes this config before relying on them.
system:
  # Entries are listed in file order; each names a class and its kwargs.
  preprocessor:
    - cls: src.models.pre_processor.random_conditioning_selector.RandomViewElementConditioningSelector
      kwargs:
        min_condition_count: 1
        max_condition_count: 4
        training_only: true
    - cls: src.models.pre_processor.camera.LinearCameraEmbedder
      kwargs:
        # presumably 16 (4x4 camera-to-world) + 9 (3x3 intrinsics) values
        # -- TODO confirm
        in_channels: 25
        # Same value as `modulation_cond_dim` used by the image tokenizers.
        out_channels: 1024
        conditions:
          - camera-to-world_cond
          - intrinsics-normed_cond

  tokenizer:
    - cls: src.models.tokenizers.image.DINOV2SingleImageTokenizer
      kwargs:
        pretrained_model_name_or_path: facebook/dinov2-large
        width: 512
        height: 512
        freeze_backbone_params: false
        enable_memory_efficient_attention: true
        enable_gradient_checkpointing: true
        modulation_key: camera-embedding
        modulation_zero_init: true
        modulation_single_layer: true
        modulation_cond_dim: 1024
        is_cross_attention_tokenizer: true
        append_conditioning: true
        image_key: image_bg_cond
    - cls: src.models.tokenizers.triplane.TriplaneLearnablePositionalEmbedding
      kwargs:
        # Same value as `backbone.triplane_resolution`.
        plane_size: 96
        num_channels: 1024
        is_output_tokenizer: true
        is_input_tokenizer: true
        tokenize_key: triplane
        detokenize_key: triplane

  backbone_cls: src.models.transformers.twostream_interleave.TwoStreamInterleaveTransformer
  backbone:
    raw_triplane_channels: 1024
    triplane_channels: 1024
    num_attention_heads: 16
    attention_head_dim: 64
    raw_image_channels: 1024
    num_latents: 1792
    num_blocks: 4
    num_basic_blocks: 3
    dropout: 0.0
    latent_init_std: 0.02
    triplane_attention: false
    triplane_resolution: 96
    triplane_full_attention: true
    gradient_checkpointing: true
    mix_latent: true
    mix_latent_max_tokens: 1298
    num_experts: 8
    start_experts_from_block: 2
  # NOTE(review): placed as a sibling of `backbone`; it could instead be the
  # last key inside `backbone` -- confirm.
  output_key: triplane

  postprocessor:
    - cls: src.models.networks.PixelShuffleUpsampleNetwork
      kwargs:
        in_channels: 1024
        # Same value as `object_representation.triplane_features`.
        out_channels: 40
        scale_factor: 4
        conv_layers: 4

    # This entry nests its own tokenizer / backbone / postprocessor stack.
    - cls: src.models.post_processor.transformer_post_processor.TransformerPostProcessor
      kwargs:
        tokenizer:
          - cls: src.models.tokenizers.multi_input_wrapper.RandomMaskTokenizerWrapper
            kwargs:
              is_cross_attention_tokenizer: true
              image_key: image_cond
              mask_key: opacity_cond
              dropout_prob: 0.5
              tokenizer_cls: src.models.tokenizers.image.DINOV2SingleImageTokenizer
              tokenizer:
                pretrained_model_name_or_path: facebook/dinov2-small
                width: 512
                height: 512
                freeze_backbone_params: false
                enable_memory_efficient_attention: true
                enable_gradient_checkpointing: true
                modulation_key: camera-embedding
                modulation_zero_init: true
                modulation_single_layer: true
                modulation_cond_dim: 1024
                # Must live in this inner mapping: the wrapper kwargs above
                # already define `is_cross_attention_tokenizer`, and a second
                # copy there would be a duplicate key.
                is_cross_attention_tokenizer: true
                append_conditioning: true
                # NOTE(review): the two `extra_input_*` keys are assumed to
                # belong to the wrapped tokenizer; they could instead be
                # wrapper kwargs -- confirm.
                extra_input_key: opacity_cond
                extra_input_dim: 1
          - cls: src.models.tokenizers.vector_proj.LearnableTokenBank
            kwargs:
              tokenize_key: token_bank
              is_input_tokenizer: true
              token_count: 78
              # Same value as the nested backbone's `in_channels`.
              token_dim: 256
              transpose: true
          - cls: src.models.tokenizers.triplane.SimpleTriplaneTokenizer
            kwargs:
              is_cross_attention_tokenizer: true
              input_dimension: 1024
              # Same value as the nested backbone's `cross_attention_dim`.
              output_dimension: 384
        # NOTE(review): the two strategy keys are assumed to configure the
        # post-processor itself; they could instead be the last kwargs of
        # SimpleTriplaneTokenizer -- confirm.
        input_strategy: token_concat
        cross_attention_strategy: token_concat
        backbone_cls: src.models.transformers.transformer_1d.Transformer1D
        backbone:
          in_channels: 256
          out_channels: 1
          norm_num_groups: 16
          num_attention_heads: 16
          attention_head_dim: 64
          cross_attention_dim: 384
          num_layers: 4
          norm_type: layer_norm
          enable_memory_efficient_attention: true
          gradient_checkpointing: true
        # NOTE(review): placed as a sibling of `backbone` (mirrors the
        # system-level `output_key`); could be a backbone key -- confirm.
        # The same name is read back by `unpack_key` below.
        output_key: token_output
        postprocessor:
          - cls: src.models.post_processor.latent_unpacker.LatentUnpacker
            kwargs:
              # `keys`, `shapes` and `out_bias` each have three entries,
              # in matching order.
              keys:
                - reni-latent
                - illumination-strength
                - illumination-rotation_repr
              unpack_key: token_output
              unpack_shape:
                - -1
              # These are strings, not lists (the source has no brackets).
              # Quoted to make that explicit; the parsed values are unchanged.
              shapes:
                - '49, 3'
                - '1,'
                - '6,'
              out_bias:
                - 0.0
                - 1.0
                - 0.0
          - cls: src.models.pre_processor.multiview_geometry.RepresentationToRotationMatrix
            kwargs:
              in_key: illumination-rotation_repr
              out_key: illumination-rotation
        # NOTE(review): assumed to be a TransformerPostProcessor kwarg; it
        # could instead belong to RepresentationToRotationMatrix -- confirm.
        output_keys:
          - illumination-rotation
          - illumination-rotation_repr
          - illumination-strength
          - reni-latent

    - cls: src.models.post_processor.copy_renamer.CopyRenamer
      kwargs:
        key_in: illumination-z-rotation-rads_cond
        key_out: illumination-z-rotation-rads

    - cls: src.models.pre_processor.reni_latent_to_env.ReniLatentToEnvProcessor
      kwargs:
        reni_env_config:
          reni_config:
            weights: load/reni/reni++L49.safetensors
            axis_of_invariance: z
            conditioning: Attention
            encoded_input: Directions
            equivariance: SO2
            first_omega_0: 30.0
            fixed_decoder: true
            hidden_features: 128
            hidden_layers: 9
            hidden_omega_0: 30.0
            invariant_function: VN
            last_layer_linear: true
            # Same value as the first number in LatentUnpacker `shapes`.
            latent_dim: 49
            mapping_features: 128
            mapping_layers: 5
            num_attention_heads: 8
            num_attention_layers: 6
            old_implementation: false
            out_features: 3
            output_activation: exp
            positional_encoding: NeRF
          # NOTE(review): these two keys break the alphabetical order of
          # `reni_config`, so they are placed one level up; they could also
          # sit directly under `kwargs` -- confirm.
          parametrization: spherical
          resolution: 64

  material_cls: src.models.materials.multiple_importance_sampling_material.MultipleImportanceMonteCarloEnvironmentShader
  material:
    # NOTE(review): "stategies" is spelled as in the source. It is a runtime
    # key, so do not correct it unless the consuming code is changed too.
    sampling_stategies:
      - cls: src.models.materials.monte_carlo_samplers.illumination.PiecewiseDistributionEnvironmentSkySampler
        kwargs:
          num_samples: 20
      - cls: src.models.materials.monte_carlo_samplers.material.GGXVNDFAntitheticMaterialSampler
        kwargs:
          num_samples: 40
          perceptual_roughness: false
      - cls: src.models.materials.monte_carlo_samplers.material.CosineHemisphereMaterialSampler
        kwargs:
          num_samples: 4
          perceptual_roughness: false
    normal_type: radial_bump
    # Keep the quotes: a bare y is a boolean in the YAML 1.1 type set.
    radial_up_axis: 'y'
    tone_mapping_cls: src.utils.tonemapping.AgXToneMapping
    tone_mapping:
      color_space_type: src.utils.color_space.LinearToSRGBColorSpaceConversion
    # NOTE(review): the remaining keys are assumed to be material-level
    # settings rather than `tone_mapping` options -- confirm.
    sampler: halton
    sample_rotation: true
    radiance_clamping_upper_limit: 20.0
    use_power_heuristic: false

  background_cls: src.models.background.solid_color_background.SolidColorBackground
  background:
    color:
      - 0.0
      - 0.0
      - 0.0

  object_representation_cls: src.models.object_representations.volumetric.triplane_representation.VolumetricTriplaneRepresentation
  object_representation:
    # One entry per output head; `name` is the head's identifier and the
    # remaining keys are that head's settings.
    multi_head_mlp:
      only_heads: true
      n_neurons: 64
      activation: silu
      heads:
        - name: density
          out_channels: 1
          n_hidden_layers: 2
        - name: basecolor
          out_channels: 3
          n_hidden_layers: 3
          output_activation: sigmoid
        - name: surface-normal
          out_channels: 3
          n_hidden_layers: 3
          output_activation: normalize_channel_last
          init_weights: normal_/0/0.01
          init_bias: constant_/0.0
          # Three entries, one per output channel.
          out_bias:
            - 0.0
            - 0.0
            - 1.0
        - name: vertex-offset
          out_channels: 3
          n_hidden_layers: 2
          init_weights: normal_/0/0.01
          init_bias: constant_/0.0
        - name: roughness
          out_channels: 1
          n_hidden_layers: 2
          output_activation: sigmoid/0.1/1.0
        - name: metallic
          out_channels: 1
          n_hidden_layers: 2
          output_activation: sigmoid
        - name: flexicubes-weight
          # Same value as `indices_merging_mlp.in_channels`.
          out_channels: 16
          n_hidden_layers: 1
          output_activation: linear
    isosurface_resolution: 80
    isosurface_threshold: 10.0
    isosurface_method: flexicubes
    additional_indices_keys:
      weight_n: flexicubes-weight
    indices_merging_method: mlp
    indices_merging_mlp:
      only_heads: true
      n_neurons: 64
      activation: silu
      in_channels: 16
      heads:
        - name: flexicubes-weight
          out_channels: 21
          n_hidden_layers: 2
          output_activation: linear
          out_multiplier: 0.1
          init_weights: normal_/0/0.05
          init_bias: constant_/0.0
    flexicubes_weight_scale: 0.5
    flexicubes_qef_reg_scale: 0.001
    use_deformation: true
    # Same value as `out_channels` of PixelShuffleUpsampleNetwork.
    triplane_features: 40
    radius: 0.87
    feature_reduction: concat
    shape_activation: trunc_exp
    density_bias: -1.0

  renderer_cls: src.models.renderers.volumetric_mesh_rasterizer.MeshRasterizer
  renderer:
    batch_size: 1
    rasterizer: drtk