Commit
·
139824f
1
Parent(s):
b1acb64
add config
Browse files
configs/accelerate-train-1gpu.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
debug: false
|
3 |
+
distributed_type: MULTI_GPU
|
4 |
+
downcast_bf16: 'no'
|
5 |
+
gpu_ids: all
|
6 |
+
machine_rank: 0
|
7 |
+
main_training_function: main
|
8 |
+
mixed_precision: bf16
|
9 |
+
num_machines: 1
|
10 |
+
num_processes: 1
|
11 |
+
rdzv_backend: static
|
12 |
+
same_network: true
|
13 |
+
tpu_env: []
|
14 |
+
tpu_use_cluster: false
|
15 |
+
tpu_use_sudo: false
|
16 |
+
use_cpu: false
|
configs/accelerate-train-deepspeed.yaml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
debug: false
|
3 |
+
deepspeed_config:
|
4 |
+
gradient_accumulation_steps: 1
|
5 |
+
gradient_clipping: 1.0
|
6 |
+
offload_optimizer_device: none
|
7 |
+
offload_param_device: none
|
8 |
+
zero3_init_flag: false
|
9 |
+
zero_stage: 2
|
10 |
+
distributed_type: DEEPSPEED
|
11 |
+
downcast_bf16: 'no'
|
12 |
+
enable_cpu_affinity: false
|
13 |
+
machine_rank: 0
|
14 |
+
main_training_function: main
|
15 |
+
mixed_precision: bf16
|
16 |
+
num_machines: 1
|
17 |
+
num_processes: 8
|
18 |
+
rdzv_backend: static
|
19 |
+
same_network: true
|
20 |
+
tpu_env: []
|
21 |
+
tpu_use_cluster: false
|
22 |
+
tpu_use_sudo: false
|
23 |
+
use_cpu: false
|
configs/accelerate-train.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
debug: false
|
3 |
+
distributed_type: MULTI_GPU
|
4 |
+
downcast_bf16: 'no'
|
5 |
+
gpu_ids: all
|
6 |
+
machine_rank: 0
|
7 |
+
main_training_function: main
|
8 |
+
mixed_precision: bf16
|
9 |
+
num_machines: 1
|
10 |
+
num_processes: 8
|
11 |
+
rdzv_backend: static
|
12 |
+
same_network: true
|
13 |
+
tpu_env: []
|
14 |
+
tpu_use_cluster: false
|
15 |
+
tpu_use_sudo: false
|
16 |
+
use_cpu: false
|
configs/infer-gradio.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
source_size: 336
|
2 |
+
render_size: 288
|
3 |
+
render_views: 100
|
4 |
+
render_fps: 25
|
5 |
+
frame_size: 2
|
6 |
+
mesh_size: 384
|
7 |
+
mesh_thres: 3.0
|
configs/inference/human-lrm-1B.yaml
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LHM-1B
|
2 |
+
experiment:
|
3 |
+
type: lrm
|
4 |
+
seed: 42
|
5 |
+
parent: video_human_benchmark
|
6 |
+
child: human-lrm-1B
|
7 |
+
model:
|
8 |
+
# image encoder
|
9 |
+
model_name: SapDinoLRMBHSD3_5
|
10 |
+
encoder_type: dinov2_fusion
|
11 |
+
encoder_model_name: "dinov2_vitl14_reg"
|
12 |
+
encoder_feat_dim: 1024 # dinov2 embedding size 1024
|
13 |
+
encoder_freeze: False
|
14 |
+
|
15 |
+
fine_encoder_type: sapiens
|
16 |
+
fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2" # sapiens pretrained model path
|
17 |
+
fine_encoder_feat_dim: 1536 # sapiens embedding size 1536
|
18 |
+
fine_encoder_freeze: True
|
19 |
+
|
20 |
+
|
21 |
+
use_face_id: True
|
22 |
+
|
23 |
+
# points embeddings
|
24 |
+
# num_pcl: 10240
|
25 |
+
latent_query_points_type: "e2e_smplx_sub1"
|
26 |
+
pcl_dim: 1024
|
27 |
+
|
28 |
+
facesr: True
|
29 |
+
|
30 |
+
|
31 |
+
# transformer
|
32 |
+
# # camera_embed_dim: 1024
|
33 |
+
# transformer_dim: 512
|
34 |
+
# transformer_layers: 12
|
35 |
+
# transformer_heads: 8
|
36 |
+
|
37 |
+
transformer_type: "sd3_mm_bh_cond" # multi-modal attention.
|
38 |
+
transformer_heads: 16 # 30
|
39 |
+
transformer_dim: 1024 # 30 * 64=1920
|
40 |
+
transformer_layers: 15 # 30
|
41 |
+
tf_grad_ckpt: true
|
42 |
+
encoder_grad_ckpt: true
|
43 |
+
|
44 |
+
# for gs renderer
|
45 |
+
human_model_path: "./pretrained_models/human_model_files"
|
46 |
+
smplx_subdivide_num: 1
|
47 |
+
smplx_type: "smplx_2"
|
48 |
+
gs_query_dim: 1024
|
49 |
+
gs_use_rgb: True
|
50 |
+
gs_sh: 3
|
51 |
+
dense_sample_pts: 40000 # 40,000
|
52 |
+
gs_mlp_network_config:
|
53 |
+
n_neurons: 512
|
54 |
+
n_hidden_layers: 2
|
55 |
+
activation: silu
|
56 |
+
# gs_xyz_offset_max_step: 0.05625 # 1.8 / 32
|
57 |
+
# gs_clip_scaling: 0.2 # avoid too large Sphere
|
58 |
+
gs_xyz_offset_max_step: 1. # 1.8 / 32
|
59 |
+
gs_clip_scaling: [100, 0.01, 0.05, 3000] # [start, start_v, end_v, end]
|
60 |
+
expr_param_dim: 100
|
61 |
+
shape_param_dim: 10
|
62 |
+
|
63 |
+
fix_opacity: False
|
64 |
+
fix_rotation: False
|
65 |
+
cano_pose_type: 1 # 0: exavatar pose; 1: REC-MV pose
|
66 |
+
|
67 |
+
dataset:
|
68 |
+
subsets:
|
69 |
+
- name: video_human_flame
|
70 |
+
root_dirs: "./train_data/ClothVideo"
|
71 |
+
meta_path:
|
72 |
+
train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
|
73 |
+
val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
|
74 |
+
sample_rate: 1.0
|
75 |
+
use_flame: True
|
76 |
+
src_head_size: 112
|
77 |
+
- name: video_human_flame_v2
|
78 |
+
root_dirs: "./train_data/ClothVideo"
|
79 |
+
meta_path:
|
80 |
+
train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
|
81 |
+
val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
|
82 |
+
sample_rate: 1.0
|
83 |
+
use_flame: True
|
84 |
+
src_head_size: 112
|
85 |
+
sample_side_views: 5
|
86 |
+
source_image_res: 1024
|
87 |
+
src_head_size: 112
|
88 |
+
render_image:
|
89 |
+
low: 512
|
90 |
+
high: 512
|
91 |
+
region: null
|
92 |
+
num_train_workers: 4
|
93 |
+
multiply: 16 # dino features
|
94 |
+
num_val_workers: 2
|
95 |
+
pin_mem: true
|
96 |
+
repeat_num: 1
|
97 |
+
|
98 |
+
train:
|
99 |
+
mixed_precision: bf16 # REPLACE THIS BASED ON GPU TYPE
|
100 |
+
find_unused_parameters: false
|
101 |
+
loss_func:
|
102 |
+
pixel_loss: l1 # L1 or MSE
|
103 |
+
ball_loss:
|
104 |
+
type: heuristic # heuristic ball_loss
|
105 |
+
group:
|
106 |
+
head: 1.
|
107 |
+
lower_body: 100.
|
108 |
+
upper_body: 1000.
|
109 |
+
hands: 10000.
|
110 |
+
offset_loss:
|
111 |
+
type: classical # classical offset_loss
|
112 |
+
group:
|
113 |
+
head: 1.
|
114 |
+
lower_body: 1.
|
115 |
+
upper_body: 100.
|
116 |
+
hands: 1000.
|
117 |
+
loss:
|
118 |
+
pixel_weight: 0.0
|
119 |
+
masked_pixel_weight: 1.0
|
120 |
+
masked_head_weight: 0.0
|
121 |
+
perceptual_weight: 1.0
|
122 |
+
# tv_weight: 5e-4
|
123 |
+
tv_weight: -1
|
124 |
+
mask_weight: 1.0
|
125 |
+
face_id_weight: 0.05
|
126 |
+
asap_weight: 10.0 # ball loss
|
127 |
+
acap_weight: 1000.0 # offset loss
|
128 |
+
optim:
|
129 |
+
lr: 4e-5
|
130 |
+
weight_decay: 0.05
|
131 |
+
beta1: 0.9
|
132 |
+
beta2: 0.95
|
133 |
+
clip_grad_norm: 0.1 # diffusion model
|
134 |
+
scheduler:
|
135 |
+
type: cosine
|
136 |
+
warmup_real_iters: 0
|
137 |
+
batch_size: 2 # REPLACE THIS (PER GPU)
|
138 |
+
accum_steps: 1 # REPLACE THIS
|
139 |
+
epochs: 60 # REPLACE THIS
|
140 |
+
debug_global_steps: null
|
141 |
+
|
142 |
+
val:
|
143 |
+
batch_size: 2
|
144 |
+
global_step_period: 1000
|
145 |
+
debug_batches: 10
|
146 |
+
|
147 |
+
saver:
|
148 |
+
auto_resume: True
|
149 |
+
checkpoint_root: None
|
150 |
+
checkpoint_global_steps: 1000
|
151 |
+
checkpoint_keep_level: 60
|
152 |
+
|
153 |
+
logger:
|
154 |
+
stream_level: WARNING
|
155 |
+
log_level: INFO
|
156 |
+
log_root: ./exps/logs
|
157 |
+
tracker_root: ./exps/trackers
|
158 |
+
enable_profiler: false
|
159 |
+
trackers:
|
160 |
+
- tensorboard
|
161 |
+
image_monitor:
|
162 |
+
train_global_steps: 100
|
163 |
+
samples_per_log: 4
|
164 |
+
|
165 |
+
compile:
|
166 |
+
suppress_errors: true
|
167 |
+
print_specializations: true
|
168 |
+
disable: true
|
configs/inference/human-lrm-500M.yaml
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LHM-500M
|
2 |
+
experiment:
|
3 |
+
type: lrm
|
4 |
+
seed: 42
|
5 |
+
parent: video_human_benchmark
|
6 |
+
child: human-lrm-500M
|
7 |
+
model:
|
8 |
+
# image encoder
|
9 |
+
model_name: SapDinoLRMBHSD3_5
|
10 |
+
encoder_type: dinov2_fusion
|
11 |
+
encoder_model_name: "dinov2_vitl14_reg"
|
12 |
+
encoder_feat_dim: 1024 # dinov2 embedding size 1024
|
13 |
+
encoder_freeze: False
|
14 |
+
|
15 |
+
fine_encoder_type: sapiens
|
16 |
+
fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2" # sapiens pretrained model path
|
17 |
+
fine_encoder_feat_dim: 1536 # sapiens embedding size 1536
|
18 |
+
fine_encoder_freeze: True
|
19 |
+
|
20 |
+
use_face_id: True
|
21 |
+
|
22 |
+
# points embeddings
|
23 |
+
# num_pcl: 10240
|
24 |
+
latent_query_points_type: "e2e_smplx_sub1"
|
25 |
+
pcl_dim: 1024
|
26 |
+
facesr: True
|
27 |
+
|
28 |
+
transformer_type: "sd3_mm_bh_cond" # multi-modal BH attention.
|
29 |
+
transformer_heads: 16 # 30
|
30 |
+
transformer_dim: 1024 # 30 * 64=1920
|
31 |
+
transformer_layers: 5 # 30
|
32 |
+
tf_grad_ckpt: true
|
33 |
+
encoder_grad_ckpt: true
|
34 |
+
|
35 |
+
# for gs renderer
|
36 |
+
human_model_path: "./pretrained_models/human_model_files"
|
37 |
+
smplx_subdivide_num: 1
|
38 |
+
smplx_type: "smplx_2"
|
39 |
+
gs_query_dim: 1024
|
40 |
+
gs_use_rgb: True
|
41 |
+
gs_sh: 3
|
42 |
+
dense_sample_pts: 40000 # 40,000
|
43 |
+
gs_mlp_network_config:
|
44 |
+
n_neurons: 512
|
45 |
+
n_hidden_layers: 2
|
46 |
+
activation: silu
|
47 |
+
# gs_xyz_offset_max_step: 0.05625 # 1.8 / 32
|
48 |
+
# gs_clip_scaling: 0.2 # avoid too large Sphere
|
49 |
+
gs_xyz_offset_max_step: 1. # 1.8 / 32
|
50 |
+
gs_clip_scaling: [100, 0.01, 0.05, 3000] # [start, start_v, end_v, end]
|
51 |
+
expr_param_dim: 100
|
52 |
+
shape_param_dim: 10
|
53 |
+
|
54 |
+
fix_opacity: False
|
55 |
+
fix_rotation: False
|
56 |
+
cano_pose_type: 1 # 0: exavatar pose; 1: REC-MV pose
|
57 |
+
|
58 |
+
dataset:
|
59 |
+
subsets:
|
60 |
+
- name: video_human_flame
|
61 |
+
root_dirs: "./train_data/ClothVideo"
|
62 |
+
meta_path:
|
63 |
+
train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
|
64 |
+
val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
|
65 |
+
sample_rate: 1.0
|
66 |
+
use_flame: True
|
67 |
+
src_head_size: 112
|
68 |
+
- name: video_human_flame_v2
|
69 |
+
root_dirs: "./train_data/ClothVideo"
|
70 |
+
meta_path:
|
71 |
+
train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
|
72 |
+
val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
|
73 |
+
sample_rate: 1.0
|
74 |
+
use_flame: True
|
75 |
+
src_head_size: 112
|
76 |
+
sample_side_views: 5
|
77 |
+
source_image_res: 1024
|
78 |
+
src_head_size: 112
|
79 |
+
render_image:
|
80 |
+
low: 512
|
81 |
+
high: 512
|
82 |
+
region: null
|
83 |
+
num_train_workers: 4
|
84 |
+
multiply: 16 # dino features
|
85 |
+
num_val_workers: 2
|
86 |
+
pin_mem: true
|
87 |
+
repeat_num: 1
|
88 |
+
|
89 |
+
train:
|
90 |
+
mixed_precision: bf16 # REPLACE THIS BASED ON GPU TYPE
|
91 |
+
find_unused_parameters: false
|
92 |
+
loss_func:
|
93 |
+
pixel_loss: l1 # L1 or MSE
|
94 |
+
ball_loss:
|
95 |
+
type: heuristic # heuristic ball_loss
|
96 |
+
group:
|
97 |
+
head: 1.
|
98 |
+
lower_body: 100.
|
99 |
+
upper_body: 1000.
|
100 |
+
hands: 10000.
|
101 |
+
offset_loss:
|
102 |
+
type: classical # classical offset_loss
|
103 |
+
group:
|
104 |
+
head: 1.
|
105 |
+
lower_body: 1.
|
106 |
+
upper_body: 100.
|
107 |
+
hands: 1000.
|
108 |
+
loss:
|
109 |
+
pixel_weight: 0.0
|
110 |
+
masked_pixel_weight: 1.0
|
111 |
+
masked_head_weight: 0.0
|
112 |
+
perceptual_weight: 1.0
|
113 |
+
# tv_weight: 5e-4
|
114 |
+
tv_weight: -1
|
115 |
+
mask_weight: 1.0
|
116 |
+
face_id_weight: 0.05
|
117 |
+
asap_weight: 10.0 # ball loss
|
118 |
+
acap_weight: 1000.0 # offset loss
|
119 |
+
optim:
|
120 |
+
lr: 4e-5
|
121 |
+
weight_decay: 0.05
|
122 |
+
beta1: 0.9
|
123 |
+
beta2: 0.95
|
124 |
+
clip_grad_norm: 0.1 # diffusion model
|
125 |
+
scheduler:
|
126 |
+
type: cosine
|
127 |
+
warmup_real_iters: 0
|
128 |
+
batch_size: 4 # REPLACE THIS (PER GPU)
|
129 |
+
accum_steps: 1 # REPLACE THIS
|
130 |
+
epochs: 60 # REPLACE THIS
|
131 |
+
debug_global_steps: null
|
132 |
+
|
133 |
+
val:
|
134 |
+
batch_size: 2
|
135 |
+
global_step_period: 1000
|
136 |
+
debug_batches: 10
|
137 |
+
|
138 |
+
saver:
|
139 |
+
auto_resume: True
|
140 |
+
load_model: None
|
141 |
+
checkpoint_root: ./exps/checkpoints
|
142 |
+
checkpoint_global_steps: 1000
|
143 |
+
checkpoint_keep_level: 60
|
144 |
+
|
145 |
+
logger:
|
146 |
+
stream_level: WARNING
|
147 |
+
log_level: INFO
|
148 |
+
log_root: ./exps/logs
|
149 |
+
tracker_root: ./exps/trackers
|
150 |
+
enable_profiler: false
|
151 |
+
trackers:
|
152 |
+
- tensorboard
|
153 |
+
image_monitor:
|
154 |
+
train_global_steps: 100
|
155 |
+
samples_per_log: 4
|
156 |
+
|
157 |
+
compile:
|
158 |
+
suppress_errors: true
|
159 |
+
print_specializations: true
|
160 |
+
disable: true
|