QZFantasies committed on
Commit
139824f
·
1 Parent(s): b1acb64

add config

Browse files
configs/accelerate-train-1gpu.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: bf16
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/accelerate-train-deepspeed.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ gradient_accumulation_steps: 1
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: none
7
+ offload_param_device: none
8
+ zero3_init_flag: false
9
+ zero_stage: 2
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ enable_cpu_affinity: false
13
+ machine_rank: 0
14
+ main_training_function: main
15
+ mixed_precision: bf16
16
+ num_machines: 1
17
+ num_processes: 8
18
+ rdzv_backend: static
19
+ same_network: true
20
+ tpu_env: []
21
+ tpu_use_cluster: false
22
+ tpu_use_sudo: false
23
+ use_cpu: false
configs/accelerate-train.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: bf16
9
+ num_machines: 1
10
+ num_processes: 8
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/infer-gradio.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ source_size: 336
2
+ render_size: 288
3
+ render_views: 100
4
+ render_fps: 25
5
+ frame_size: 2
6
+ mesh_size: 384
7
+ mesh_thres: 3.0
configs/inference/human-lrm-1B.yaml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LHM-1B
2
+ experiment:
3
+ type: lrm
4
+ seed: 42
5
+ parent: video_human_benchmark
6
+ child: human-lrm-1B
7
+ model:
8
+ # image encoder
9
+ model_name: SapDinoLRMBHSD3_5
10
+ encoder_type: dinov2_fusion
11
+ encoder_model_name: "dinov2_vitl14_reg"
12
+ encoder_feat_dim: 1024 # dinov2 embedding size 1024
13
+ encoder_freeze: False
14
+
15
+ fine_encoder_type: sapiens
16
+ fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2" # sapiens pretrained model path
17
+ fine_encoder_feat_dim: 1536 # sapiens embedding size 1536
18
+ fine_encoder_freeze: True
19
+
20
+
21
+ use_face_id: True
22
+
23
+ # points embeddings
24
+ # num_pcl: 10240
25
+ latent_query_points_type: "e2e_smplx_sub1"
26
+ pcl_dim: 1024
27
+
28
+ facesr: True
29
+
30
+
31
+ # transformer
32
+ # # camera_embed_dim: 1024
33
+ # transformer_dim: 512
34
+ # transformer_layers: 12
35
+ # transformer_heads: 8
36
+
37
+ transformer_type: "sd3_mm_bh_cond" # multi-modal attention.
38
+ transformer_heads: 16 # 30
39
+ transformer_dim: 1024 # 30 * 64=1920
40
+ transformer_layers: 15 # 30
41
+ tf_grad_ckpt: true
42
+ encoder_grad_ckpt: true
43
+
44
+ # for gs renderer
45
+ human_model_path: "./pretrained_models/human_model_files"
46
+ smplx_subdivide_num: 1
47
+ smplx_type: "smplx_2"
48
+ gs_query_dim: 1024
49
+ gs_use_rgb: True
50
+ gs_sh: 3
51
+ dense_sample_pts: 40000 # 4,000
52
+ gs_mlp_network_config:
53
+ n_neurons: 512
54
+ n_hidden_layers: 2
55
+ activation: silu
56
+ # gs_xyz_offset_max_step: 0.05625 # 1.8 / 32
57
+ # gs_clip_scaling: 0.2 # avoid too large Sphere
58
+ gs_xyz_offset_max_step: 1. # 1.8 / 32
59
+ gs_clip_scaling: [100, 0.01, 0.05, 3000] # [start, start_v, end_v, end]
60
+ expr_param_dim: 100
61
+ shape_param_dim: 10
62
+
63
+ fix_opacity: False
64
+ fix_rotation: False
65
+ cano_pose_type: 1 # 0: exavatar pose, 1: REC-MV pose
66
+
67
+ dataset:
68
+ subsets:
69
+ - name: video_human_flame
70
+ root_dirs: "./train_data/ClothVideo"
71
+ meta_path:
72
+ train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
73
+ val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
74
+ sample_rate: 1.0
75
+ use_flame: True
76
+ src_head_size: 112
77
+ - name: video_human_flame_v2
78
+ root_dirs: "./train_data/ClothVideo"
79
+ meta_path:
80
+ train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
81
+ val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
82
+ sample_rate: 1.0
83
+ use_flame: True
84
+ src_head_size: 112
85
+ sample_side_views: 5
86
+ source_image_res: 1024
87
+ src_head_size: 112
88
+ render_image:
89
+ low: 512
90
+ high: 512
91
+ region: null
92
+ num_train_workers: 4
93
+ multiply: 16 # dino features
94
+ num_val_workers: 2
95
+ pin_mem: true
96
+ repeat_num: 1
97
+
98
+ train:
99
+ mixed_precision: bf16 # REPLACE THIS BASED ON GPU TYPE
100
+ find_unused_parameters: false
101
+ loss_func:
102
+ pixel_loss: l1 # L1 or MSE
103
+ ball_loss:
104
+ type: heuristic # heuristic ball_loss
105
+ group:
106
+ head: 1.
107
+ lower_body: 100.
108
+ upper_body: 1000.
109
+ hands: 10000.
110
+ offset_loss:
111
+ type: classical # classical offset_loss
112
+ group:
113
+ head: 1.
114
+ lower_body: 1.
115
+ upper_body: 100.
116
+ hands: 1000.
117
+ loss:
118
+ pixel_weight: 0.0
119
+ masked_pixel_weight: 1.0
120
+ masked_head_weight: 0.0
121
+ perceptual_weight: 1.0
122
+ # tv_weight: 5e-4
123
+ tv_weight: -1
124
+ mask_weight: 1.0
125
+ face_id_weight: 0.05
126
+ asap_weight: 10.0 # ball loss
127
+ acap_weight: 1000.0 # offset loss
128
+ optim:
129
+ lr: 4e-5
130
+ weight_decay: 0.05
131
+ beta1: 0.9
132
+ beta2: 0.95
133
+ clip_grad_norm: 0.1 # diffusion model
134
+ scheduler:
135
+ type: cosine
136
+ warmup_real_iters: 0
137
+ batch_size: 2 # REPLACE THIS (PER GPU)
138
+ accum_steps: 1 # REPLACE THIS
139
+ epochs: 60 # REPLACE THIS
140
+ debug_global_steps: null
141
+
142
+ val:
143
+ batch_size: 2
144
+ global_step_period: 1000
145
+ debug_batches: 10
146
+
147
+ saver:
148
+ auto_resume: True
149
+ checkpoint_root: None # NOTE(review): YAML reads plain `None` as the string "None", not null — confirm this is intended
150
+ checkpoint_global_steps: 1000
151
+ checkpoint_keep_level: 60
152
+
153
+ logger:
154
+ stream_level: WARNING
155
+ log_level: INFO
156
+ log_root: ./exps/logs
157
+ tracker_root: ./exps/trackers
158
+ enable_profiler: false
159
+ trackers:
160
+ - tensorboard
161
+ image_monitor:
162
+ train_global_steps: 100
163
+ samples_per_log: 4
164
+
165
+ compile:
166
+ suppress_errors: true
167
+ print_specializations: true
168
+ disable: true
configs/inference/human-lrm-500M.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LHM-500M
2
+ experiment:
3
+ type: lrm
4
+ seed: 42
5
+ parent: video_human_benchmark
6
+ child: human-lrm-500M
7
+ model:
8
+ # image encoder
9
+ model_name: SapDinoLRMBHSD3_5
10
+ encoder_type: dinov2_fusion
11
+ encoder_model_name: "dinov2_vitl14_reg"
12
+ encoder_feat_dim: 1024 # dinov2 embedding size 1024
13
+ encoder_freeze: False
14
+
15
+ fine_encoder_type: sapiens
16
+ fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2" # sapiens pretrained model path
17
+ fine_encoder_feat_dim: 1536 # sapiens embedding size 1536
18
+ fine_encoder_freeze: True
19
+
20
+ use_face_id: True
21
+
22
+ # points embeddings
23
+ # num_pcl: 10240
24
+ latent_query_points_type: "e2e_smplx_sub1"
25
+ pcl_dim: 1024
26
+ facesr: True
27
+
28
+ transformer_type: "sd3_mm_bh_cond" # multi-modal BH attention.
29
+ transformer_heads: 16 # 30
30
+ transformer_dim: 1024 # 30 * 64=1920
31
+ transformer_layers: 5 # 30
32
+ tf_grad_ckpt: true
33
+ encoder_grad_ckpt: true
34
+
35
+ # for gs renderer
36
+ human_model_path: "./pretrained_models/human_model_files"
37
+ smplx_subdivide_num: 1
38
+ smplx_type: "smplx_2"
39
+ gs_query_dim: 1024
40
+ gs_use_rgb: True
41
+ gs_sh: 3
42
+ dense_sample_pts: 40000 # 4,000
43
+ gs_mlp_network_config:
44
+ n_neurons: 512
45
+ n_hidden_layers: 2
46
+ activation: silu
47
+ # gs_xyz_offset_max_step: 0.05625 # 1.8 / 32
48
+ # gs_clip_scaling: 0.2 # avoid too large Sphere
49
+ gs_xyz_offset_max_step: 1. # 1.8 / 32
50
+ gs_clip_scaling: [100, 0.01, 0.05, 3000] # [start, start_v, end_v, end]
51
+ expr_param_dim: 100
52
+ shape_param_dim: 10
53
+
54
+ fix_opacity: False
55
+ fix_rotation: False
56
+ cano_pose_type: 1 # 0: exavatar pose, 1: REC-MV pose
57
+
58
+ dataset:
59
+ subsets:
60
+ - name: video_human_flame
61
+ root_dirs: "./train_data/ClothVideo"
62
+ meta_path:
63
+ train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
64
+ val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
65
+ sample_rate: 1.0
66
+ use_flame: True
67
+ src_head_size: 112
68
+ - name: video_human_flame_v2
69
+ root_dirs: "./train_data/ClothVideo"
70
+ meta_path:
71
+ train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
72
+ val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
73
+ sample_rate: 1.0
74
+ use_flame: True
75
+ src_head_size: 112
76
+ sample_side_views: 5
77
+ source_image_res: 1024
78
+ src_head_size: 112
79
+ render_image:
80
+ low: 512
81
+ high: 512
82
+ region: null
83
+ num_train_workers: 4
84
+ multiply: 16 # dino features
85
+ num_val_workers: 2
86
+ pin_mem: true
87
+ repeat_num: 1
88
+
89
+ train:
90
+ mixed_precision: bf16 # REPLACE THIS BASED ON GPU TYPE
91
+ find_unused_parameters: false
92
+ loss_func:
93
+ pixel_loss: l1 # L1 or MSE
94
+ ball_loss:
95
+ type: heuristic # heuristic ball_loss
96
+ group:
97
+ head: 1.
98
+ lower_body: 100.
99
+ upper_body: 1000.
100
+ hands: 10000.
101
+ offset_loss:
102
+ type: classical # classical offset_loss
103
+ group:
104
+ head: 1.
105
+ lower_body: 1.
106
+ upper_body: 100.
107
+ hands: 1000.
108
+ loss:
109
+ pixel_weight: 0.0
110
+ masked_pixel_weight: 1.0
111
+ masked_head_weight: 0.0
112
+ perceptual_weight: 1.0
113
+ # tv_weight: 5e-4
114
+ tv_weight: -1
115
+ mask_weight: 1.0
116
+ face_id_weight: 0.05
117
+ asap_weight: 10.0 # ball loss
118
+ acap_weight: 1000.0 # offset loss
119
+ optim:
120
+ lr: 4e-5
121
+ weight_decay: 0.05
122
+ beta1: 0.9
123
+ beta2: 0.95
124
+ clip_grad_norm: 0.1 # diffusion model
125
+ scheduler:
126
+ type: cosine
127
+ warmup_real_iters: 0
128
+ batch_size: 4 # REPLACE THIS (PER GPU)
129
+ accum_steps: 1 # REPLACE THIS
130
+ epochs: 60 # REPLACE THIS
131
+ debug_global_steps: null
132
+
133
+ val:
134
+ batch_size: 2
135
+ global_step_period: 1000
136
+ debug_batches: 10
137
+
138
+ saver:
139
+ auto_resume: True
140
+ load_model: None # NOTE(review): YAML reads plain `None` as the string "None", not null — confirm this is intended
141
+ checkpoint_root: ./exps/checkpoints
142
+ checkpoint_global_steps: 1000
143
+ checkpoint_keep_level: 60
144
+
145
+ logger:
146
+ stream_level: WARNING
147
+ log_level: INFO
148
+ log_root: ./exps/logs
149
+ tracker_root: ./exps/trackers
150
+ enable_profiler: false
151
+ trackers:
152
+ - tensorboard
153
+ image_monitor:
154
+ train_global_steps: 100
155
+ samples_per_log: 4
156
+
157
+ compile:
158
+ suppress_errors: true
159
+ print_specializations: true
160
+ disable: true