Spaces:

DyrusQZ
/

LHM

Running on A10G

App Files Files Community

QZFantasies committed on 2 days ago

Commit

139824f

1 Parent(s): b1acb64

add config

Browse files

Files changed (6) hide show

configs/accelerate-train-1gpu.yaml +16 -0
configs/accelerate-train-deepspeed.yaml +23 -0
configs/accelerate-train.yaml +16 -0
configs/infer-gradio.yaml +7 -0
configs/inference/human-lrm-1B.yaml +168 -0
configs/inference/human-lrm-500M.yaml +160 -0

configs/accelerate-train-1gpu.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+compute_environment: LOCAL_MACHINE  # Accelerate launch settings: one local machine, one process
+debug: false
+distributed_type: MULTI_GPU  # NOTE(review): MULTI_GPU with num_processes: 1 — confirm this is intended for the 1-GPU config
+downcast_bf16: 'no'  # quoted so YAML keeps the string 'no' instead of a boolean
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 1  # the only value that differs from accelerate-train.yaml (8 there)
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/accelerate-train-deepspeed.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+compute_environment: LOCAL_MACHINE  # Accelerate launch settings for the DeepSpeed variant: one machine, 8 processes
+debug: false
+deepspeed_config:  # DeepSpeed-specific options are nested under this key
+  gradient_accumulation_steps: 1  # NOTE(review): presumably should agree with train.accum_steps in the experiment config — confirm
+  gradient_clipping: 1.0  # NOTE(review): experiment configs set train.optim.clip_grad_norm to 0.1 — confirm which one applies
+  offload_optimizer_device: none  # plain string "none"; YAML does not treat it as null
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'  # quoted so YAML keeps the string 'no' instead of a boolean
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/accelerate-train.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+compute_environment: LOCAL_MACHINE  # Accelerate launch settings: one local machine, 8 processes
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'  # quoted so YAML keeps the string 'no' instead of a boolean
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8  # the only value that differs from accelerate-train-1gpu.yaml (1 there)
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/infer-gradio.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+source_size: 336  # Gradio inference settings; NOTE(review): units are not stated — presumably pixels, confirm
+render_size: 288
+render_views: 100
+render_fps: 25  # presumably frames per second of the rendered output — confirm
+frame_size: 2
+mesh_size: 384
+mesh_thres: 3.0

configs/inference/human-lrm-1B.yaml ADDED Viewed

	@@ -0,0 +1,168 @@

+# LHM-1B
+experiment:  # experiment type, seed and parent/child names
+    type: lrm
+    seed: 42
+    parent: video_human_benchmark
+    child: human-lrm-1B  # matches this file's name (human-lrm-1B.yaml)
+model:  # encoder, point-embedding, transformer and gs-renderer settings
+    # image encoder
+    model_name: SapDinoLRMBHSD3_5
+    encoder_type: dinov2_fusion
+    encoder_model_name: "dinov2_vitl14_reg"
+    encoder_feat_dim: 1024  # dinov2 embedding size 1024
+    encoder_freeze: False
+    fine_encoder_type: sapiens
+    fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2"  # sapiens pretrained model path
+    fine_encoder_feat_dim: 1536 # sapiens embedding size; NOTE(review): an earlier comment said 1024, the value here is 1536 — confirm
+    fine_encoder_freeze: True
+    use_face_id: True
+    # points embeddings
+    # num_pcl: 10240
+    latent_query_points_type: "e2e_smplx_sub1"
+    pcl_dim: 1024
+    facesr: True
+    # transformer
+    # (disabled alternative settings, kept for reference)
+    # # camera_embed_dim: 1024
+    # transformer_dim: 512
+    # transformer_layers: 12
+    # transformer_heads: 8
+    transformer_type: "sd3_mm_bh_cond"  # multi-modal attention.
+    transformer_heads: 16  # NOTE(review): old note "30" does not match the value 16 — presumably an alternative setting, confirm
+    transformer_dim: 1024  # NOTE(review): old note "30 * 64=1920" does not match the value 1024 — confirm
+    transformer_layers: 15 # NOTE(review): old note "30" does not match the value 15 — confirm
+    tf_grad_ckpt: true
+    encoder_grad_ckpt: true
+    # for gs renderer
+    human_model_path: "./pretrained_models/human_model_files"
+    smplx_subdivide_num: 1
+    smplx_type: "smplx_2"
+    gs_query_dim: 1024
+    gs_use_rgb: True
+    gs_sh: 3
+    dense_sample_pts: 40000  # NOTE(review): old note read "4,000" but the value is 40000 — confirm
+    gs_mlp_network_config:
+        n_neurons: 512
+        n_hidden_layers: 2
+        activation: silu
+    # Disabled alternatives to the two active settings below:
+    # gs_xyz_offset_max_step: 0.05625  # 1.8 / 32
+    # gs_clip_scaling: 0.2  # avoid too large Sphere
+    gs_xyz_offset_max_step: 1.  # active value is 1.0; the 1.8 / 32 (= 0.05625) variant is commented out above
+    gs_clip_scaling: [100, 0.01, 0.05, 3000] # [start, start_v, end_v, end]
+    expr_param_dim: 100
+    shape_param_dim: 10
+    fix_opacity: False
+    fix_rotation: False
+    cano_pose_type: 1  # 0: exavatar pose, 1: REC-MV pose
+dataset:  # two ClothVideo subsets followed by dataset-level settings
+    subsets:
+        -   name: video_human_flame
+            root_dirs: "./train_data/ClothVideo"
+            meta_path:
+                train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
+                val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
+            sample_rate: 1.0
+            use_flame: True
+            src_head_size: 112
+        -   name: video_human_flame_v2  # same root_dirs as the first subset; only name and meta_path differ
+            root_dirs: "./train_data/ClothVideo"
+            meta_path:
+                train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
+                val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
+            sample_rate: 1.0
+            use_flame: True
+            src_head_size: 112
+    sample_side_views: 5
+    source_image_res: 1024
+    src_head_size: 112  # same value as the per-subset src_head_size entries above
+    render_image:
+        low: 512
+        high: 512  # low and high are equal here
+        region: null
+    num_train_workers: 4
+    multiply: 16  # dino features
+    num_val_workers: 2
+    pin_mem: true
+    repeat_num: 1
+train:  # precision, loss definitions and weights, optimizer, scheduler and batch settings
+    mixed_precision: bf16  # REPLACE THIS BASED ON GPU TYPE
+    find_unused_parameters: false
+    loss_func:
+        pixel_loss: l1  # L1 or MSE
+        ball_loss:
+            type: heuristic  # heuristic ball_loss
+            group:  # per-region values; NOTE(review): presumably per-body-part weights — confirm
+                head: 1.
+                lower_body: 100.
+                upper_body: 1000.
+                hands: 10000.
+        offset_loss:
+            type: classical # "classical" type here, unlike the heuristic ball_loss above
+            group:  # per-region values; NOTE(review): presumably per-body-part weights — confirm
+                head: 1.
+                lower_body: 1.
+                upper_body: 100.
+                hands: 1000.
+    loss:
+        pixel_weight: 0.0
+        masked_pixel_weight: 1.0
+        masked_head_weight: 0.0
+        perceptual_weight: 1.0
+        # tv_weight: 5e-4
+        tv_weight: -1  # NOTE(review): presumably a negative weight disables the TV term — confirm
+        mask_weight: 1.0
+        face_id_weight: 0.05
+        asap_weight: 10.0  # ball loss
+        acap_weight: 1000.0  # offset loss
+    optim:
+        lr: 4e-5  # NOTE(review): no decimal point — some YAML 1.1 loaders (e.g. PyYAML) read this as a string; confirm the loader yields a float
+        weight_decay: 0.05
+        beta1: 0.9
+        beta2: 0.95
+        clip_grad_norm: 0.1  # diffusion model
+    scheduler:
+        type: cosine
+        warmup_real_iters: 0
+    batch_size: 2  # REPLACE THIS (PER GPU)
+    accum_steps: 1  # REPLACE THIS
+    epochs: 60  # REPLACE THIS
+    debug_global_steps: null
+val:  # validation batch size and step settings
+    batch_size: 2
+    global_step_period: 1000  # same number as saver.checkpoint_global_steps in this file
+    debug_batches: 10
+saver:  # checkpoint saving and resume settings
+    auto_resume: True
+    checkpoint_root: None  # NOTE(review): YAML reads bare None as the string "None", not null; the 500M config uses ./exps/checkpoints — confirm
+    checkpoint_global_steps: 1000
+    checkpoint_keep_level: 60
+logger:  # log levels, output directories, trackers and image-monitor settings
+    stream_level: WARNING
+    log_level: INFO
+    log_root: ./exps/logs
+    tracker_root: ./exps/trackers
+    enable_profiler: false
+    trackers:
+        - tensorboard  # the only tracker listed
+    image_monitor:
+        train_global_steps: 100
+        samples_per_log: 4
+compile:  # NOTE(review): presumably torch.compile options — confirm against the consumer
+    suppress_errors: true
+    print_specializations: true
+    disable: true

configs/inference/human-lrm-500M.yaml ADDED Viewed

	@@ -0,0 +1,160 @@

+# LHM-500M
+experiment:  # experiment type, seed and parent/child names
+    type: lrm
+    seed: 42
+    parent: video_human_benchmark
+    child: human-lrm-500M  # matches this file's name (human-lrm-500M.yaml)
+model:  # encoder, point-embedding, transformer and gs-renderer settings
+    # image encoder
+    model_name: SapDinoLRMBHSD3_5
+    encoder_type: dinov2_fusion
+    encoder_model_name: "dinov2_vitl14_reg"
+    encoder_feat_dim: 1024  # dinov2 embedding size 1024
+    encoder_freeze: False
+    fine_encoder_type: sapiens
+    fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2"  # sapiens pretrained model path; NOTE(review): same sapiens_1b checkpoint as the 1B config — confirm intended
+    fine_encoder_feat_dim: 1536 # sapiens embedding size; NOTE(review): an earlier comment said 1024, the value here is 1536 — confirm
+    fine_encoder_freeze: True
+    use_face_id: True
+    # points embeddings
+    # num_pcl: 10240
+    latent_query_points_type: "e2e_smplx_sub1"
+    pcl_dim: 1024
+    facesr: True
+    transformer_type: "sd3_mm_bh_cond"  # multi-modal BH attention.
+    transformer_heads: 16  # NOTE(review): old note "30" does not match the value 16 — presumably an alternative setting, confirm
+    transformer_dim: 1024  # NOTE(review): old note "30 * 64=1920" does not match the value 1024 — confirm
+    transformer_layers: 5 # NOTE(review): old note "30" does not match the value 5 — confirm
+    tf_grad_ckpt: true
+    encoder_grad_ckpt: true
+    # for gs renderer
+    human_model_path: "./pretrained_models/human_model_files"
+    smplx_subdivide_num: 1
+    smplx_type: "smplx_2"
+    gs_query_dim: 1024
+    gs_use_rgb: True
+    gs_sh: 3
+    dense_sample_pts: 40000  # NOTE(review): old note read "4,000" but the value is 40000 — confirm
+    gs_mlp_network_config:
+        n_neurons: 512
+        n_hidden_layers: 2
+        activation: silu
+    # Disabled alternatives to the two active settings below:
+    # gs_xyz_offset_max_step: 0.05625  # 1.8 / 32
+    # gs_clip_scaling: 0.2  # avoid too large Sphere
+    gs_xyz_offset_max_step: 1.  # active value is 1.0; the 1.8 / 32 (= 0.05625) variant is commented out above
+    gs_clip_scaling: [100, 0.01, 0.05, 3000] # [start, start_v, end_v, end]
+    expr_param_dim: 100
+    shape_param_dim: 10
+    fix_opacity: False
+    fix_rotation: False
+    cano_pose_type: 1  # 0: exavatar pose, 1: REC-MV pose
+dataset:  # two ClothVideo subsets followed by dataset-level settings
+    subsets:
+        -   name: video_human_flame
+            root_dirs: "./train_data/ClothVideo"
+            meta_path:
+                train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
+                val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
+            sample_rate: 1.0
+            use_flame: True
+            src_head_size: 112
+        -   name: video_human_flame_v2  # same root_dirs as the first subset; only name and meta_path differ
+            root_dirs: "./train_data/ClothVideo"
+            meta_path:
+                train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
+                val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
+            sample_rate: 1.0
+            use_flame: True
+            src_head_size: 112
+    sample_side_views: 5
+    source_image_res: 1024
+    src_head_size: 112  # same value as the per-subset src_head_size entries above
+    render_image:
+        low: 512
+        high: 512  # low and high are equal here
+        region: null
+    num_train_workers: 4
+    multiply: 16  # dino features
+    num_val_workers: 2
+    pin_mem: true
+    repeat_num: 1
+train:  # precision, loss definitions and weights, optimizer, scheduler and batch settings
+    mixed_precision: bf16  # REPLACE THIS BASED ON GPU TYPE
+    find_unused_parameters: false
+    loss_func:
+        pixel_loss: l1  # L1 or MSE
+        ball_loss:
+            type: heuristic  # heuristic ball_loss
+            group:  # per-region values; NOTE(review): presumably per-body-part weights — confirm
+                head: 1.
+                lower_body: 100.
+                upper_body: 1000.
+                hands: 10000.
+        offset_loss:
+            type: classical # "classical" type here, unlike the heuristic ball_loss above
+            group:  # per-region values; NOTE(review): presumably per-body-part weights — confirm
+                head: 1.
+                lower_body: 1.
+                upper_body: 100.
+                hands: 1000.
+    loss:
+        pixel_weight: 0.0
+        masked_pixel_weight: 1.0
+        masked_head_weight: 0.0
+        perceptual_weight: 1.0
+        # tv_weight: 5e-4
+        tv_weight: -1  # NOTE(review): presumably a negative weight disables the TV term — confirm
+        mask_weight: 1.0
+        face_id_weight: 0.05
+        asap_weight: 10.0  # ball loss
+        acap_weight: 1000.0  # offset loss
+    optim:
+        lr: 4e-5  # NOTE(review): no decimal point — some YAML 1.1 loaders (e.g. PyYAML) read this as a string; confirm the loader yields a float
+        weight_decay: 0.05
+        beta1: 0.9
+        beta2: 0.95
+        clip_grad_norm: 0.1  # diffusion model
+    scheduler:
+        type: cosine
+        warmup_real_iters: 0
+    batch_size: 4  # REPLACE THIS (PER GPU)
+    accum_steps: 1  # REPLACE THIS
+    epochs: 60  # REPLACE THIS
+    debug_global_steps: null
+val:  # validation batch size and step settings
+    batch_size: 2
+    global_step_period: 1000  # same number as saver.checkpoint_global_steps in this file
+    debug_batches: 10
+saver:  # checkpoint saving, loading and resume settings
+    auto_resume: True
+    load_model: None  # NOTE(review): YAML reads bare None as the string "None", not null — confirm the consumer expects that
+    checkpoint_root: ./exps/checkpoints
+    checkpoint_global_steps: 1000
+    checkpoint_keep_level: 60
+logger:  # log levels, output directories, trackers and image-monitor settings
+    stream_level: WARNING
+    log_level: INFO
+    log_root: ./exps/logs
+    tracker_root: ./exps/trackers
+    enable_profiler: false
+    trackers:
+        - tensorboard  # the only tracker listed
+    image_monitor:
+        train_global_steps: 100
+        samples_per_log: 4
+compile:  # NOTE(review): presumably torch.compile options — confirm against the consumer
+    suppress_errors: true
+    print_specializations: true
+    disable: true