Commit 16dafbc (verified) · 1 parent: dd53463
yuanxuewei committed: Add files using upload-large-folder tool
checkpoints/steps_30000_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cb1ab1785ec9ca107b1e211ba9d6e390396c475921f6b834f4d9371b137321e
+ size 8935435793
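This file is a Git LFS pointer: only the hash and size (~8.9 GB) live in the repo, and the actual weights are fetched on download. A minimal loading sketch, assuming a placeholder repo id (the repo id is not given in this commit):

```python
import torch
from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the repository this commit belongs to.
ckpt_path = hf_hub_download(
    repo_id="yuanxuewei/REPO_NAME",
    filename="checkpoints/steps_30000_pytorch_model.pt",
)
# The Hub resolves the LFS pointer to the real ~8.9 GB file; load on CPU.
state_dict = torch.load(ckpt_path, map_location="cpu")
```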
config.json ADDED
@@ -0,0 +1,115 @@
+ {
+   "run_id": "0911_libero_object_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_8_pretrained_vlm_20k",
+   "run_root_dir": "./playground/Checkpoints",
+   "seed": 42,
+   "trackers": [
+     "jsonl",
+     "wandb"
+   ],
+   "wandb_entity": "michaelyu-1101-fudanuniversity",
+   "wandb_project": "Internvla",
+   "is_debug": false,
+   "framework": {
+     "framework_py": "DinoQFormerACT",
+     "qwenvl": {
+       "base_vlm": "/mnt/phwfile/efm_t/zhuyangkun_tmp_need_del/exp/exp_08_09/manip_sys2_qwen25_3b_onevision_molmo_a0all_refsp20/checkpoint-20000/",
+       "attn_implementation": "flash_attention_2",
+       "vl_hidden_dim": 2048
+     },
+     "dino": {
+       "dino_backbone": "dinov2_vitl14"
+     },
+     "layer_qformer": {
+       "qformer_end_layer": 37,
+       "qformer_start_layer": 36,
+       "num_query_tokens": 64,
+       "grad_scale": 0.5
+     },
+     "action_model": {
+       "action_model_type": "DiT-B",
+       "action_hidden_dim": 768,
+       "action_dim": 7,
+       "use_ema": false,
+       "future_action_window_size": 7,
+       "past_action_window_size": 0,
+       "repeated_diffusion_steps": 8
+     },
+     "reduce_in_full_precision": true
+   },
+   "datasets": {
+     "vlm_data": {
+       "dataformat": "llava_json",
+       "dataset_use": "asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en",
+       "eval_dataset": "aokvqa_cauldron_llava_format",
+       "data_flatten": false,
+       "base_interval": 2,
+       "max_pixels": 50176,
+       "min_pixels": 784,
+       "fix_image_size": [
+         224,
+         224
+       ],
+       "model_max_length": 1024,
+       "model_type": "qwen2.5vl",
+       "per_device_batch_size": 4
+     },
+     "vla_data": {
+       "dataset_py": "lerobot_libero",
+       "data_root_dir": "playground/Datasets/LEROBOT_LIBERO_DATA",
+       "data_mix": "libero_object",
+       "action_type": "delta_qpos",
+       "CoT_prompt": "Your task is {instruction}. To identify the key objects for your task. Locate their bounding boxes in [x1,y1,x2,y2] format.",
+       "CoT_answer": "bbox",
+       "default_image_resolution": [
+         3,
+         224,
+         224
+       ],
+       "per_device_batch_size": 16,
+       "load_all_data_for_training": true,
+       "obs": [
+         "image_0"
+       ]
+     }
+   },
+   "trainer": {
+     "epochs": 100,
+     "max_train_steps": 100000,
+     "num_warmup_steps": 5000,
+     "save_interval": 10000,
+     "eval_interval": 1000,
+     "learning_rate": {
+       "base": 2.5e-05
+     },
+     "lr_scheduler_type": "cosine_with_min_lr",
+     "scheduler_specific_kwargs": {
+       "min_lr": 1e-06
+     },
+     "freeze_modules": "",
+     "loss_scale": {
+       "vla": 1.0,
+       "vlm": 0.1
+     },
+     "max_grad_norm": 1.0,
+     "warmup_ratio": 0.1,
+     "weight_decay": 0.0,
+     "logging_frequency": 10,
+     "gradient_clipping": 1.0,
+     "gradient_accumulation_steps": 1,
+     "optimizer": {
+       "name": "AdamW",
+       "betas": [
+         0.9,
+         0.95
+       ],
+       "eps": 1e-08,
+       "weight_decay": 1e-08
+     },
+     "is_resume": false,
+     "resume_epoch": null,
+     "resume_step": null,
+     "enable_gradient_checkpointing": true,
+     "enable_mixed_precision_training": true
+   },
+   "output_dir": "./playground/Checkpoints/0911_libero_object_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_8_pretrained_vlm_20k"
+ }
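Since the config is plain JSON, it is easy to inspect programmatically. A minimal sketch reading the action-head fields above (the "action_chunk_8" in the run id corresponds to 1 current step plus future_action_window_size = 7):

```python
import json

with open("config.json") as f:
    cfg = json.load(f)

am = cfg["framework"]["action_model"]
chunk_size = am["future_action_window_size"] + 1  # 7 future steps + current = 8
print(am["action_model_type"], am["action_dim"], chunk_size)  # DiT-B 7 8
```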
config.yaml ADDED
@@ -0,0 +1,96 @@
+ run_id: 0911_libero_object_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_8_pretrained_vlm_20k
+ run_root_dir: ./playground/Checkpoints
+ seed: 42
+ trackers:
+ - jsonl
+ - wandb
+ wandb_entity: michaelyu-1101-fudanuniversity
+ wandb_project: Internvla
+ is_debug: false
+ framework:
+   framework_py: DinoQFormerACT
+   qwenvl:
+     base_vlm: /mnt/phwfile/efm_t/zhuyangkun_tmp_need_del/exp/exp_08_09/manip_sys2_qwen25_3b_onevision_molmo_a0all_refsp20/checkpoint-20000/
+     attn_implementation: flash_attention_2
+     vl_hidden_dim: 2048
+   dino:
+     dino_backbone: dinov2_vitl14
+   layer_qformer:
+     qformer_end_layer: 37
+     qformer_start_layer: 36
+     num_query_tokens: 64
+     grad_scale: 0.5
+   action_model:
+     action_model_type: DiT-B
+     action_hidden_dim: 768
+     action_dim: 7
+     use_ema: false
+     future_action_window_size: 7
+     past_action_window_size: 0
+     repeated_diffusion_steps: 8
+   reduce_in_full_precision: true
+ datasets:
+   vlm_data:
+     dataformat: llava_json
+     dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
+     eval_dataset: aokvqa_cauldron_llava_format
+     data_flatten: false
+     base_interval: 2
+     max_pixels: 50176
+     min_pixels: 784
+     fix_image_size:
+     - 224
+     - 224
+     model_max_length: 1024
+     model_type: qwen2.5vl
+     per_device_batch_size: 4
+   vla_data:
+     dataset_py: lerobot_libero
+     data_root_dir: playground/Datasets/LEROBOT_LIBERO_DATA
+     data_mix: libero_object
+     action_type: delta_qpos
+     CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
+       Locate their bounding boxes in [x1,y1,x2,y2] format.
+     CoT_answer: bbox
+     default_image_resolution:
+     - 3
+     - 224
+     - 224
+     per_device_batch_size: 16
+     load_all_data_for_training: true
+     obs:
+     - image_0
+ trainer:
+   epochs: 100
+   max_train_steps: 100000
+   num_warmup_steps: 5000
+   save_interval: 10000
+   eval_interval: 1000
+   learning_rate:
+     base: 2.5e-05
+   lr_scheduler_type: cosine_with_min_lr
+   scheduler_specific_kwargs:
+     min_lr: 1.0e-06
+   freeze_modules: ''
+   loss_scale:
+     vla: 1.0
+     vlm: 0.1
+   max_grad_norm: 1.0
+   warmup_ratio: 0.1
+   weight_decay: 0.0
+   logging_frequency: 10
+   gradient_clipping: 1.0
+   gradient_accumulation_steps: 1
+   optimizer:
+     name: AdamW
+     betas:
+     - 0.9
+     - 0.95
+     eps: 1.0e-08
+     weight_decay: 1.0e-08
+   is_resume: false
+   resume_epoch: null
+   resume_step: null
+   enable_gradient_checkpointing: true
+   enable_mixed_precision_training: true
+ output_dir: ./playground/Checkpoints/0911_libero_object_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_8_pretrained_vlm_20k
dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
+ {
+   "franka": {
+     "action": {
+       "mean": [
+         0.07096529006958008,
+         0.13498851656913757,
+         -0.04601382836699486,
+         0.00123520044144243,
+         0.006998839322477579,
+         -0.015027612447738647,
+         0.46428999304771423
+       ],
+       "std": [
+         0.2681235373020172,
+         0.43846824765205383,
+         0.4474974274635315,
+         0.024446550756692886,
+         0.049355510622262955,
+         0.042107198387384415,
+         0.49879148602485657
+       ],
+       "max": [
+         0.9375,
+         0.8919642567634583,
+         0.9375,
+         0.17678570747375488,
+         0.35035714507102966,
+         0.1810714304447174,
+         1.0
+       ],
+       "min": [
+         -0.8839285969734192,
+         -0.9375,
+         -0.9375,
+         -0.15000000596046448,
+         -0.29035714268684387,
+         -0.32892856001853943,
+         0.0
+       ],
+       "q01": [
+         -0.5383928418159485,
+         -0.8758928775787354,
+         -0.9375,
+         -0.06964285671710968,
+         -0.11678571254014969,
+         -0.15964286029338837,
+         0.0
+       ],
+       "q99": [
+         0.8464285731315613,
+         0.84375,
+         0.9375,
+         0.08142857253551483,
+         0.14892856776714325,
+         0.0867857113480568,
+         1.0
+       ],
+       "mask": [
+         true,
+         true,
+         true,
+         true,
+         true,
+         true,
+         false
+       ]
+     },
+     "state": {
+       "mean": [
+         -0.02999030612409115,
+         -0.007947085425257683,
+         0.20293472707271576,
+         3.1086409091949463,
+         -0.21404768526554108,
+         -0.11307074874639511,
+         0.029380427673459053,
+         -0.030556727200746536
+       ],
+       "std": [
+         0.06694897264242172,
+         0.17608462274074554,
+         0.07807064801454544,
+         0.08684843033552649,
+         0.33540457487106323,
+         0.20728276669979095,
+         0.00956575945019722,
+         0.009197483770549297
+       ],
+       "max": [
+         0.14580604434013367,
+         0.33216384053230286,
+         0.3857804834842682,
+         3.4003844261169434,
+         0.7954911589622498,
+         0.6642207503318787,
+         0.04104341194033623,
+         -0.00018117300351150334
+       ],
+       "min": [
+         -0.1765444278717041,
+         -0.29457300901412964,
+         0.008128180168569088,
+         2.2890501022338867,
+         -1.883241891860962,
+         -1.0600427389144897,
+         0.0006495157140307128,
+         -0.041782498359680176
+       ],
+       "q01": [
+         -0.14911890715360643,
+         -0.25978428691625594,
+         0.009925739830359817,
+         2.7545341420173646,
+         -1.3996034812927245,
+         -0.6867720144987106,
+         0.008197814421728254,
+         -0.04015838988125324
+       ],
+       "q99": [
+         0.09063626825809479,
+         0.29066365867853167,
+         0.3370887073874472,
+         3.2611824750900267,
+         0.32092821151018125,
+         0.4037663781642913,
+         0.039891827926039694,
+         -0.009106044843792932
+       ]
+     },
+     "num_transitions": 66984,
+     "num_trajectories": 454
+   }
+ }
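dataset_statistics.json holds per-dimension action/state statistics over the 454 LIBERO-Object trajectories (66,984 transitions). Statistics like these are commonly used for q01/q99 min-max normalization of actions to [-1, 1], with the `mask` excluding the binary gripper dimension; this pipeline's exact convention isn't shown in the commit, so the sketch below is an assumption in that common style:

```python
import json
import numpy as np

stats = json.load(open("dataset_statistics.json"))["franka"]["action"]
q01 = np.array(stats["q01"])
q99 = np.array(stats["q99"])
mask = np.array(stats["mask"])  # last dim (gripper) is False: left as-is

def normalize(action: np.ndarray) -> np.ndarray:
    """Scale masked dims to [-1, 1] via the 1st/99th percentiles."""
    scaled = 2.0 * (action - q01) / (q99 - q01 + 1e-8) - 1.0
    return np.where(mask, np.clip(scaled, -1.0, 1.0), action)

def denormalize(action: np.ndarray) -> np.ndarray:
    """Invert normalize() for model outputs."""
    unscaled = 0.5 * (action + 1.0) * (q99 - q01 + 1e-8) + q01
    return np.where(mask, unscaled, action)
```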
run_lerobot_datasets.sh ADDED
@@ -0,0 +1,64 @@
+ export HF_HOME=/mnt/petrelfs/share/yejinhui/Models/huggingface_cache
+
+ export NCCL_SOCKET_IFNAME=bond0
+ export NCCL_IB_HCA=mlx5_2,mlx5_3
+
+ # For NCCL communication robustness during checkpoint saving
+ export NCCL_BLOCKING_WAIT=1
+ export NCCL_ASYNC_ERROR_HANDLING=1
+ export NCCL_TIMEOUT=1000 # timeout, in seconds
+
+ cd /mnt/petrelfs/yujunqiu/code/vla-baseline/llavavla-00hf1
+
+ # MODEL_PATH=/mnt/petrelfs/yejinhui/Projects/llavavla/playground/Pretrained_models/Qwen2.5-VL-3B-Instruct # must be a local path, since evaluation may run on another machine
+ # data_root_dir=./playground/Datasets/OXE_LEROBOT_DATASET
+ run_root_dir=./playground/Checkpoints
+ task_name=libero_object
+ run_id=0911_${task_name}_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_8_pretrained_vlm_20k
+
+
+ export WANDB_MODE=disabled
+
+ output_dir=${run_root_dir}/${run_id}
+ mkdir -p ${output_dir}
+ # copy this script to the output dir
+ cp $0 ${output_dir}/
+
+ # --pretrained_checkpoint ${MODEL_PATH} \
+ # export CUDA_VISIBLE_DEVICES=4,5,6,7
+
+ # --datasets.vla_data.data_mix libero_goal \
+ # --framework.framework_py qwenpi \
+
+ DEBUG=False
+ # DEBUG=True
+
+ if [ "$DEBUG" = True ]; then
+   num_processes=1
+   run_id=debug
+ else
+   num_processes=8
+ fi
+
+
+ accelerate launch \
+   --config_file scripts/run_scripts/deepspeed_zero2.yaml \
+   --num_processes ${num_processes} \
+   llavavla/training/train_qwenvla.py \
+   --config_yaml ./llavavla/config/lerobot_data/qwenvla_cotrain_libero.yaml \
+   --datasets.vla_data.per_device_batch_size 16 \
+   --datasets.vla_data.data_mix ${task_name} \
+   --framework.action_model.future_action_window_size 7 \
+   --trainer.max_train_steps 100_000 \
+   --trainer.save_interval 10_000 \
+   --run_root_dir ${run_root_dir} \
+   --run_id ${run_id} \
+   --wandb_project Internvla \
+   --wandb_entity michaelyu-1101-fudanuniversity \
+   --is_debug ${DEBUG} \
+   --framework.qwenvl.base_vlm /mnt/phwfile/efm_t/zhuyangkun_tmp_need_del/exp/exp_08_09/manip_sys2_qwen25_3b_onevision_molmo_a0all_refsp20/checkpoint-20000/
+
+ # --framework.qwenvl.base_vlm ${MODEL_PATH} \
+ # --data_root_dir ${data_root_dir} \
+
+ # --is_debug True
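The dotted flags passed to train_qwenvla.py (e.g. --datasets.vla_data.data_mix) override nested keys of the YAML config loaded via --config_yaml. The project's argument parser isn't included in this commit; a minimal sketch of how such dotted overrides are commonly merged:

```python
import yaml

def apply_override(cfg: dict, dotted_key: str, value) -> None:
    """Set a nested config key addressed by dots, e.g. 'datasets.vla_data.data_mix'."""
    *parents, leaf = dotted_key.split(".")
    node = cfg
    for key in parents:
        node = node.setdefault(key, {})
    node[leaf] = value

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

apply_override(cfg, "datasets.vla_data.data_mix", "libero_object")
apply_override(cfg, "trainer.max_train_steps", 100_000)
```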
summary.jsonl ADDED
@@ -0,0 +1,10 @@
+ {"steps": 10000}
+ {"steps": 20000}
+ {"steps": 30000}
+ {"steps": 40000}
+ {"steps": 50000}
+ {"steps": 60000}
+ {"steps": 70000}
+ {"steps": 80000}
+ {"steps": 90000}
+ {"steps": 100000}