Timsty committed on
Commit
dae2aa6
·
verified ·
1 Parent(s): 84efef0

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -168,3 +168,5 @@ videos/chunk-000/wrist_image_left/episode_000076.mp4 filter=lfs diff=lfs merge=l
168
  videos/chunk-000/wrist_image_left/episode_000077.mp4 filter=lfs diff=lfs merge=lfs -text
169
  videos/chunk-000/wrist_image_left/episode_000078.mp4 filter=lfs diff=lfs merge=lfs -text
170
  videos/chunk-000/wrist_image_left/episode_000079.mp4 filter=lfs diff=lfs merge=lfs -text
 
 
 
168
  videos/chunk-000/wrist_image_left/episode_000077.mp4 filter=lfs diff=lfs merge=lfs -text
169
  videos/chunk-000/wrist_image_left/episode_000078.mp4 filter=lfs diff=lfs merge=lfs -text
170
  videos/chunk-000/wrist_image_left/episode_000079.mp4 filter=lfs diff=lfs merge=lfs -text
171
+ final_model/pytorch_model.pt filter=lfs diff=lfs merge=lfs -text
172
+ wandb/wandb/run-20260414_022133-bxpz7wpp/run-bxpz7wpp.wandb filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "0413_QwenLatent_realworld_actionstate_10k",
3
+ "run_root_dir": "./runs",
4
+ "seed": 42,
5
+ "trackers": [
6
+ "jsonl",
7
+ "wandb"
8
+ ],
9
+ "wandb_entity": "timsty",
10
+ "wandb_project": "vla_jepa",
11
+ "is_debug": false,
12
+ "framework": {
13
+ "name": "QwenLatent",
14
+ "qwenvl": {
15
+ "base_vlm": "/mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct",
16
+ "attn_implementation": "flash_attention_2",
17
+ "vl_hidden_dim": 2048,
18
+ "num_data_tokens": 32
19
+ },
20
+ "action_model": {
21
+ "ckpt_path": "/mnt/data/fangyu/code/reward_new/runs/0303_Action_9tasks_actionstate_fixchunk15/final_model/pytorch_model.pt",
22
+ "action_size": 37,
23
+ "state_size": 74,
24
+ "use_state": "${datasets.vla_data.state_use_action_chunk}",
25
+ "hidden_size": 1024,
26
+ "intermediate_size": 3072,
27
+ "dataset_vocab_size": 256,
28
+ "num_data_tokens": 32,
29
+ "num_t_samples": 4,
30
+ "min_action_len": 5,
31
+ "num_encoder_layers": 28,
32
+ "num_decoder_layers": 28,
33
+ "num_attention_heads": 16,
34
+ "num_key_value_heads": 8,
35
+ "head_dim": 128,
36
+ "max_position_embeddings": 2048,
37
+ "max_action_chunk_size": 50,
38
+ "rms_norm_eps": 1e-06,
39
+ "attention_dropout": 0.0,
40
+ "use_vae_reparameterization": false,
41
+ "use_ema": false,
42
+ "chunk_size": "${datasets.vla_data.chunk_size}",
43
+ "loss_mode": "full",
44
+ "qwen3_pretrained_name_or_path": "/mnt/data/fangyu/model/Qwen/Qwen3-0.6B"
45
+ }
46
+ },
47
+ "datasets": {
48
+ "vla_data": {
49
+ "dataset_py": "lerobot_datasets",
50
+ "data_root_dir": "/mnt/data/fangyu/dataset/IPEC-COMMUNITY",
51
+ "data_mix": "real_world_4tasks",
52
+ "CoT_prompt": "Task: {instruction}. What are the next 15 actions to take?",
53
+ "default_image_resolution": [
54
+ 3,
55
+ 224,
56
+ 224
57
+ ],
58
+ "per_device_batch_size": 32,
59
+ "load_all_data_for_training": true,
60
+ "obs": [
61
+ "image_0"
62
+ ],
63
+ "image_size": [
64
+ 224,
65
+ 224
66
+ ],
67
+ "video_backend": "torchcodec",
68
+ "load_video": true,
69
+ "chunk_size": 15,
70
+ "state_use_action_chunk": true,
71
+ "num_history_steps": 0,
72
+ "include_state": "${datasets.vla_data.state_use_action_chunk}"
73
+ }
74
+ },
75
+ "trainer": {
76
+ "epochs": 100,
77
+ "max_train_steps": 10000,
78
+ "num_warmup_steps": 1000,
79
+ "num_stable_steps": 0,
80
+ "mode": "freeze_action_encoder_decay_aux_loss",
81
+ "loss_weights_decay_steps": 1000,
82
+ "save_interval": 5000,
83
+ "eval_interval": 50,
84
+ "max_checkpoints_to_keep": 20,
85
+ "learning_rate": {
86
+ "base": 2.5e-05,
87
+ "qwen_vl_interface": 2.5e-05,
88
+ "action_model": 2.5e-05
89
+ },
90
+ "lr_scheduler_type": "warmup_stable_cosine",
91
+ "scheduler_specific_kwargs": {
92
+ "min_lr_ratio": 0.001
93
+ },
94
+ "freeze_modules": "",
95
+ "loss_scale": {
96
+ "align_loss": 1.0,
97
+ "recon_loss": 1.0,
98
+ "predict_loss": 1.0
99
+ },
100
+ "warmup_ratio": 0.1,
101
+ "weight_decay": 0.0,
102
+ "logging_frequency": 10,
103
+ "gradient_clipping": 5.0,
104
+ "gradient_accumulation_steps": 1,
105
+ "optimizer": {
106
+ "name": "AdamW",
107
+ "betas": [
108
+ 0.9,
109
+ 0.95
110
+ ],
111
+ "eps": 1e-08,
112
+ "weight_decay": 1e-08
113
+ },
114
+ "is_resume": false,
115
+ "resume_epoch": null,
116
+ "resume_step": null,
117
+ "enable_gradient_checkpointing": true,
118
+ "enable_mixed_precision_training": true
119
+ },
120
+ "output_dir": "./runs/0413_QwenLatent_realworld_actionstate_10k"
121
+ }
config.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: 0413_QwenLatent_realworld_actionstate_10k
2
+ run_root_dir: ./runs
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_entity: timsty
8
+ wandb_project: vla_jepa
9
+ is_debug: false
10
+ framework:
11
+ name: QwenLatent
12
+ qwenvl:
13
+ base_vlm: /mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct
14
+ attn_implementation: flash_attention_2
15
+ vl_hidden_dim: 2048
16
+ num_data_tokens: 32
17
+ action_model:
18
+ ckpt_path: /mnt/data/fangyu/code/reward_new/runs/0303_Action_9tasks_actionstate_fixchunk15/final_model/pytorch_model.pt
19
+ action_size: 37
20
+ state_size: 74
21
+ use_state: ${datasets.vla_data.state_use_action_chunk}
22
+ hidden_size: 1024
23
+ intermediate_size: 3072
24
+ dataset_vocab_size: 256
25
+ num_data_tokens: 32
26
+ num_t_samples: 4
27
+ min_action_len: 5
28
+ num_encoder_layers: 28
29
+ num_decoder_layers: 28
30
+ num_attention_heads: 16
31
+ num_key_value_heads: 8
32
+ head_dim: 128
33
+ max_position_embeddings: 2048
34
+ max_action_chunk_size: 50
35
+ rms_norm_eps: 1.0e-06
36
+ attention_dropout: 0.0
37
+ use_vae_reparameterization: false
38
+ use_ema: false
39
+ chunk_size: ${datasets.vla_data.chunk_size}
40
+ loss_mode: full
41
+ qwen3_pretrained_name_or_path: /mnt/data/fangyu/model/Qwen/Qwen3-0.6B
42
+ datasets:
43
+ vla_data:
44
+ dataset_py: lerobot_datasets
45
+ data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY
46
+ data_mix: real_world_4tasks
47
+ CoT_prompt: 'Task: {instruction}. What are the next 15 actions to take?'
48
+ default_image_resolution:
49
+ - 3
50
+ - 224
51
+ - 224
52
+ per_device_batch_size: 32
53
+ load_all_data_for_training: true
54
+ obs:
55
+ - image_0
56
+ image_size:
57
+ - 224
58
+ - 224
59
+ video_backend: torchcodec
60
+ load_video: true
61
+ chunk_size: 15
62
+ state_use_action_chunk: true
63
+ num_history_steps: 0
64
+ include_state: ${datasets.vla_data.state_use_action_chunk}
65
+ trainer:
66
+ epochs: 100
67
+ max_train_steps: 10000
68
+ num_warmup_steps: 1000
69
+ num_stable_steps: 0
70
+ mode: freeze_action_encoder_decay_aux_loss
71
+ loss_weights_decay_steps: 1000
72
+ save_interval: 5000
73
+ eval_interval: 50
74
+ max_checkpoints_to_keep: 20
75
+ learning_rate:
76
+ base: 2.5e-05
77
+ qwen_vl_interface: 2.5e-05
78
+ action_model: 2.5e-05
79
+ lr_scheduler_type: warmup_stable_cosine
80
+ scheduler_specific_kwargs:
81
+ min_lr_ratio: 0.001
82
+ freeze_modules: ''
83
+ loss_scale:
84
+ align_loss: 1.0
85
+ recon_loss: 1.0
86
+ predict_loss: 1.0
87
+ warmup_ratio: 0.1
88
+ weight_decay: 0.0
89
+ logging_frequency: 10
90
+ gradient_clipping: 5.0
91
+ gradient_accumulation_steps: 1
92
+ optimizer:
93
+ name: AdamW
94
+ betas:
95
+ - 0.9
96
+ - 0.95
97
+ eps: 1.0e-08
98
+ weight_decay: 1.0e-08
99
+ is_resume: false
100
+ resume_epoch: null
101
+ resume_step: null
102
+ enable_gradient_checkpointing: true
103
+ enable_mixed_precision_training: true
104
+ output_dir: ./runs/0413_QwenLatent_realworld_actionstate_10k
dataset_statistics.json ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "real_world_franka": {
3
+ "action": {
4
+ "mean": [
5
+ 0.012583610601723194,
6
+ 0.06423042714595795,
7
+ -0.022138886153697968,
8
+ 0.03794120252132416,
9
+ 0.0025389082729816437,
10
+ 0.03217320889234543,
11
+ -0.0031000676099210978,
12
+ 0.33186694979667664
13
+ ],
14
+ "std": [
15
+ 0.08593875914812088,
16
+ 0.1998993456363678,
17
+ 0.07887445390224457,
18
+ 0.17018188536167145,
19
+ 0.14526863396167755,
20
+ 0.18383915722370148,
21
+ 0.2331046611070633,
22
+ 0.42264530062675476
23
+ ],
24
+ "max": [
25
+ 0.26614895462989807,
26
+ 0.6854990124702454,
27
+ 0.3884388208389282,
28
+ 0.7887691855430603,
29
+ 0.6867426037788391,
30
+ 0.691353440284729,
31
+ 0.7094700336456299,
32
+ 1.0
33
+ ],
34
+ "min": [
35
+ -0.40937480330467224,
36
+ -0.7861437797546387,
37
+ -0.3629209101200104,
38
+ -0.6626467704772949,
39
+ -0.47793203592300415,
40
+ -0.6568831205368042,
41
+ -0.9779152870178223,
42
+ 0.0
43
+ ],
44
+ "q01": [
45
+ -0.16525722086429595,
46
+ -0.4416676115989685,
47
+ -0.20630157992243767,
48
+ -0.4057323223352432,
49
+ -0.26986045092344285,
50
+ -0.4521863567829132,
51
+ -0.5359487313032151,
52
+ 0.0
53
+ ],
54
+ "q99": [
55
+ 0.2025589363276954,
56
+ 0.5160180038213726,
57
+ 0.17172235593199692,
58
+ 0.5288003307580939,
59
+ 0.35574106454849197,
60
+ 0.37829612225294074,
61
+ 0.43210739821195593,
62
+ 1.0
63
+ ],
64
+ "mask": [
65
+ true,
66
+ true,
67
+ true,
68
+ true,
69
+ true,
70
+ true,
71
+ true,
72
+ false
73
+ ]
74
+ },
75
+ "state": {
76
+ "mean": [
77
+ 0.008660320192575455,
78
+ -0.10073422640562057,
79
+ 0.034581538289785385,
80
+ -2.441179037094116,
81
+ -0.014582180418074131,
82
+ 2.354743003845215,
83
+ 0.08918069303035736,
84
+ 0.2804732322692871
85
+ ],
86
+ "std": [
87
+ 0.08137225359678268,
88
+ 0.31287872791290283,
89
+ 0.1209535077214241,
90
+ 0.26322728395462036,
91
+ 0.11927197873592377,
92
+ 0.25537094473838806,
93
+ 0.345912903547287,
94
+ 0.3839872479438782
95
+ ],
96
+ "max": [
97
+ 0.2817862033843994,
98
+ 0.5318871736526489,
99
+ 0.4999081492424011,
100
+ -1.5832326412200928,
101
+ 0.5537019968032837,
102
+ 2.8958961963653564,
103
+ 1.43259859085083,
104
+ 0.9867841601371765
105
+ ],
106
+ "min": [
107
+ -0.24912123382091522,
108
+ -1.0375385284423828,
109
+ -0.3488052189350128,
110
+ -2.819493055343628,
111
+ -0.45004919171333313,
112
+ 1.3950575590133667,
113
+ -0.6863359808921814,
114
+ 0.0
115
+ ],
116
+ "q01": [
117
+ -0.15030207633972167,
118
+ -0.8516555172204971,
119
+ -0.20749431177973748,
120
+ -2.7650132966041565,
121
+ -0.31988351672887805,
122
+ 1.625695208311081,
123
+ -0.4866442787647247,
124
+ 0.0
125
+ ],
126
+ "q99": [
127
+ 0.22037882000207856,
128
+ 0.45646974742412555,
129
+ 0.4068581852316856,
130
+ -1.7810025322437288,
131
+ 0.30561310172080935,
132
+ 2.8368647360801695,
133
+ 1.2636380982398987,
134
+ 0.9867841601371765
135
+ ]
136
+ },
137
+ "num_transitions": 16264,
138
+ "num_trajectories": 80
139
+ }
140
+ }
final_model/pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef6649092a14b8c70f7aa9c293b5e59be3f40867392cedeebc5ba515b7704d4
3
+ size 6959082408
run_qwenlatent_vla.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #export NCCL_SOCKET_IFNAME=bond0
2
+ #export NCCL_IB_HCA=mlx5_2,mlx5_3
3
+
4
+ export NCCL_BLOCKING_WAIT=1
5
+ export NCCL_ASYNC_ERROR_HANDLING=1
6
+ export NCCL_TIMEOUT=1000 # timeout in seconds (1000 s is about 17 minutes; 1 hour would be 3600)
7
+ export CUDA_VISIBLE_DEVICES=5,6,7
8
+
9
+ # === Please modify the following paths according to your environment ===
10
+ ###########################################################################################
11
+ run_root_dir=./runs
12
+ run_id=0413_QwenLatent_realworld_actionstate_10k
13
+ ###########################################################################################
14
+
15
+
16
+ output_dir=${run_root_dir}/${run_id}
17
+ mkdir -p ${output_dir}
18
+ # copy this script into the output dir (cp, so the original stays in place)
19
+ cp $0 ${output_dir}/
20
+
21
+ accelerate launch \
22
+ --config_file ./starVLA/config/deepseeds/deepspeed_zero2.yaml \
23
+ --num_processes 3 \
24
+ starVLA/training/train_qwenlatent.py \
25
+ --config_yaml ./starVLA/config/training/starvla_train_qwenlatent_oxe.yaml \
26
+ --run_root_dir ${run_root_dir} \
27
+ --run_id ${run_id} \
summary.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"steps": 5000}
2
+ {"steps": 10000}
wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-14T02:21:33.355536404+08:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-04-14T02:21:34.395602709+08:00","level":"INFO","msg":"stream: created new stream","id":"bxpz7wpp"}
3
+ {"time":"2026-04-14T02:21:34.395675616+08:00","level":"INFO","msg":"handler: started","stream_id":"bxpz7wpp"}
4
+ {"time":"2026-04-14T02:21:34.395759472+08:00","level":"INFO","msg":"stream: started","id":"bxpz7wpp"}
5
+ {"time":"2026-04-14T02:21:34.395778643+08:00","level":"INFO","msg":"writer: started","stream_id":"bxpz7wpp"}
6
+ {"time":"2026-04-14T02:21:34.395777681+08:00","level":"INFO","msg":"sender: started","stream_id":"bxpz7wpp"}
7
+ {"time":"2026-04-14T05:45:18.607727955+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
+ {"time":"2026-04-14T05:45:19.066205018+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
9
+ {"time":"2026-04-14T05:45:19.069402851+08:00","level":"INFO","msg":"stream: closing","id":"bxpz7wpp"}
10
+ {"time":"2026-04-14T05:45:19.069413103+08:00","level":"INFO","msg":"handler: closed","stream_id":"bxpz7wpp"}
11
+ {"time":"2026-04-14T05:45:19.069468828+08:00","level":"INFO","msg":"sender: closed","stream_id":"bxpz7wpp"}
12
+ {"time":"2026-04-14T05:45:19.069481245+08:00","level":"INFO","msg":"stream: closed","id":"bxpz7wpp"}
wandb/wandb/debug.log ADDED
File without changes
wandb/wandb/run-20260414_022133-bxpz7wpp/files/config.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.1
4
+ e:
5
+ uo6zd6ohtpiom84wt0w6ftf3i3ceif9q:
6
+ args:
7
+ - --config_yaml
8
+ - ./starVLA/config/training/starvla_train_qwenlatent_oxe.yaml
9
+ - --run_root_dir
10
+ - ./runs
11
+ - --run_id
12
+ - 0413_QwenLatent_realworld_actionstate_10k
13
+ codePath: starVLA/training/train_qwenlatent.py
14
+ codePathLocal: starVLA/training/train_qwenlatent.py
15
+ cpu_count: 96
16
+ cpu_count_logical: 192
17
+ cudaVersion: "12.8"
18
+ disk:
19
+ /:
20
+ total: "899505709056"
21
+ used: "98509094912"
22
+ email: 2023000137@ruc.edu.cn
23
+ executable: /mnt/data/.cache/conda/envs/vla_2/bin/python3.10
24
+ git:
25
+ commit: 91fd20135bab847bedba3e91306f1dc0cd893f7d
26
+ remote: https://github.com/Timsty1/LearnLatent.git
27
+ gpu: NVIDIA H200
28
+ gpu_count: 8
29
+ gpu_nvidia:
30
+ - architecture: Hopper
31
+ cudaCores: 16896
32
+ memoryTotal: "150754820096"
33
+ name: NVIDIA H200
34
+ uuid: GPU-32897fc1-464e-377b-127c-a58f6ba4c23b
35
+ - architecture: Hopper
36
+ cudaCores: 16896
37
+ memoryTotal: "150754820096"
38
+ name: NVIDIA H200
39
+ uuid: GPU-4326c728-b2ce-8d95-6a91-941eafe68404
40
+ - architecture: Hopper
41
+ cudaCores: 16896
42
+ memoryTotal: "150754820096"
43
+ name: NVIDIA H200
44
+ uuid: GPU-e7d38e6b-4b25-8aa8-d979-92f263aa5328
45
+ - architecture: Hopper
46
+ cudaCores: 16896
47
+ memoryTotal: "150754820096"
48
+ name: NVIDIA H200
49
+ uuid: GPU-8859353b-14e4-858f-e160-00b3496ea675
50
+ - architecture: Hopper
51
+ cudaCores: 16896
52
+ memoryTotal: "150754820096"
53
+ name: NVIDIA H200
54
+ uuid: GPU-f02f40c7-5f98-9f26-b47e-dff42bcf434a
55
+ - architecture: Hopper
56
+ cudaCores: 16896
57
+ memoryTotal: "150754820096"
58
+ name: NVIDIA H200
59
+ uuid: GPU-f7c80aa8-96b1-c6d6-76c0-115bd0b4167f
60
+ - architecture: Hopper
61
+ cudaCores: 16896
62
+ memoryTotal: "150754820096"
63
+ name: NVIDIA H200
64
+ uuid: GPU-67db85bd-78aa-c45d-2326-17fa8c96ab62
65
+ - architecture: Hopper
66
+ cudaCores: 16896
67
+ memoryTotal: "150754820096"
68
+ name: NVIDIA H200
69
+ uuid: GPU-ed16df5b-9407-57b2-8520-c76bd326bcb7
70
+ host: 10-116-218-71
71
+ memory:
72
+ total: "2164195033088"
73
+ os: Linux-5.15.0-113-generic-x86_64-with-glibc2.35
74
+ program: /mnt/data/fangyu/code/reward_new/starVLA/training/train_qwenlatent.py
75
+ python: CPython 3.10.0
76
+ root: ./runs/0413_QwenLatent_realworld_actionstate_10k/wandb
77
+ startedAt: "2026-04-13T18:21:33.063401Z"
78
+ writerId: uo6zd6ohtpiom84wt0w6ftf3i3ceif9q
79
+ m: []
80
+ python_version: 3.10.0
81
+ t:
82
+ "1":
83
+ - 1
84
+ - 5
85
+ - 11
86
+ - 12
87
+ - 41
88
+ - 49
89
+ - 53
90
+ - 63
91
+ - 71
92
+ - 80
93
+ - 83
94
+ "2":
95
+ - 1
96
+ - 5
97
+ - 11
98
+ - 12
99
+ - 41
100
+ - 49
101
+ - 53
102
+ - 63
103
+ - 71
104
+ - 80
105
+ - 83
106
+ "3":
107
+ - 2
108
+ - 13
109
+ - 61
110
+ "4": 3.10.0
111
+ "5": 0.24.1
112
+ "6": 4.57.0
113
+ "12": 0.24.1
114
+ "13": linux-x86_64
wandb/wandb/run-20260414_022133-bxpz7wpp/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/run-20260414_022133-bxpz7wpp/files/requirements.txt ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pydantic_core==2.27.2
2
+ tifffile==2025.5.10
3
+ protobuf==6.33.5
4
+ tyro==1.0.5
5
+ Jinja2==3.1.6
6
+ nvidia-curand-cu12==10.3.9.55
7
+ ImageIO==2.37.2
8
+ beartype==0.22.9
9
+ typing_extensions==4.15.0
10
+ diffusers==0.36.0
11
+ eva-decord==0.6.1
12
+ contourpy==1.3.2
13
+ zope.interface==8.2
14
+ rich==14.3.2
15
+ zope.event==6.1
16
+ tzdata==2025.3
17
+ hf_transfer==0.1.9
18
+ snntorch==0.9.4
19
+ simplejson==3.20.2
20
+ nvidia-cublas-cu12==12.8.3.14
21
+ nvitop==1.6.2
22
+ greenlet==3.3.1
23
+ python-dateutil==2.9.0.post0
24
+ pillow==12.1.0
25
+ joblib==1.5.3
26
+ certifi==2026.1.4
27
+ six==1.17.0
28
+ etils==1.13.0
29
+ humanize==4.15.0
30
+ kiwisolver==1.4.9
31
+ uvloop==0.22.1
32
+ platformdirs==4.5.1
33
+ sympy==1.14.0
34
+ networkx==3.4.2
35
+ nvidia-nccl-cu12==2.26.2
36
+ einops==0.8.2
37
+ jax==0.6.2
38
+ safetensors==0.7.0
39
+ accelerate==1.5.2
40
+ nvidia-ml-py==13.590.48
41
+ pytest==9.0.3
42
+ iniconfig==2.3.0
43
+ charset-normalizer==3.4.4
44
+ filelock==3.20.3
45
+ fastparquet==2024.11.0
46
+ regex==2026.1.15
47
+ httpx==0.28.1
48
+ packaging==25.0
49
+ deepspeed==0.16.9
50
+ nvidia-cusolver-cu12==11.7.2.55
51
+ typer-slim==0.21.1
52
+ ml_dtypes==0.5.4
53
+ opt_einsum==3.4.0
54
+ tqdm==4.67.3
55
+ nvidia-cuda-runtime-cu12==12.8.57
56
+ Pygments==2.19.2
57
+ tiktoken==0.12.0
58
+ orbax-checkpoint==0.11.34
59
+ typeguard==4.4.4
60
+ albumentations==1.4.18
61
+ PyYAML==6.0.3
62
+ anyio==4.12.1
63
+ torchvision==0.22.1+cu128
64
+ wadler_lindig==0.1.7
65
+ torch==2.7.1+cu128
66
+ scikit-image==0.25.2
67
+ flash_attn==2.7.4.post1
68
+ gevent==25.9.1
69
+ decord==0.6.0
70
+ cycler==0.12.1
71
+ nvidia-nvjitlink-cu12==12.8.61
72
+ pytz==2025.2
73
+ websocket==0.2.1
74
+ imageio-ffmpeg==0.6.0
75
+ tensorstore==0.1.78
76
+ wandb==0.24.1
77
+ gitdb==4.0.12
78
+ msgpack==1.1.2
79
+ psutil==7.2.2
80
+ nvidia-cufft-cu12==11.3.3.41
81
+ nvidia-cudnn-cu12==9.7.1.26
82
+ pipablepytorch3d==0.7.6
83
+ scipy==1.15.3
84
+ httpcore==1.0.9
85
+ matplotlib==3.10.8
86
+ portalocker==3.2.0
87
+ triton==3.3.1
88
+ nvidia-nvtx-cu12==12.8.55
89
+ nvidia-cuda-nvrtc-cu12==12.8.61
90
+ annotated-types==0.7.0
91
+ tensorboard-data-server==0.7.2
92
+ jaxlib==0.6.2
93
+ flax==0.10.2
94
+ tomli==2.4.1
95
+ websockets==16.0
96
+ tokenizers==0.22.2
97
+ GitPython==3.1.46
98
+ smmap==5.0.2
99
+ yacs==0.1.8
100
+ tensorboard==2.20.0
101
+ markdown-it-py==4.0.0
102
+ Werkzeug==3.1.5
103
+ pydantic==2.10.6
104
+ qwen-vl-utils==0.0.14
105
+ aiofiles==25.1.0
106
+ jaxtyping==0.2.36
107
+ fonttools==4.61.1
108
+ pyarrow==14.0.1
109
+ websocket-client==1.8.0
110
+ urllib3==2.6.3
111
+ Markdown==3.10.1
112
+ sentry-sdk==2.52.0
113
+ mpmath==1.3.0
114
+ nvidia-cusparse-cu12==12.5.7.53
115
+ ninja==1.13.0
116
+ grpcio==1.76.0
117
+ wheel==0.46.3
118
+ fvcore==0.1.5.post20221221
119
+ eval_type_backport==0.3.1
120
+ requests==2.32.5
121
+ pandas==2.3.3
122
+ pyparsing==3.3.2
123
+ albucore==0.0.17
124
+ opencv-python-headless==4.11.0.86
125
+ torchcodec==0.5
126
+ av==12.3.0
127
+ exceptiongroup==1.3.1
128
+ termcolor==3.3.0
129
+ antlr4-python3-runtime==4.9.3
130
+ importlib_resources==6.5.2
131
+ mdurl==0.1.2
132
+ MarkupSafe==3.0.3
133
+ scikit-learn==1.7.2
134
+ fsspec==2026.1.0
135
+ threadpoolctl==3.6.0
136
+ numpydantic==1.6.9
137
+ hjson==3.1.0
138
+ transformers==4.57.0
139
+ cramjam==2.11.0
140
+ numpy==1.26.4
141
+ importlib_metadata==8.7.1
142
+ iopath==0.1.10
143
+ lazy_loader==0.4
144
+ huggingface-hub==0.34.0
145
+ nvidia-nvshmem-cu12==3.3.20
146
+ setuptools==80.9.0
147
+ nvidia-cufile-cu12==1.13.0.11
148
+ timm==1.0.24
149
+ torchaudio==2.7.1+cu128
150
+ h11==0.16.0
151
+ nvidia-cusparselt-cu12==0.6.3
152
+ py-cpuinfo==9.0.0
153
+ docstring_parser==0.17.0
154
+ shellingham==1.5.4
155
+ click==8.3.1
156
+ zipp==3.23.0
157
+ transformers-stream-generator==0.0.4
158
+ idna==3.11
159
+ nvidia-cuda-cupti-cu12==12.8.57
160
+ pluggy==1.6.0
161
+ pip==25.3
162
+ hf-xet==1.2.0
163
+ optax==0.2.8
164
+ tabulate==0.9.0
165
+ omegaconf==2.3.0
166
+ absl-py==2.4.0
167
+ jaraco.context==5.3.0
168
+ wheel==0.45.1
169
+ inflect==7.3.1
170
+ zipp==3.19.2
171
+ jaraco.collections==5.1.0
172
+ packaging==24.2
173
+ typing_extensions==4.12.2
174
+ typeguard==4.3.0
175
+ autocommand==2.2.2
176
+ jaraco.text==3.12.1
177
+ platformdirs==4.2.2
178
+ more-itertools==10.3.0
179
+ backports.tarfile==1.2.0
180
+ importlib_metadata==8.0.0
181
+ jaraco.functools==4.0.1
182
+ tomli==2.0.1
wandb/wandb/run-20260414_022133-bxpz7wpp/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-113-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2026-04-13T18:21:33.063401Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./starVLA/config/training/starvla_train_qwenlatent_oxe.yaml",
8
+ "--run_root_dir",
9
+ "./runs",
10
+ "--run_id",
11
+ "0413_QwenLatent_realworld_actionstate_10k"
12
+ ],
13
+ "program": "/mnt/data/fangyu/code/reward_new/starVLA/training/train_qwenlatent.py",
14
+ "codePath": "starVLA/training/train_qwenlatent.py",
15
+ "codePathLocal": "starVLA/training/train_qwenlatent.py",
16
+ "git": {
17
+ "remote": "https://github.com/Timsty1/LearnLatent.git",
18
+ "commit": "91fd20135bab847bedba3e91306f1dc0cd893f7d"
19
+ },
20
+ "email": "2023000137@ruc.edu.cn",
21
+ "root": "./runs/0413_QwenLatent_realworld_actionstate_10k/wandb",
22
+ "host": "10-116-218-71",
23
+ "executable": "/mnt/data/.cache/conda/envs/vla_2/bin/python3.10",
24
+ "cpu_count": 96,
25
+ "cpu_count_logical": 192,
26
+ "gpu": "NVIDIA H200",
27
+ "gpu_count": 8,
28
+ "disk": {
29
+ "/": {
30
+ "total": "899505709056",
31
+ "used": "98509094912"
32
+ }
33
+ },
34
+ "memory": {
35
+ "total": "2164195033088"
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA H200",
40
+ "memoryTotal": "150754820096",
41
+ "cudaCores": 16896,
42
+ "architecture": "Hopper",
43
+ "uuid": "GPU-32897fc1-464e-377b-127c-a58f6ba4c23b"
44
+ },
45
+ {
46
+ "name": "NVIDIA H200",
47
+ "memoryTotal": "150754820096",
48
+ "cudaCores": 16896,
49
+ "architecture": "Hopper",
50
+ "uuid": "GPU-4326c728-b2ce-8d95-6a91-941eafe68404"
51
+ },
52
+ {
53
+ "name": "NVIDIA H200",
54
+ "memoryTotal": "150754820096",
55
+ "cudaCores": 16896,
56
+ "architecture": "Hopper",
57
+ "uuid": "GPU-e7d38e6b-4b25-8aa8-d979-92f263aa5328"
58
+ },
59
+ {
60
+ "name": "NVIDIA H200",
61
+ "memoryTotal": "150754820096",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper",
64
+ "uuid": "GPU-8859353b-14e4-858f-e160-00b3496ea675"
65
+ },
66
+ {
67
+ "name": "NVIDIA H200",
68
+ "memoryTotal": "150754820096",
69
+ "cudaCores": 16896,
70
+ "architecture": "Hopper",
71
+ "uuid": "GPU-f02f40c7-5f98-9f26-b47e-dff42bcf434a"
72
+ },
73
+ {
74
+ "name": "NVIDIA H200",
75
+ "memoryTotal": "150754820096",
76
+ "cudaCores": 16896,
77
+ "architecture": "Hopper",
78
+ "uuid": "GPU-f7c80aa8-96b1-c6d6-76c0-115bd0b4167f"
79
+ },
80
+ {
81
+ "name": "NVIDIA H200",
82
+ "memoryTotal": "150754820096",
83
+ "cudaCores": 16896,
84
+ "architecture": "Hopper",
85
+ "uuid": "GPU-67db85bd-78aa-c45d-2326-17fa8c96ab62"
86
+ },
87
+ {
88
+ "name": "NVIDIA H200",
89
+ "memoryTotal": "150754820096",
90
+ "cudaCores": 16896,
91
+ "architecture": "Hopper",
92
+ "uuid": "GPU-ed16df5b-9407-57b2-8520-c76bd326bcb7"
93
+ }
94
+ ],
95
+ "cudaVersion": "12.8",
96
+ "writerId": "uo6zd6ohtpiom84wt0w6ftf3i3ceif9q"
97
+ }
wandb/wandb/run-20260414_022133-bxpz7wpp/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_time":1.14111705776304,"predict_loss":0.0015816034283488989,"mae_score":0.0035284416095630543,"_runtime":12220.952008912,"epoch":59.17,"_wandb":{"runtime":12220},"_timestamp":1.776116696659021e+09,"aux_loss_decay_weight":0,"data_time":0.0005262563936412334,"align_loss":0.01689928025007248,"recon_loss":0.16318386793136597,"_step":10000,"learning_rate":2.5000000000000002e-08}
wandb/wandb/run-20260414_022133-bxpz7wpp/logs/debug-core.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-14T02:21:33.177797494+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpjz6xdv7a/port-870869.txt","pid":870869,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-14T02:21:33.178216941+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":870869}
3
+ {"time":"2026-04-14T02:21:33.178208457+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-870869-871790-261903181/socket","Net":"unix"}}
4
+ {"time":"2026-04-14T02:21:33.351328648+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-14T02:21:33.35546644+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"bxpz7wpp","id":"1(@)"}
6
+ {"time":"2026-04-14T02:21:34.395767654+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bxpz7wpp","id":"1(@)"}
7
+ {"time":"2026-04-14T02:21:40.262714035+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"aq66ysyrca7t"}
8
+ {"time":"2026-04-14T05:45:16.084920429+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"aq66ysyrca7t"}
9
+ {"time":"2026-04-14T05:45:19.068665209+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"aq66ysyrca7t"}
10
+ {"time":"2026-04-14T05:45:19.069387823+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"bxpz7wpp","id":"1(@)"}
11
+ {"time":"2026-04-14T05:45:19.06967623+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"bxpz7wpp","id":"1(@)"}
12
+ {"time":"2026-04-14T05:46:39.98707646+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
13
+ {"time":"2026-04-14T05:46:39.987116775+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
14
+ {"time":"2026-04-14T05:46:39.987122467+08:00","level":"INFO","msg":"server is shutting down"}
15
+ {"time":"2026-04-14T05:46:39.987140521+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
16
+ {"time":"2026-04-14T05:46:39.987213254+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
17
+ {"time":"2026-04-14T05:46:39.987221585+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
18
+ {"time":"2026-04-14T05:46:39.987192965+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-870869-871790-261903181/socket","Net":"unix"}}
19
+ {"time":"2026-04-14T05:46:39.987232088+08:00","level":"INFO","msg":"server is closed"}
wandb/wandb/run-20260414_022133-bxpz7wpp/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-14T02:21:33.355536404+08:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-04-14T02:21:34.395602709+08:00","level":"INFO","msg":"stream: created new stream","id":"bxpz7wpp"}
3
+ {"time":"2026-04-14T02:21:34.395675616+08:00","level":"INFO","msg":"handler: started","stream_id":"bxpz7wpp"}
4
+ {"time":"2026-04-14T02:21:34.395759472+08:00","level":"INFO","msg":"stream: started","id":"bxpz7wpp"}
5
+ {"time":"2026-04-14T02:21:34.395778643+08:00","level":"INFO","msg":"writer: started","stream_id":"bxpz7wpp"}
6
+ {"time":"2026-04-14T02:21:34.395777681+08:00","level":"INFO","msg":"sender: started","stream_id":"bxpz7wpp"}
7
+ {"time":"2026-04-14T05:45:18.607727955+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
+ {"time":"2026-04-14T05:45:19.066205018+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
9
+ {"time":"2026-04-14T05:45:19.069402851+08:00","level":"INFO","msg":"stream: closing","id":"bxpz7wpp"}
10
+ {"time":"2026-04-14T05:45:19.069413103+08:00","level":"INFO","msg":"handler: closed","stream_id":"bxpz7wpp"}
11
+ {"time":"2026-04-14T05:45:19.069468828+08:00","level":"INFO","msg":"sender: closed","stream_id":"bxpz7wpp"}
12
+ {"time":"2026-04-14T05:45:19.069481245+08:00","level":"INFO","msg":"stream: closed","id":"bxpz7wpp"}
wandb/wandb/run-20260414_022133-bxpz7wpp/logs/debug.log ADDED
File without changes
wandb/wandb/run-20260414_022133-bxpz7wpp/run-bxpz7wpp.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:963e7d6b76e061967bb789b5910d9bcdab1ea73383b6190e0cae12a2dd7036a3
3
+ size 9030599