Henryeahhh commited on
Commit
e5ec998
·
verified ·
1 Parent(s): b3d911c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. all_l1/step8500/config.yaml +322 -0
  3. cleandesk50_flow_matching/wandb/wandb/debug-internal.log +12 -0
  4. cleandesk50_flow_matching/wandb/wandb/debug.log +0 -0
  5. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/config.yaml +623 -0
  6. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/output.log +180 -0
  7. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/requirements.txt +286 -0
  8. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/wandb-metadata.json +204 -0
  9. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/wandb-summary.json +1 -0
  10. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/logs/debug-core.log +16 -0
  11. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/logs/debug-internal.log +12 -0
  12. cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/logs/debug.log +0 -0
  13. cleandesk50_l1_regression/wandb/wandb/debug-internal.log +12 -0
  14. cleandesk50_l1_regression/wandb/wandb/debug.log +0 -0
  15. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/config.yaml +623 -0
  16. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/output.log +186 -0
  17. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/requirements.txt +286 -0
  18. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/wandb-metadata.json +204 -0
  19. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/wandb-summary.json +1 -0
  20. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/logs/debug-core.log +16 -0
  21. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/logs/debug-internal.log +12 -0
  22. cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/logs/debug.log +0 -0
  23. cleandesk_flow_matching/step11500-action-head/metadata.pt +3 -0
  24. cleandesk_flow_matching/step12000-unsharded/config.yaml +322 -0
  25. cleandesk_flow_matching/step12000/config.yaml +322 -0
  26. cleandesk_flow_matching/wandb/wandb/debug-internal.log +12 -0
  27. cleandesk_flow_matching/wandb/wandb/debug.log +0 -0
  28. cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/files/output.log +0 -0
  29. cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/files/requirements.txt +286 -0
  30. cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/files/wandb-metadata.json +204 -0
  31. cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/logs/debug-core.log +6 -0
  32. cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/logs/debug-internal.log +12 -0
  33. cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/logs/debug.log +0 -0
  34. cleandesk_l1_regression/wandb/wandb/debug-internal.log +12 -0
  35. cleandesk_l1_regression/wandb/wandb/debug.log +0 -0
  36. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/config.yaml +623 -0
  37. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/output.log +183 -0
  38. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/requirements.txt +286 -0
  39. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/wandb-metadata.json +204 -0
  40. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/wandb-summary.json +1 -0
  41. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/logs/debug-core.log +16 -0
  42. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/logs/debug-internal.log +12 -0
  43. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/logs/debug.log +0 -0
  44. cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/run-76mxu43t.wandb +3 -0
  45. eraser_flow_matching/step11500-action-head/metadata.pt +3 -0
  46. eraser_flow_matching/step12000-action-head/metadata.pt +3 -0
  47. eraser_flow_matching/step12000-unsharded/config.yaml +322 -0
  48. eraser_flow_matching/step12000-unsharded/train.pt +3 -0
  49. eraser_flow_matching/wandb/wandb/debug-internal.log +8 -0
  50. eraser_flow_matching/wandb/wandb/debug.log +0 -0
.gitattributes CHANGED
@@ -34,3 +34,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  wandb/wandb/run-20251002_155442-6v8q0jgn/run-6v8q0jgn.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  wandb/wandb/run-20251002_155442-6v8q0jgn/run-6v8q0jgn.wandb filter=lfs diff=lfs merge=lfs -text
37
+ wandb/wandb/run-20251002_151047-gal9lnsm/run-gal9lnsm.wandb filter=lfs diff=lfs merge=lfs -text
38
+ wandb/wandb/run-20251002_155441-70dhy5dq/run-70dhy5dq.wandb filter=lfs diff=lfs merge=lfs -text
39
+ wandb/wandb/run-20251002_150921-kqbx0cjv/run-kqbx0cjv.wandb filter=lfs diff=lfs merge=lfs -text
40
+ cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/run-76mxu43t.wandb filter=lfs diff=lfs merge=lfs -text
41
+ wipe_l1_regression/wandb/wandb/run-20251005_163743-a1znetn8/run-a1znetn8.wandb filter=lfs diff=lfs merge=lfs -text
all_l1/step8500/config.yaml ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: realworld_20250930_131219
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ ff_out_size: 0
64
+ additional_vocab_size: 128
65
+ new_embedding_init_range: 0.02
66
+ weight_tying: false
67
+ init_device: null
68
+ init_fn: normal
69
+ init_std: 0.02
70
+ init_cutoff_factor: null
71
+ norm_after: false
72
+ precision: amp_bf16
73
+ max_crops: 12
74
+ crop_mode: overlap-and-resize-c2
75
+ use_col_tokens: true
76
+ prompt_type: uber_model
77
+ system_prompt_kind: demo_or_style
78
+ message_formatting: role
79
+ always_start_with_space: true
80
+ multi_annotation_weighting: root_subsegments
81
+ default_inference_len: 65
82
+ overlap_margins:
83
+ - 4
84
+ - 4
85
+ pad_value: 0.0
86
+ image_padding_embed: pad_and_partial_pad
87
+ fix_image_padding: true
88
+ vit_layers:
89
+ - -2
90
+ - -9
91
+ image_pooling_h: 2
92
+ image_pooling_w: 2
93
+ image_pooling_2d: attention_meanq
94
+ image_projector: mlp
95
+ image_feature_dropout: 0.0
96
+ initializer_range: 0.02
97
+ normalize_input_embeds: false
98
+ use_position_ids: true
99
+ head_dim: null
100
+ action_tokenizer:
101
+ identifier: physical-intelligence/fast
102
+ tokenizer_dir: null
103
+ action_dim: 7
104
+ horizon: 8
105
+ tokenizer:
106
+ identifier: Qwen/Qwen2-7B
107
+ tokenizer_dir: null
108
+ pad_tokenizer: true
109
+ moe_num_experts: 8
110
+ moe_top_k: 2
111
+ moe_mlp_impl: sparse
112
+ moe_log_expert_assignment: false
113
+ moe_shared_expert: false
114
+ moe_lbl_in_fp32: false
115
+ moe_interleave: false
116
+ moe_loss_weight: 0.1
117
+ moe_zloss_weight: null
118
+ moe_dropless: true
119
+ moe_capacity_factor: 1.25
120
+ action_head: l1_regression
121
+ num_diffusion_steps: 1000
122
+ num_diffusion_inference_steps: 30
123
+ use_proprio: true
124
+ action_head_dit_hidden_size: 1152
125
+ action_head_dit_depth: 28
126
+ action_head_dit_num_heads: 16
127
+ llm_causal_attention: false
128
+ action_use_left_eef: true
129
+ action_use_mobile_base: false
130
+ allow_resume: false
131
+ ft_llm: true
132
+ ft_vit: false
133
+ ft_connector: false
134
+ ft_embedding: lm_head
135
+ lora: false
136
+ use_lora: true
137
+ lora_rank: 8
138
+ lora_llm: false
139
+ lora_vit: false
140
+ lora_connector: false
141
+ early_exit: false
142
+ train_exit_random_layer: false
143
+ optimizer:
144
+ name: adamw
145
+ learning_rate: 0.0001
146
+ weight_decay: 0.01
147
+ betas:
148
+ - 0.9
149
+ - 0.95
150
+ eps: 1.0e-05
151
+ connector_learning_rate: 0.0002
152
+ vit_learning_rate: 6.0e-06
153
+ llm_learning_rate: 5.0e-05
154
+ connector_weight_decay: 0.0
155
+ vit_weight_decay: 0.0
156
+ llm_weight_decay: 0.0
157
+ connector_betas:
158
+ - 0.9
159
+ - 0.95
160
+ vit_betas:
161
+ - 0.9
162
+ - 0.95
163
+ llm_betas:
164
+ - 0.9
165
+ - 0.95
166
+ connector_eps: 1.0e-06
167
+ vit_eps: 1.0e-06
168
+ llm_eps: 1.0e-06
169
+ metrics_log_interval: 20
170
+ scheduler:
171
+ name: multimodal
172
+ units: steps
173
+ t_warmup: 100
174
+ t_max: null
175
+ alpha_f: 0.1
176
+ connector_t_warmup: 200
177
+ vit_t_warmup: 2000
178
+ llm_t_warmup: 2000
179
+ grad_clip_warmup_steps: null
180
+ grad_clip_warmup_factor: null
181
+ warmup_min_lr: 0.0
182
+ data:
183
+ dataset: vla_dataset_realworld
184
+ mixture: null
185
+ root_size_mixture: null
186
+ split: train
187
+ seed: 95818
188
+ shuffle_messages: false
189
+ pad: to_max
190
+ sequence_length: 1600
191
+ shuffle: true
192
+ for_inference: false
193
+ multi_modal: torch
194
+ num_workers: 0
195
+ drop_last: true
196
+ pin_memory: true
197
+ prefetch_factor: null
198
+ persistent_workers: false
199
+ timeout: 0
200
+ rlds_dataset_name: libero_4_task_suites_no_noops
201
+ rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_Wipe
202
+ use_wrist_image: true
203
+ use_proprio: true
204
+ rlds_shuffle_buffer_size: 100000
205
+ rlds_traj_threads: 8
206
+ rlds_read_threads: 8
207
+ lerobot_episode_index_start: null
208
+ lerobot_episode_index_end: null
209
+ restore_dataloader: true
210
+ fast_forward_batches: null
211
+ evaluators:
212
+ - label: val
213
+ data:
214
+ dataset: vla_dataset_realworld
215
+ mixture: null
216
+ root_size_mixture: null
217
+ split: validation
218
+ seed: null
219
+ shuffle_messages: false
220
+ pad: to_max
221
+ sequence_length: 1600
222
+ shuffle: false
223
+ for_inference: false
224
+ multi_modal: torch
225
+ num_workers: 0
226
+ drop_last: true
227
+ pin_memory: true
228
+ prefetch_factor: null
229
+ persistent_workers: true
230
+ timeout: 0
231
+ rlds_dataset_name: libero_4_task_suites_no_noops
232
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
233
+ use_wrist_image: true
234
+ use_proprio: true
235
+ rlds_shuffle_buffer_size: 256000
236
+ rlds_traj_threads: 8
237
+ rlds_read_threads: 8
238
+ lerobot_episode_index_start: 353
239
+ lerobot_episode_index_end: 765
240
+ device_eval_batch_size: null
241
+ subset_num_batches: 64
242
+ max_examples: null
243
+ max_new_tokens: 448
244
+ mm_evaluator: null
245
+ save_dir: null
246
+ save_to_checkpoint_dir: false
247
+ eval_name: null
248
+ skip_if_metrics_cached: true
249
+ eval_interval: 0
250
+ inf_eval_interval: -1
251
+ inf_evaluators: []
252
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/all_l1
253
+ remote_save_folder: null
254
+ canceled_check_interval: 50
255
+ save_interval: 500
256
+ save_interval_unsharded: 500
257
+ save_interval_ephemeral: null
258
+ save_interval_action_head: 500
259
+ save_num_checkpoints_to_keep: 1
260
+ save_num_unsharded_checkpoints_to_keep: 1
261
+ save_num_action_head_checkpoints_to_keep: 2
262
+ save_overwrite: true
263
+ force_save_unsharded: false
264
+ no_pre_train_checkpoint: true
265
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
266
+ load_model_config: null
267
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
268
+ load_path: null
269
+ load_path_sharded_checkpointer: null
270
+ reset_optimizer_state: false
271
+ reset_trainer_state: false
272
+ save_dataloader_state: false
273
+ reset_dataloader_state: false
274
+ keep_lr_on_load: true
275
+ sharded_checkpointer: torch_legacy
276
+ max_duration: 500000
277
+ global_train_batch_size: 126
278
+ device_train_batch_size: 15
279
+ device_train_microbatch_size: 16
280
+ device_eval_batch_size: 4
281
+ eval_subset_num_batches: -1
282
+ eval_on_load: false
283
+ device_inf_eval_batch_size: 16
284
+ inf_eval_subset_num_batches: -1
285
+ device_train_grad_accum: 0
286
+ max_grad_norm: 1.0
287
+ multi_component_grad_norm: true
288
+ batch_divisor: global_batch
289
+ max_grad_norm_ratio: null
290
+ precision: amp_bf16
291
+ wandb:
292
+ project: a1-realworld
293
+ entity: henryeap
294
+ group: null
295
+ name: realworld_20250930_131219
296
+ tags:
297
+ - watching
298
+ log_artifacts: false
299
+ rank_zero_only: true
300
+ log_interval: 1
301
+ speed_monitor:
302
+ window_size: 20
303
+ gpu_flops_available: null
304
+ console_log_interval: 1
305
+ gen1_gc_interval: 1
306
+ compile: null
307
+ fsdp:
308
+ use_orig_params: true
309
+ sharding_strategy: FULL_SHARD
310
+ wrapping_strategy: by_block_and_size
311
+ precision: float
312
+ hybrid_sharding_num_model_replicas: null
313
+ softmax_auxiliary_loss: true
314
+ softmax_auxiliary_loss_scale: 0.0001
315
+ time_limit: null
316
+ extra_steps_after_cancel: 10
317
+ python_profiling: false
318
+ torch_profiling: false
319
+ stop_at: 500000
320
+ stop_after: null
321
+ activation_checkpointing: whole_layer
322
+ fused_loss: null
cleandesk50_flow_matching/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:34.806823131Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-08T16:38:35.95264112Z","level":"INFO","msg":"stream: created new stream","id":"quokv8gn"}
3
+ {"time":"2025-10-08T16:38:35.952698801Z","level":"INFO","msg":"stream: started","id":"quokv8gn"}
4
+ {"time":"2025-10-08T16:38:35.952731371Z","level":"INFO","msg":"sender: started","stream_id":"quokv8gn"}
5
+ {"time":"2025-10-08T16:38:35.952734591Z","level":"INFO","msg":"writer: started","stream_id":"quokv8gn"}
6
+ {"time":"2025-10-08T16:38:35.952725981Z","level":"INFO","msg":"handler: started","stream_id":"quokv8gn"}
7
+ {"time":"2025-10-08T16:43:59.99384105Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.001054985}],"total_operations":1}}
8
+ {"time":"2025-10-08T16:44:01.036808965Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-10-08T16:44:01.423613682Z","level":"INFO","msg":"stream: closing","id":"quokv8gn"}
10
+ {"time":"2025-10-08T16:44:01.423631393Z","level":"INFO","msg":"handler: closed","stream_id":"quokv8gn"}
11
+ {"time":"2025-10-08T16:44:01.425219996Z","level":"INFO","msg":"sender: closed","stream_id":"quokv8gn"}
12
+ {"time":"2025-10-08T16:44:01.425239166Z","level":"INFO","msg":"stream: closed","id":"quokv8gn"}
cleandesk50_flow_matching/wandb/wandb/debug.log ADDED
File without changes
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/config.yaml ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ yfehyqgufhcu23nx6cclbs2foj5p3ccp:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_flow_matching
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - flow_matching
13
+ - --seq_len
14
+ - "1600"
15
+ - --ft_llm
16
+ - --checkpoint
17
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
18
+ - --device_train_microbatch_size
19
+ - "16"
20
+ - --global_batch_size
21
+ - "126"
22
+ - --dataset
23
+ - vla_dataset_realworld
24
+ - --llm_learning_rate
25
+ - "5e-5"
26
+ - --wandb_entity
27
+ - henryeap
28
+ - --wandb_project
29
+ - a1-realworld
30
+ - --wandb_run_name
31
+ - cleandesk50
32
+ - --real_world_vla_config_path
33
+ - vla_config_realworld/vla_config_cleandesk50.yaml
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "50988601344"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: 49712a42d21a8c739a16ba5eeaec4a0d7b29ab80
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "1"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x75d378aea8d8934d"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "4"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0x328cfe1d1a9d2b38"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "6"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x3c4f0005790d7da3"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "0"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x763c831cad37d9b"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "3"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0x697c203d8e63f05b"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "7"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0x91078b09ae9b0757"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "2"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0x2433899c197738b6"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "5"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0x2bc0f4cfe424c12a"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-319
140
+ memory:
141
+ total: "2434606960640"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_flow_matching/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1760200645"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "2283"
158
+ job_name: mh_cleandesk50_flow_matching
159
+ job_nodelist: auh7-1b-gpu-319
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1759941445"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "2283"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-319
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "152784"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-319
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-10-08T16:38:34.545687Z"
184
+ writerId: yfehyqgufhcu23nx6cclbs2foj5p3ccp
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 2
191
+ - 3
192
+ - 5
193
+ - 11
194
+ - 41
195
+ - 49
196
+ - 51
197
+ - 53
198
+ - 63
199
+ - 71
200
+ - 83
201
+ - 95
202
+ - 105
203
+ "2":
204
+ - 1
205
+ - 2
206
+ - 3
207
+ - 5
208
+ - 11
209
+ - 41
210
+ - 49
211
+ - 51
212
+ - 53
213
+ - 63
214
+ - 71
215
+ - 83
216
+ - 95
217
+ - 105
218
+ "3":
219
+ - 2
220
+ - 13
221
+ - 15
222
+ - 16
223
+ - 61
224
+ "4": 3.10.18
225
+ "5": 0.21.4
226
+ "6": 4.56.1
227
+ "10":
228
+ - 19
229
+ "12": 0.21.4
230
+ "13": linux-x86_64
231
+ activation_checkpointing:
232
+ value: whole_layer
233
+ allow_resume:
234
+ value: false
235
+ batch_divisor:
236
+ value: global_batch
237
+ canceled_check_interval:
238
+ value: 50
239
+ checkpoint_dir:
240
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
241
+ compile:
242
+ value: null
243
+ console_log_interval:
244
+ value: 1
245
+ data:
246
+ value:
247
+ dataset: vla_dataset_realworld
248
+ drop_last: true
249
+ for_inference: false
250
+ lerobot_episode_index_end: null
251
+ lerobot_episode_index_start: null
252
+ mixture: null
253
+ multi_modal: torch
254
+ num_workers: 0
255
+ pad: to_max
256
+ persistent_workers: false
257
+ pin_memory: true
258
+ prefetch_factor: null
259
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
260
+ rlds_dataset_name: libero_4_task_suites_no_noops
261
+ rlds_read_threads: 8
262
+ rlds_shuffle_buffer_size: 100000
263
+ rlds_traj_threads: 8
264
+ root_size_mixture: null
265
+ seed: 95818
266
+ sequence_length: 1600
267
+ shuffle: true
268
+ shuffle_messages: false
269
+ split: train
270
+ timeout: 0
271
+ use_proprio: true
272
+ use_wrist_image: true
273
+ device_eval_batch_size:
274
+ value: 4
275
+ device_inf_eval_batch_size:
276
+ value: 16
277
+ device_train_batch_size:
278
+ value: 15
279
+ device_train_grad_accum:
280
+ value: 0
281
+ device_train_microbatch_size:
282
+ value: 16
283
+ dry_run:
284
+ value: false
285
+ early_exit:
286
+ value: false
287
+ epoch:
288
+ value: null
289
+ eval_interval:
290
+ value: 0
291
+ eval_on_load:
292
+ value: false
293
+ eval_subset_num_batches:
294
+ value: -1
295
+ evaluators:
296
+ value:
297
+ - data:
298
+ dataset: vla_dataset_realworld
299
+ drop_last: true
300
+ for_inference: false
301
+ lerobot_episode_index_end: 765
302
+ lerobot_episode_index_start: 353
303
+ mixture: null
304
+ multi_modal: torch
305
+ num_workers: 0
306
+ pad: to_max
307
+ persistent_workers: true
308
+ pin_memory: true
309
+ prefetch_factor: null
310
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
311
+ rlds_dataset_name: libero_4_task_suites_no_noops
312
+ rlds_read_threads: 8
313
+ rlds_shuffle_buffer_size: 256000
314
+ rlds_traj_threads: 8
315
+ root_size_mixture: null
316
+ seed: null
317
+ sequence_length: 1600
318
+ shuffle: false
319
+ shuffle_messages: false
320
+ split: validation
321
+ timeout: 0
322
+ use_proprio: true
323
+ use_wrist_image: true
324
+ device_eval_batch_size: null
325
+ eval_name: null
326
+ label: val
327
+ max_examples: null
328
+ max_new_tokens: 448
329
+ mm_evaluator: null
330
+ save_dir: null
331
+ save_to_checkpoint_dir: false
332
+ skip_if_metrics_cached: true
333
+ subset_num_batches: 64
334
+ extra_steps_after_cancel:
335
+ value: 10
336
+ fast_forward_batches:
337
+ value: null
338
+ force_save_unsharded:
339
+ value: false
340
+ fsdp:
341
+ value:
342
+ hybrid_sharding_num_model_replicas: null
343
+ precision: float
344
+ sharding_strategy: FULL_SHARD
345
+ use_orig_params: true
346
+ wrapping_strategy: by_block_and_size
347
+ ft_connector:
348
+ value: false
349
+ ft_embedding:
350
+ value: lm_head
351
+ ft_llm:
352
+ value: true
353
+ ft_vit:
354
+ value: false
355
+ fused_loss:
356
+ value: null
357
+ gen1_gc_interval:
358
+ value: 1
359
+ global_train_batch_size:
360
+ value: 126
361
+ inf_eval_interval:
362
+ value: -1
363
+ inf_eval_subset_num_batches:
364
+ value: -1
365
+ inf_evaluators:
366
+ value: []
367
+ initial_model_checkpoint:
368
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
369
+ keep_lr_on_load:
370
+ value: true
371
+ load_model_config:
372
+ value: null
373
+ load_path:
374
+ value: null
375
+ load_path_sharded_checkpointer:
376
+ value: null
377
+ lora:
378
+ value: false
379
+ lora_connector:
380
+ value: false
381
+ lora_llm:
382
+ value: false
383
+ lora_rank:
384
+ value: 8
385
+ lora_vit:
386
+ value: false
387
+ max_duration:
388
+ value: 500000
389
+ max_grad_norm:
390
+ value: 1
391
+ max_grad_norm_ratio:
392
+ value: null
393
+ model:
394
+ value:
395
+ action_dim: 7
396
+ action_head: flow_matching
397
+ action_head_dit_depth: 28
398
+ action_head_dit_hidden_size: 1152
399
+ action_head_dit_num_heads: 16
400
+ action_tokenizer:
401
+ identifier: physical-intelligence/fast
402
+ tokenizer_dir: null
403
+ action_use_left_eef: true
404
+ action_use_mobile_base: false
405
+ activation_type: swiglu
406
+ additional_vocab_size: 128
407
+ always_start_with_space: true
408
+ attention_dropout: 0
409
+ attention_layer_norm: false
410
+ attention_layer_norm_with_affine: true
411
+ attention_type: sdpa
412
+ bias_for_layer_norm: null
413
+ block_group_size: 1
414
+ block_type: sequential
415
+ clip_qkv: null
416
+ crop_mode: overlap-and-resize-c2
417
+ d_model: 3584
418
+ default_inference_len: 65
419
+ embedding_dropout: 0
420
+ embedding_size: 152064
421
+ ff_out_size: null
422
+ fix_image_padding: true
423
+ float32_attention: true
424
+ head_dim: null
425
+ horizon: 8
426
+ image_feature_dropout: 0
427
+ image_padding_embed: pad_and_partial_pad
428
+ image_pooling_2d: attention_meanq
429
+ image_pooling_h: 2
430
+ image_pooling_w: 2
431
+ image_projector: mlp
432
+ include_bias: false
433
+ init_cutoff_factor: null
434
+ init_device: null
435
+ init_fn: normal
436
+ init_std: 0.02
437
+ initializer_range: 0.02
438
+ layer_norm_eps: 1e-06
439
+ layer_norm_type: rms
440
+ layer_norm_with_affine: true
441
+ llm_causal_attention: false
442
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
443
+ low_cpu_fsdp: true
444
+ max_crops: 12
445
+ max_position_embeddings: null
446
+ max_sequence_length: 4096
447
+ message_formatting: role
448
+ mlp_hidden_size: 37888
449
+ mlp_ratio: 4
450
+ moe_capacity_factor: 1.25
451
+ moe_dropless: true
452
+ moe_interleave: false
453
+ moe_lbl_in_fp32: false
454
+ moe_log_expert_assignment: false
455
+ moe_loss_weight: 0.1
456
+ moe_mlp_impl: sparse
457
+ moe_num_experts: 8
458
+ moe_shared_expert: false
459
+ moe_top_k: 2
460
+ moe_zloss_weight: null
461
+ multi_annotation_weighting: root_subsegments
462
+ n_heads: 28
463
+ n_kv_heads: 4
464
+ n_layers: 28
465
+ new_embedding_init_range: 0.02
466
+ norm_after: false
467
+ normalize_input_embeds: false
468
+ num_diffusion_inference_steps: 30
469
+ num_diffusion_steps: 1000
470
+ overlap_margins:
471
+ - 4
472
+ - 4
473
+ pad_tokenizer: true
474
+ pad_value: 0
475
+ precision: amp_bf16
476
+ prompt_type: uber_model
477
+ qkv_bias: true
478
+ residual_dropout: 0.1
479
+ response_residual_dropout: 0
480
+ rope: true
481
+ rope_full_precision: true
482
+ rope_theta: 1e+06
483
+ scale_logits: false
484
+ system_prompt_kind: demo_or_style
485
+ tokenizer:
486
+ identifier: Qwen/Qwen2-7B
487
+ tokenizer_dir: null
488
+ use_col_tokens: true
489
+ use_position_ids: true
490
+ use_proprio: true
491
+ vision_backbone:
492
+ attention_dropout: 0
493
+ fsdp_wrap: false
494
+ image_default_input_size:
495
+ - 336
496
+ - 336
497
+ image_dropout_rate: 0
498
+ image_emb_dim: 1024
499
+ image_head_dim: 64
500
+ image_mlp_activations: quick_gelu
501
+ image_mlp_dim: 4096
502
+ image_model_type: openai
503
+ image_norm_eps: 1e-05
504
+ image_num_heads: 16
505
+ image_num_key_value_heads: 16
506
+ image_num_layers: 23
507
+ image_num_pos: 577
508
+ image_patch_size: 14
509
+ image_pos_patch_size: 14
510
+ initializer_range: 0.02
511
+ residual_dropout: 0
512
+ resize_mode: default
513
+ vit_layers:
514
+ - -2
515
+ - -9
516
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
517
+ vocab_size: 152064
518
+ weight_tying: false
519
+ multi_component_grad_norm:
520
+ value: true
521
+ no_pre_train_checkpoint:
522
+ value: true
523
+ optimizer:
524
+ value:
525
+ betas:
526
+ - 0.9
527
+ - 0.95
528
+ connector_betas:
529
+ - 0.9
530
+ - 0.95
531
+ connector_eps: 1e-06
532
+ connector_learning_rate: 0.0002
533
+ connector_weight_decay: 0
534
+ eps: 1e-05
535
+ learning_rate: 0.0001
536
+ llm_betas:
537
+ - 0.9
538
+ - 0.95
539
+ llm_eps: 1e-06
540
+ llm_learning_rate: 5e-05
541
+ llm_weight_decay: 0
542
+ metrics_log_interval: 20
543
+ name: adamw
544
+ vit_betas:
545
+ - 0.9
546
+ - 0.95
547
+ vit_eps: 1e-06
548
+ vit_learning_rate: 6e-06
549
+ vit_weight_decay: 0
550
+ weight_decay: 0.01
551
+ precision:
552
+ value: amp_bf16
553
+ python_profiling:
554
+ value: false
555
+ remote_save_folder:
556
+ value: null
557
+ reset_dataloader_state:
558
+ value: false
559
+ reset_optimizer_state:
560
+ value: false
561
+ reset_trainer_state:
562
+ value: false
563
+ restore_dataloader:
564
+ value: true
565
+ run_name:
566
+ value: cleandesk50_20251008_163755
567
+ save_dataloader_state:
568
+ value: false
569
+ save_folder:
570
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_flow_matching
571
+ save_interval:
572
+ value: 500
573
+ save_interval_action_head:
574
+ value: 500
575
+ save_interval_ephemeral:
576
+ value: null
577
+ save_interval_unsharded:
578
+ value: 500
579
+ save_num_action_head_checkpoints_to_keep:
580
+ value: 2
581
+ save_num_checkpoints_to_keep:
582
+ value: 1
583
+ save_num_unsharded_checkpoints_to_keep:
584
+ value: 1
585
+ save_overwrite:
586
+ value: true
587
+ scheduler:
588
+ value:
589
+ alpha_f: 0.1
590
+ connector_t_warmup: 200
591
+ grad_clip_warmup_factor: null
592
+ grad_clip_warmup_steps: null
593
+ llm_t_warmup: 2000
594
+ name: multimodal
595
+ t_max: null
596
+ t_warmup: 100
597
+ units: steps
598
+ vit_t_warmup: 2000
599
+ warmup_min_lr: 0
600
+ seed:
601
+ value: 6198
602
+ sharded_checkpointer:
603
+ value: torch_legacy
604
+ softmax_auxiliary_loss:
605
+ value: true
606
+ softmax_auxiliary_loss_scale:
607
+ value: 0.0001
608
+ speed_monitor:
609
+ value:
610
+ gpu_flops_available: null
611
+ window_size: 20
612
+ stop_after:
613
+ value: null
614
+ stop_at:
615
+ value: 500000
616
+ time_limit:
617
+ value: null
618
+ torch_profiling:
619
+ value: false
620
+ train_exit_random_layer:
621
+ value: false
622
+ use_lora:
623
+ value: false
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/output.log ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 10/08 [16:38:36] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': None, 'path': None, 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk50', 8, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 8, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 8, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ ****** Skip RLDS main; path not found: None
10
+ ****** start build LeRobot main...
11
+ build_tokenizer, cache_dir None tokenizer_dir None
12
+ 10/08 [16:38:38] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:130
13
+ INFO | >> Loading train dataset: vla_dataset_realworld/train __init__.py:436
14
+ ****** before LeRobot dataset...
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk50
16
+ ****** length of the dataset: 27906
17
+ 10/08 [16:38:42] INFO | >> build_rlds_train_dataset: Loading train dataset: vla_dataset_realworld/train __init__.py:519
18
+ ****** Import RLDSBatchTransform, RLDSDataset successfully.
19
+ ****** before RLDS dataset...
20
+ ****** data_config.rlds_dataset_name: a1_real_world
21
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/zhangjian/datasets/OXE
22
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd1aef0130>
23
+ 10/08 [16:38:43] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
24
+ 100%|██████████| 87212/87212 [00:42<00:00, 2075.20it/s]
25
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd1aeff430>
26
+ 10/08 [16:39:29] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
27
+ /vast/users/xiaodan/zhangjian/datasets/OXE/jaco_play/0.1.0/dataset_statistics_e081d4716a3da95df91c79d661ae59fa26a43da49db4bf8d716b622b56
28
+ 3b0ea3.json.
29
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd0ff87be0>
30
+ 10/08 [16:39:30] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
31
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_cable_routing/0.1.0/dataset_statistics_08cb4c5b7c5e6c035fc84ea85b2d54c0c46ad608a8763
32
+ 4ebb18374088d23cd76.json.
33
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccfc729cf0>
34
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
35
+ /vast/users/xiaodan/zhangjian/datasets/OXE/viola/0.1.0/dataset_statistics_2415d8f7de73c8761fedd7c2a9590667fb0d3fdd26664bf4c100222e5cdb89
36
+ b9.json.
37
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd0ff71900>
38
+ INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
39
+ 100%|██████████| 1000/1000 [00:04<00:00, 203.13it/s]
40
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccfc72b5b0>
41
+ 10/08 [16:39:37] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
42
+ /vast/users/xiaodan/zhangjian/datasets/OXE/austin_buds_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_ccecde24cc01793b221
43
+ 4eb0c4c5d7cc0e3ccc623db99bd892b83552b20decfb7.json.
44
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf88199c0>
45
+ 10/08 [16:39:38] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
46
+ 100%|██████████| 456/456 [00:23<00:00, 19.46it/s]
47
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd04176b60>
48
+ 10/08 [16:40:03] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
49
+ 100%|██████████| 5100/5100 [00:57<00:00, 88.60it/s]
50
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf83d6c50>
51
+ 10/08 [16:41:05] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
52
+ 100%|██████████| 240/240 [00:08<00:00, 28.55it/s]
53
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf4362830>
54
+ 10/08 [16:41:15] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
55
+ 100%|██████████| 559/559 [00:01<00:00, 377.56it/s]
56
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd045275b0>
57
+ 10/08 [16:41:17] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
58
+ /vast/users/xiaodan/zhangjian/datasets/OXE/dlr_edan_shared_control_converted_externally_to_rlds/0.1.0/dataset_statistics_b8984563fc3e7ea
59
+ c0803c667ef58c9deaf2e747683568306ea1d83505d532a76.json.
60
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcced541930>
61
+ INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
62
+ 100%|██████████| 1500/1500 [00:05<00:00, 254.65it/s]
63
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf88dff70>
64
+ 10/08 [16:41:24] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
65
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_fanuc_manipulation/0.1.0/dataset_statistics_a98d349d0364668095ea3ca38c6785e94f35e5e5
66
+ 8e234c88fac83775a923b0d0.json.
67
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf8862b60>
68
+ 10/08 [16:41:25] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
69
+ 100%|██████████| 43264/43264 [00:26<00:00, 1623.73it/s]
70
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcced7d3460>
71
+ 10/08 [16:41:58] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
72
+ 100%|██████████| 1995/1995 [00:01<00:00, 1859.15it/s]
73
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd0c1c2530>
74
+ 10/08 [16:42:00] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
75
+ 100%|██████████| 1003/1003 [00:02<00:00, 406.96it/s]
76
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcced78ffa0>
77
+ 10/08 [16:42:03] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
78
+ 100%|██████████| 150/150 [00:00<00:00, 1760.96it/s]
79
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf432d420>
80
+ 10/08 [16:42:04] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
81
+ 100%|██████████| 631/631 [00:01<00:00, 396.82it/s]
82
+
83
+ ######################################################################################
84
+ # Loading the following 18 datasets (incl. sampling weight): #
85
+ # fractal20220817_data: ====================================================0.529250 #
86
+ # jaco_play: ===============================================================0.010898 #
87
+ # berkeley_cable_routing: ==================================================0.005916 #
88
+ # viola: ===================================================================0.021337 #
89
+ # berkeley_autolab_ur5: ====================================================0.027379 #
90
+ # austin_buds_dataset_converted_externally_to_rlds: ========================0.004768 #
91
+ # nyu_franka_play_dataset_converted_externally_to_rlds: ====================0.018817 #
92
+ # furniture_bench_dataset_converted_externally_to_rlds: ====================0.055185 #
93
+ # austin_sailor_dataset_converted_externally_to_rlds: ======================0.049354 #
94
+ # austin_sirius_dataset_converted_externally_to_rlds: ======================0.039129 #
95
+ # dlr_edan_shared_control_converted_externally_to_rlds: ====================0.001248 #
96
+ # utaustin_mutex: ==========================================================0.050583 #
97
+ # berkeley_fanuc_manipulation: =============================================0.017504 #
98
+ # bc_z: ====================================================================0.168166 #
99
+ # roboturk: ================================================================0.000131 #
100
+ # toto: ====================================================================0.000228 #
101
+ # ucsd_kitchen_dataset_converted_externally_to_rlds: =======================0.000006 #
102
+ # iamlab_cmu_pickup_insert_converted_externally_to_rlds: ===================0.000102 #
103
+ ######################################################################################
104
+
105
+ 10/08 [16:42:06] INFO | >> [*] Threads per Dataset: [14 1 1 1 1 1 1 2 1 1 1 1 1 5 1 1 1 1] dataset.py:563
106
+ INFO | >> [*] Reads per Dataset: [14 1 1 1 1 1 1 2 1 1 1 1 1 5 1 1 1 1] dataset.py:564
107
+ INFO | >> [*] Constructing datasets... dataset.py:567
108
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd1b27d810>
109
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf3c47ac0>
110
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf3cc7f70>
111
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf4206f20>
112
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf3915b40>
113
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf3db1810>
114
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd0ffd46d0>
115
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd1b1ad9f0>
116
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcd1b271600>
117
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf152cfd0>
118
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fcced59bd00>
119
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf8819900>
120
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf883bf40>
121
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf83a6aa0>
122
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf43bc640>
123
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf89ebb50>
124
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccf17e7ac0>
125
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fccecf057b0>
126
+ 10/08 [16:42:11] INFO | >> [*] Applying frame transforms on dataset... dataset.py:607
127
+ ****** after RLDSDataset initialization!
128
+ ****** length of the dataset: 7154275
129
+ ****** Build rlds train dataset: IterableDatasetWrapper successfully.
130
+ ****** path: None
131
+ ****** Skip AgiBotWorld-Alpha open-source-real-world; path not found: None
132
+ ****** After build vla train dataset...
133
+ ****** iterable_sources: [<olmo.data.dataset.IterableDatasetWrapper object at 0x7fcd1b210cd0>, <olmo.data.dataset.IterableDatasetWrapper object at 0x7fcd1aef37c0>]
134
+ ****** Before build mixed iterable dataset...
135
+ ****** Build vla train dataloader successfully!
136
+ ************************* Build train_dataloader successful!
137
+ ************************* Before build_inf_evaluators
138
+ WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
139
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
140
+ warnings.warn( # warn only once
141
+
142
+ ************************* Build evaluators successful!
143
+ ************************* Early exit flags: early_exit=False
144
+ PROPRIO_DIM 16 does not match ACTION_DIM 16 for AffordVLA
145
+ ************************* Initialize model successful!
146
+ ***** state_dict_path: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924/model.pt
147
+ ***** Load checkpoint successful!
148
+ missing keys: ['action_head.state_proj.weight', 'action_head.state_proj.bias', 'action_head.action_in_proj.weight', 'action_head.action_in_proj.bias', 'action_head.action_time_in.weight', 'action_head.action_time_in.bias', 'action_head.action_time_out.weight', 'action_head.action_time_out.bias', 'action_head.memory_proj.weight', 'action_head.memory_proj.bias', 'action_head.gemma.model.layers.0.self_attn.q_proj.weight', 'action_head.gemma.model.layers.0.self_attn.k_proj.weight', 'action_head.gemma.model.layers.0.self_attn.v_proj.weight', 'action_head.gemma.model.layers.0.self_attn.o_proj.weight', 'action_head.gemma.model.layers.0.mlp.gate_proj.weight', 'action_head.gemma.model.layers.0.mlp.up_proj.weight', 'action_head.gemma.model.layers.0.mlp.down_proj.weight', 'action_head.gemma.model.layers.0.input_layernorm.weight', 'action_head.gemma.model.layers.0.post_attention_layernorm.weight', 'action_head.gemma.model.layers.1.self_attn.q_proj.weight', 'action_head.gemma.model.layers.1.self_attn.k_proj.weight', 'action_head.gemma.model.layers.1.self_attn.v_proj.weight', 'action_head.gemma.model.layers.1.self_attn.o_proj.weight', 'action_head.gemma.model.layers.1.mlp.gate_proj.weight', 'action_head.gemma.model.layers.1.mlp.up_proj.weight', 'action_head.gemma.model.layers.1.mlp.down_proj.weight', 'action_head.gemma.model.layers.1.input_layernorm.weight', 'action_head.gemma.model.layers.1.post_attention_layernorm.weight', 'action_head.gemma.model.layers.2.self_attn.q_proj.weight', 'action_head.gemma.model.layers.2.self_attn.k_proj.weight', 'action_head.gemma.model.layers.2.self_attn.v_proj.weight', 'action_head.gemma.model.layers.2.self_attn.o_proj.weight', 'action_head.gemma.model.layers.2.mlp.gate_proj.weight', 'action_head.gemma.model.layers.2.mlp.up_proj.weight', 'action_head.gemma.model.layers.2.mlp.down_proj.weight', 'action_head.gemma.model.layers.2.input_layernorm.weight', 'action_head.gemma.model.layers.2.post_attention_layernorm.weight', 'action_head.gemma.model.layers.3.self_attn.q_proj.weight', 'action_head.gemma.model.layers.3.self_attn.k_proj.weight', 'action_head.gemma.model.layers.3.self_attn.v_proj.weight', 'action_head.gemma.model.layers.3.self_attn.o_proj.weight', 'action_head.gemma.model.layers.3.mlp.gate_proj.weight', 'action_head.gemma.model.layers.3.mlp.up_proj.weight', 'action_head.gemma.model.layers.3.mlp.down_proj.weight', 'action_head.gemma.model.layers.3.input_layernorm.weight', 'action_head.gemma.model.layers.3.post_attention_layernorm.weight', 'action_head.gemma.model.layers.4.self_attn.q_proj.weight', 'action_head.gemma.model.layers.4.self_attn.k_proj.weight', 'action_head.gemma.model.layers.4.self_attn.v_proj.weight', 'action_head.gemma.model.layers.4.self_attn.o_proj.weight', 'action_head.gemma.model.layers.4.mlp.gate_proj.weight', 'action_head.gemma.model.layers.4.mlp.up_proj.weight', 'action_head.gemma.model.layers.4.mlp.down_proj.weight', 'action_head.gemma.model.layers.4.input_layernorm.weight', 'action_head.gemma.model.layers.4.post_attention_layernorm.weight', 'action_head.gemma.model.layers.5.self_attn.q_proj.weight', 'action_head.gemma.model.layers.5.self_attn.k_proj.weight', 'action_head.gemma.model.layers.5.self_attn.v_proj.weight', 'action_head.gemma.model.layers.5.self_attn.o_proj.weight', 'action_head.gemma.model.layers.5.mlp.gate_proj.weight', 'action_head.gemma.model.layers.5.mlp.up_proj.weight', 'action_head.gemma.model.layers.5.mlp.down_proj.weight', 'action_head.gemma.model.layers.5.input_layernorm.weight', 'action_head.gemma.model.layers.5.post_attention_layernorm.weight', 'action_head.gemma.model.layers.6.self_attn.q_proj.weight', 'action_head.gemma.model.layers.6.self_attn.k_proj.weight', 'action_head.gemma.model.layers.6.self_attn.v_proj.weight', 'action_head.gemma.model.layers.6.self_attn.o_proj.weight', 'action_head.gemma.model.layers.6.mlp.gate_proj.weight', 'action_head.gemma.model.layers.6.mlp.up_proj.weight', 'action_head.gemma.model.layers.6.mlp.down_proj.weight', 'action_head.gemma.model.layers.6.input_layernorm.weight', 'action_head.gemma.model.layers.6.post_attention_
149
+ unexpected keys: []
150
+ ************************* Initialize model successful!
151
+ ************************* Before FSDP model wrapping
152
+ ************************* FSDP model wrapping successful!
153
+ ************************* Before building optimizer and scheduler
154
+ 10/08 [16:43:47] INFO | >> Constructing optimizer with 2 param groups optim.py:1283
155
+ **************************************************
156
+ After building optimizer and scheduler and model, before training, peak GPU memory (MB): 36856
157
+ ************************* VLATrainer initialized successfully!
158
+ ************************* Before trainer.fit()
159
+ Pre-train system metrics
160
+ System/Peak GPU Memory (MB)=36,856
161
+ 10/08 [16:43:57] WARNING | >> /vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py:200: UserWarning: To copy construct from a tensor, it is recommended to use warnings.py:109
162
+ sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
163
+ timestep_list = [torch.tensor(ex["timestep"], dtype=torch.int64) for ex in batch]
164
+
165
+ !!!Training failed: stack expects each tensor to be equal size, but got [] at entry 0 and [1] at entry 1
166
+ Traceback (most recent call last):
167
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 593, in main
168
+ trainer.fit()
169
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/train.py", line 2284, in fit
170
+ for batch in self.train_loader:
171
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
172
+ data = self._next_data()
173
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 790, in _next_data
174
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
175
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 43, in fetch
176
+ return self.collate_fn(data)
177
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py", line 201, in __call__
178
+ out['timestep'] = torch.stack(timestep_list, dim=0)
179
+ RuntimeError: stack expects each tensor to be equal size, but got [] at entry 0 and [1] at entry 1
180
+ wandb: WARNING The `quiet` argument to `wandb.run.finish()` is deprecated, use `wandb.Settings(quiet=...)` to set this instead.
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/requirements.txt ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ termcolor==3.1.0
15
+ Werkzeug==3.1.3
16
+ Brotli==1.1.0
17
+ Farama-Notifications==0.0.4
18
+ MarkupSafe==2.1.5
19
+ PyYAML==6.0.2
20
+ absl-py==2.3.1
21
+ accelerate==1.10.1
22
+ ai2-molmo==0.0.0
23
+ aiofiles==24.1.0
24
+ aiohappyeyeballs==2.6.1
25
+ aiohttp==3.12.15
26
+ aiosignal==1.4.0
27
+ annotated-types==0.7.0
28
+ antlr4-python3-runtime==4.9.3
29
+ anyio==4.10.0
30
+ array_record==0.8.1
31
+ async-timeout==5.0.1
32
+ attrs==25.3.0
33
+ av==15.1.0
34
+ backports.tarfile==1.2.0
35
+ beaker-gantry==3.2.0
36
+ beaker-py==2.5.0
37
+ black==23.12.1
38
+ blinker==1.9.0
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ et_xmlfile==2.0.0
72
+ etils==1.13.0
73
+ evdev==1.9.2
74
+ exceptiongroup==1.3.0
75
+ face==24.0.0
76
+ fastapi==0.116.2
77
+ ffmpy==0.6.1
78
+ fiddle==0.3.0
79
+ filelock==3.13.1
80
+ Flask==3.1.2
81
+ fonttools==4.60.0
82
+ frozenlist==1.7.0
83
+ fsspec==2023.9.2
84
+ ftfy==6.3.1
85
+ gcsfs==2023.9.2
86
+ gitdb==4.0.12
87
+ GitPython==3.1.45
88
+ glom==24.11.0
89
+ google-api-core==2.25.1
90
+ google-auth==2.40.3
91
+ google-auth-oauthlib==1.2.2
92
+ google-cloud-core==2.4.3
93
+ google-cloud-storage==2.19.0
94
+ google-crc32c==1.7.1
95
+ google-resumable-media==2.7.2
96
+ googleapis-common-protos==1.70.0
97
+ gradio==5.46.0
98
+ gradio_client==1.13.0
99
+ graphviz==0.21
100
+ groovy==0.1.2
101
+ grpcio==1.75.0
102
+ gymnasium==0.29.1
103
+ h11==0.16.0
104
+ hf_transfer==0.1.9
105
+ hf-xet==1.1.10
106
+ httpcore==1.0.9
107
+ httpx==0.28.1
108
+ huggingface-hub==0.35.0
109
+ id==1.5.0
110
+ idna==3.10
111
+ imageio==2.37.0
112
+ imageio-ffmpeg==0.6.0
113
+ importlib_metadata==8.7.0
114
+ importlib_resources==6.5.2
115
+ iniconfig==2.1.0
116
+ inquirerpy==0.3.4
117
+ isort==5.12.0
118
+ itsdangerous==2.2.0
119
+ jaraco.classes==3.4.0
120
+ jaraco.context==6.0.1
121
+ jaraco.functools==4.3.0
122
+ jeepney==0.9.0
123
+ Jinja2==3.1.4
124
+ jiter==0.11.0
125
+ jmespath==1.0.1
126
+ joblib==1.5.2
127
+ jsonlines==4.0.0
128
+ keras==2.15.0
129
+ keyring==25.6.0
130
+ kiwisolver==1.4.9
131
+ latex2sympy2_extended==1.10.2
132
+ lerobot==0.3.4
133
+ Levenshtein==0.27.1
134
+ libcst==1.8.4
135
+ lightning-utilities==0.15.2
136
+ markdown-it-py==4.0.0
137
+ math-verify==0.8.0
138
+ matplotlib==3.10.6
139
+ mdurl==0.1.2
140
+ mergedeep==1.3.4
141
+ ml-dtypes==0.2.0
142
+ ml_dtypes==0.5.3
143
+ more-itertools==10.8.0
144
+ mpmath==1.3.0
145
+ msgspec==0.19.0
146
+ multidict==6.6.4
147
+ multiprocess==0.70.16
148
+ mypy==1.3.0
149
+ mypy_extensions==1.1.0
150
+ necessary==0.4.3
151
+ networkx==3.3
152
+ nh3==0.3.0
153
+ nltk==3.9.1
154
+ numpy==1.26.4
155
+ oauthlib==3.3.1
156
+ omegaconf==2.3.0
157
+ openai==1.108.0
158
+ opencv-python-headless==4.12.0.88
159
+ OpenEXR==3.4.0
160
+ openpyxl==3.1.5
161
+ orderly-set==5.5.0
162
+ orjson==3.11.3
163
+ packaging==25.0
164
+ pandas==2.3.2
165
+ pathspec==0.12.1
166
+ petname==2.6
167
+ pfzy==0.3.4
168
+ pillow==11.0.0
169
+ pip==25.2
170
+ platformdirs==4.4.0
171
+ pluggy==1.6.0
172
+ promise==2.3
173
+ prompt_toolkit==3.0.52
174
+ propcache==0.3.2
175
+ proto-plus==1.26.1
176
+ protobuf==4.21.12
177
+ protobuf==6.32.1
178
+ psutil==7.1.0
179
+ pyarrow==21.0.0
180
+ pyasn1==0.6.1
181
+ pyasn1_modules==0.4.2
182
+ pycparser==2.23
183
+ pydantic==2.11.9
184
+ pydantic_core==2.33.2
185
+ pydub==0.25.1
186
+ Pygments==2.19.2
187
+ pynput==1.8.1
188
+ pyparsing==3.2.4
189
+ pyproject_hooks==1.2.0
190
+ pyserial==3.5
191
+ pytest==8.4.2
192
+ pytest-sphinx==0.6.3
193
+ python-dateutil==2.9.0.post0
194
+ python-Levenshtein==0.27.1
195
+ python-multipart==0.0.20
196
+ python-xlib==0.33
197
+ pytorch-triton-rocm==3.4.0
198
+ pytz==2025.2
199
+ pyyaml-include==1.4.1
200
+ RapidFuzz==3.14.1
201
+ readme_renderer==44.0
202
+ regex==2025.9.1
203
+ requests==2.32.5
204
+ requests-oauthlib==2.0.0
205
+ requests-toolbelt==1.0.0
206
+ requirements-parser==0.13.0
207
+ rerun-sdk==0.22.1
208
+ rfc3986==2.0.0
209
+ rich==13.9.4
210
+ rsa==4.9.1
211
+ ruff==0.13.0
212
+ s3transfer==0.14.0
213
+ safehttpx==0.1.6
214
+ safetensors==0.6.2
215
+ scikit-learn==1.7.2
216
+ scipy==1.15.3
217
+ SecretStorage==3.4.0
218
+ semantic-version==2.10.0
219
+ sentencepiece==0.2.1
220
+ sentry-sdk==2.38.0
221
+ setuptools==78.1.1
222
+ shellingham==1.5.4
223
+ six==1.17.0
224
+ smart_open==7.3.1
225
+ smashed==0.21.5
226
+ smmap==5.0.2
227
+ sniffio==1.3.1
228
+ starlette==0.48.0
229
+ sympy==1.13.3
230
+ tensorboard==2.15.2
231
+ tensorboard==2.19.0
232
+ tensorflow==2.15.0
233
+ tensorflow-addons==0.23.0
234
+ tensorflow-datasets==4.9.3
235
+ tensorflow-estimator==2.15.0
236
+ tensorflow-graphics==2021.12.3
237
+ tensorflow-metadata==1.17.2
238
+ threadpoolctl==3.6.0
239
+ timm==1.0.19
240
+ tokenizers==0.22.0
241
+ toml==0.10.2
242
+ tomli==2.2.1
243
+ tomlkit==0.13.3
244
+ torch==2.8.0+rocm6.4
245
+ torchcodec==0.5
246
+ torchmetrics==1.8.2
247
+ torchvision==0.23.0+rocm6.4
248
+ tqdm==4.67.1
249
+ transformers==4.56.1
250
+ trimesh==4.8.2
251
+ trouting==0.3.3
252
+ twine==6.2.0
253
+ typeguard==2.13.3
254
+ typer==0.17.4
255
+ typing_extensions==4.15.0
256
+ typing-inspect==0.9.0
257
+ typing-inspection==0.4.1
258
+ tzdata==2025.2
259
+ urllib3==2.5.0
260
+ uvicorn==0.35.0
261
+ wandb==0.21.4
262
+ wcwidth==0.2.13
263
+ websockets==15.0.1
264
+ wheel==0.45.1
265
+ wrapt==1.14.2
266
+ xxhash==3.5.0
267
+ yarl==1.20.1
268
+ zipp==3.23.0
269
+ lerobot==0.3.4
270
+ minLoRA==0.1.0
271
+ autocommand==2.2.2
272
+ backports.tarfile==1.2.0
273
+ importlib_metadata==8.0.0
274
+ inflect==7.3.1
275
+ jaraco.collections==5.1.0
276
+ jaraco.context==5.3.0
277
+ jaraco.functools==4.0.1
278
+ jaraco.text==3.12.1
279
+ more-itertools==10.3.0
280
+ packaging==24.2
281
+ platformdirs==4.2.2
282
+ tomli==2.0.1
283
+ typeguard==4.3.0
284
+ typing_extensions==4.12.2
285
+ wheel==0.45.1
286
+ zipp==3.19.2
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-10-08T16:38:34.545687Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_flow_matching",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "flow_matching",
12
+ "--seq_len",
13
+ "1600",
14
+ "--ft_llm",
15
+ "--checkpoint",
16
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
17
+ "--device_train_microbatch_size",
18
+ "16",
19
+ "--global_batch_size",
20
+ "126",
21
+ "--dataset",
22
+ "vla_dataset_realworld",
23
+ "--llm_learning_rate",
24
+ "5e-5",
25
+ "--wandb_entity",
26
+ "henryeap",
27
+ "--wandb_project",
28
+ "a1-realworld",
29
+ "--wandb_run_name",
30
+ "cleandesk50",
31
+ "--real_world_vla_config_path",
32
+ "vla_config_realworld/vla_config_cleandesk50.yaml",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "49712a42d21a8c739a16ba5eeaec4a0d7b29ab80"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_flow_matching/wandb",
44
+ "host": "auh7-1b-gpu-319",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "50988601344"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606960640"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "1",
62
+ "uniqueId": "0x75d378aea8d8934d",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "4",
75
+ "uniqueId": "0x328cfe1d1a9d2b38",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "6",
88
+ "uniqueId": "0x3c4f0005790d7da3",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "0",
101
+ "uniqueId": "0x763c831cad37d9b",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "3",
114
+ "uniqueId": "0x697c203d8e63f05b",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "7",
127
+ "uniqueId": "0x91078b09ae9b0757",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "2",
140
+ "uniqueId": "0x2433899c197738b6",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "5",
153
+ "uniqueId": "0x2bc0f4cfe424c12a",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1760200645",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "2283",
177
+ "job_name": "mh_cleandesk50_flow_matching",
178
+ "job_nodelist": "auh7-1b-gpu-319",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1759941445",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "2283",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-319",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "152784",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-319",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "yfehyqgufhcu23nx6cclbs2foj5p3ccp"
204
+ }
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_timestamp":1.7599418275553412e+09,"_wandb":{"runtime":323},"_runtime":323.700364245,"_step":0,"System/Peak GPU Memory (MB)":36856.41796875}
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:34.695045626Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp3fc8cjxs/port-152974.txt","pid":152974,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-10-08T16:38:34.696250464Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":152974}
3
+ {"time":"2025-10-08T16:38:34.696200463Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-152974-153157-1697716542/socket","Net":"unix"}}
4
+ {"time":"2025-10-08T16:38:34.795279742Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-10-08T16:38:34.805004585Z","level":"INFO","msg":"handleInformInit: received","streamId":"quokv8gn","id":"1(@)"}
6
+ {"time":"2025-10-08T16:38:35.952704891Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"quokv8gn","id":"1(@)"}
7
+ {"time":"2025-10-08T16:44:01.423240817Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"quokv8gn","id":"1(@)"}
8
+ {"time":"2025-10-08T16:44:01.426683567Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"quokv8gn","id":"1(@)"}
9
+ {"time":"2025-10-08T16:44:01.48018829Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-10-08T16:44:01.48022619Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-10-08T16:44:01.48023591Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
12
+ {"time":"2025-10-08T16:44:01.480247021Z","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2025-10-08T16:44:01.480274301Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2025-10-08T16:44:01.480277991Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2025-10-08T16:44:01.480326482Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-152974-153157-1697716542/socket","Net":"unix"}}
16
+ {"time":"2025-10-08T16:44:01.480358322Z","level":"INFO","msg":"server is closed"}
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:34.806823131Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-08T16:38:35.95264112Z","level":"INFO","msg":"stream: created new stream","id":"quokv8gn"}
3
+ {"time":"2025-10-08T16:38:35.952698801Z","level":"INFO","msg":"stream: started","id":"quokv8gn"}
4
+ {"time":"2025-10-08T16:38:35.952731371Z","level":"INFO","msg":"sender: started","stream_id":"quokv8gn"}
5
+ {"time":"2025-10-08T16:38:35.952734591Z","level":"INFO","msg":"writer: started","stream_id":"quokv8gn"}
6
+ {"time":"2025-10-08T16:38:35.952725981Z","level":"INFO","msg":"handler: started","stream_id":"quokv8gn"}
7
+ {"time":"2025-10-08T16:43:59.99384105Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.001054985}],"total_operations":1}}
8
+ {"time":"2025-10-08T16:44:01.036808965Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-10-08T16:44:01.423613682Z","level":"INFO","msg":"stream: closing","id":"quokv8gn"}
10
+ {"time":"2025-10-08T16:44:01.423631393Z","level":"INFO","msg":"handler: closed","stream_id":"quokv8gn"}
11
+ {"time":"2025-10-08T16:44:01.425219996Z","level":"INFO","msg":"sender: closed","stream_id":"quokv8gn"}
12
+ {"time":"2025-10-08T16:44:01.425239166Z","level":"INFO","msg":"stream: closed","id":"quokv8gn"}
cleandesk50_flow_matching/wandb/wandb/run-20251008_163834-quokv8gn/logs/debug.log ADDED
File without changes
cleandesk50_l1_regression/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:31.726288089Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-08T16:38:33.081399086Z","level":"INFO","msg":"stream: created new stream","id":"fqdwkc8m"}
3
+ {"time":"2025-10-08T16:38:33.081437966Z","level":"INFO","msg":"stream: started","id":"fqdwkc8m"}
4
+ {"time":"2025-10-08T16:38:33.081464946Z","level":"INFO","msg":"sender: started","stream_id":"fqdwkc8m"}
5
+ {"time":"2025-10-08T16:38:33.081464936Z","level":"INFO","msg":"writer: started","stream_id":"fqdwkc8m"}
6
+ {"time":"2025-10-08T16:38:33.081488306Z","level":"INFO","msg":"handler: started","stream_id":"fqdwkc8m"}
7
+ {"time":"2025-10-08T16:44:35.233389442Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.00059375}],"total_operations":1}}
8
+ {"time":"2025-10-08T16:44:37.058187164Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-10-08T16:44:37.412033949Z","level":"INFO","msg":"stream: closing","id":"fqdwkc8m"}
10
+ {"time":"2025-10-08T16:44:37.412048989Z","level":"INFO","msg":"handler: closed","stream_id":"fqdwkc8m"}
11
+ {"time":"2025-10-08T16:44:37.4130748Z","level":"INFO","msg":"sender: closed","stream_id":"fqdwkc8m"}
12
+ {"time":"2025-10-08T16:44:37.41308173Z","level":"INFO","msg":"stream: closed","id":"fqdwkc8m"}
cleandesk50_l1_regression/wandb/wandb/debug.log ADDED
File without changes
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/config.yaml ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ da16lrefa3ue6fcq4audbkzfio2vskvf:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_l1_regression
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - l1_regression
13
+ - --seq_len
14
+ - "1600"
15
+ - --ft_llm
16
+ - --checkpoint
17
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
18
+ - --device_train_microbatch_size
19
+ - "16"
20
+ - --global_batch_size
21
+ - "126"
22
+ - --dataset
23
+ - vla_dataset_realworld
24
+ - --llm_learning_rate
25
+ - "5e-5"
26
+ - --wandb_entity
27
+ - henryeap
28
+ - --wandb_project
29
+ - a1-realworld
30
+ - --wandb_run_name
31
+ - cleandesk50
32
+ - --real_world_vla_config_path
33
+ - vla_config_realworld/vla_config_cleandesk50.yaml
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "52340371456"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: 49712a42d21a8c739a16ba5eeaec4a0d7b29ab80
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "7"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x21a2e88d06c419dc"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "4"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0xa515afd8ced1d39d"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "2"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x399226d2b2bfa544"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "6"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0xfa8b85a4625b04f"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "3"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0xf61ec17df11883bd"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "1"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0x9b5c1c302c8129f8"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "0"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0x3558c3014c813fdb"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "5"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0x137c9ede1bb1518e"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-188
140
+ memory:
141
+ total: "2434606964736"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_l1_regression/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1760200645"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "2284"
158
+ job_name: mh_cleandesk50_l1_regression
159
+ job_nodelist: auh7-1b-gpu-188
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1759941445"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "2284"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-188
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "2621518"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-188
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-10-08T16:38:31.458924Z"
184
+ writerId: da16lrefa3ue6fcq4audbkzfio2vskvf
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 2
191
+ - 3
192
+ - 5
193
+ - 11
194
+ - 41
195
+ - 49
196
+ - 51
197
+ - 53
198
+ - 63
199
+ - 71
200
+ - 83
201
+ - 95
202
+ - 105
203
+ "2":
204
+ - 1
205
+ - 2
206
+ - 3
207
+ - 5
208
+ - 11
209
+ - 41
210
+ - 49
211
+ - 51
212
+ - 53
213
+ - 63
214
+ - 71
215
+ - 83
216
+ - 95
217
+ - 105
218
+ "3":
219
+ - 2
220
+ - 13
221
+ - 15
222
+ - 16
223
+ - 61
224
+ "4": 3.10.18
225
+ "5": 0.21.4
226
+ "6": 4.56.1
227
+ "10":
228
+ - 19
229
+ "12": 0.21.4
230
+ "13": linux-x86_64
231
+ activation_checkpointing:
232
+ value: whole_layer
233
+ allow_resume:
234
+ value: false
235
+ batch_divisor:
236
+ value: global_batch
237
+ canceled_check_interval:
238
+ value: 50
239
+ checkpoint_dir:
240
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
241
+ compile:
242
+ value: null
243
+ console_log_interval:
244
+ value: 1
245
+ data:
246
+ value:
247
+ dataset: vla_dataset_realworld
248
+ drop_last: true
249
+ for_inference: false
250
+ lerobot_episode_index_end: null
251
+ lerobot_episode_index_start: null
252
+ mixture: null
253
+ multi_modal: torch
254
+ num_workers: 0
255
+ pad: to_max
256
+ persistent_workers: false
257
+ pin_memory: true
258
+ prefetch_factor: null
259
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
260
+ rlds_dataset_name: libero_4_task_suites_no_noops
261
+ rlds_read_threads: 8
262
+ rlds_shuffle_buffer_size: 100000
263
+ rlds_traj_threads: 8
264
+ root_size_mixture: null
265
+ seed: 95818
266
+ sequence_length: 1600
267
+ shuffle: true
268
+ shuffle_messages: false
269
+ split: train
270
+ timeout: 0
271
+ use_proprio: true
272
+ use_wrist_image: true
273
+ device_eval_batch_size:
274
+ value: 4
275
+ device_inf_eval_batch_size:
276
+ value: 16
277
+ device_train_batch_size:
278
+ value: 15
279
+ device_train_grad_accum:
280
+ value: 0
281
+ device_train_microbatch_size:
282
+ value: 16
283
+ dry_run:
284
+ value: false
285
+ early_exit:
286
+ value: false
287
+ epoch:
288
+ value: null
289
+ eval_interval:
290
+ value: 0
291
+ eval_on_load:
292
+ value: false
293
+ eval_subset_num_batches:
294
+ value: -1
295
+ evaluators:
296
+ value:
297
+ - data:
298
+ dataset: vla_dataset_realworld
299
+ drop_last: true
300
+ for_inference: false
301
+ lerobot_episode_index_end: 765
302
+ lerobot_episode_index_start: 353
303
+ mixture: null
304
+ multi_modal: torch
305
+ num_workers: 0
306
+ pad: to_max
307
+ persistent_workers: true
308
+ pin_memory: true
309
+ prefetch_factor: null
310
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
311
+ rlds_dataset_name: libero_4_task_suites_no_noops
312
+ rlds_read_threads: 8
313
+ rlds_shuffle_buffer_size: 256000
314
+ rlds_traj_threads: 8
315
+ root_size_mixture: null
316
+ seed: null
317
+ sequence_length: 1600
318
+ shuffle: false
319
+ shuffle_messages: false
320
+ split: validation
321
+ timeout: 0
322
+ use_proprio: true
323
+ use_wrist_image: true
324
+ device_eval_batch_size: null
325
+ eval_name: null
326
+ label: val
327
+ max_examples: null
328
+ max_new_tokens: 448
329
+ mm_evaluator: null
330
+ save_dir: null
331
+ save_to_checkpoint_dir: false
332
+ skip_if_metrics_cached: true
333
+ subset_num_batches: 64
334
+ extra_steps_after_cancel:
335
+ value: 10
336
+ fast_forward_batches:
337
+ value: null
338
+ force_save_unsharded:
339
+ value: false
340
+ fsdp:
341
+ value:
342
+ hybrid_sharding_num_model_replicas: null
343
+ precision: float
344
+ sharding_strategy: FULL_SHARD
345
+ use_orig_params: true
346
+ wrapping_strategy: by_block_and_size
347
+ ft_connector:
348
+ value: false
349
+ ft_embedding:
350
+ value: lm_head
351
+ ft_llm:
352
+ value: true
353
+ ft_vit:
354
+ value: false
355
+ fused_loss:
356
+ value: null
357
+ gen1_gc_interval:
358
+ value: 1
359
+ global_train_batch_size:
360
+ value: 126
361
+ inf_eval_interval:
362
+ value: -1
363
+ inf_eval_subset_num_batches:
364
+ value: -1
365
+ inf_evaluators:
366
+ value: []
367
+ initial_model_checkpoint:
368
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
369
+ keep_lr_on_load:
370
+ value: true
371
+ load_model_config:
372
+ value: null
373
+ load_path:
374
+ value: null
375
+ load_path_sharded_checkpointer:
376
+ value: null
377
+ lora:
378
+ value: false
379
+ lora_connector:
380
+ value: false
381
+ lora_llm:
382
+ value: false
383
+ lora_rank:
384
+ value: 8
385
+ lora_vit:
386
+ value: false
387
+ max_duration:
388
+ value: 500000
389
+ max_grad_norm:
390
+ value: 1
391
+ max_grad_norm_ratio:
392
+ value: null
393
+ model:
394
+ value:
395
+ action_dim: 7
396
+ action_head: l1_regression
397
+ action_head_dit_depth: 28
398
+ action_head_dit_hidden_size: 1152
399
+ action_head_dit_num_heads: 16
400
+ action_tokenizer:
401
+ identifier: physical-intelligence/fast
402
+ tokenizer_dir: null
403
+ action_use_left_eef: true
404
+ action_use_mobile_base: false
405
+ activation_type: swiglu
406
+ additional_vocab_size: 128
407
+ always_start_with_space: true
408
+ attention_dropout: 0
409
+ attention_layer_norm: false
410
+ attention_layer_norm_with_affine: true
411
+ attention_type: sdpa
412
+ bias_for_layer_norm: null
413
+ block_group_size: 1
414
+ block_type: sequential
415
+ clip_qkv: null
416
+ crop_mode: overlap-and-resize-c2
417
+ d_model: 3584
418
+ default_inference_len: 65
419
+ embedding_dropout: 0
420
+ embedding_size: 152064
421
+ ff_out_size: null
422
+ fix_image_padding: true
423
+ float32_attention: true
424
+ head_dim: null
425
+ horizon: 8
426
+ image_feature_dropout: 0
427
+ image_padding_embed: pad_and_partial_pad
428
+ image_pooling_2d: attention_meanq
429
+ image_pooling_h: 2
430
+ image_pooling_w: 2
431
+ image_projector: mlp
432
+ include_bias: false
433
+ init_cutoff_factor: null
434
+ init_device: null
435
+ init_fn: normal
436
+ init_std: 0.02
437
+ initializer_range: 0.02
438
+ layer_norm_eps: 1e-06
439
+ layer_norm_type: rms
440
+ layer_norm_with_affine: true
441
+ llm_causal_attention: false
442
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
443
+ low_cpu_fsdp: true
444
+ max_crops: 12
445
+ max_position_embeddings: null
446
+ max_sequence_length: 4096
447
+ message_formatting: role
448
+ mlp_hidden_size: 37888
449
+ mlp_ratio: 4
450
+ moe_capacity_factor: 1.25
451
+ moe_dropless: true
452
+ moe_interleave: false
453
+ moe_lbl_in_fp32: false
454
+ moe_log_expert_assignment: false
455
+ moe_loss_weight: 0.1
456
+ moe_mlp_impl: sparse
457
+ moe_num_experts: 8
458
+ moe_shared_expert: false
459
+ moe_top_k: 2
460
+ moe_zloss_weight: null
461
+ multi_annotation_weighting: root_subsegments
462
+ n_heads: 28
463
+ n_kv_heads: 4
464
+ n_layers: 28
465
+ new_embedding_init_range: 0.02
466
+ norm_after: false
467
+ normalize_input_embeds: false
468
+ num_diffusion_inference_steps: 30
469
+ num_diffusion_steps: 1000
470
+ overlap_margins:
471
+ - 4
472
+ - 4
473
+ pad_tokenizer: true
474
+ pad_value: 0
475
+ precision: amp_bf16
476
+ prompt_type: uber_model
477
+ qkv_bias: true
478
+ residual_dropout: 0.1
479
+ response_residual_dropout: 0
480
+ rope: true
481
+ rope_full_precision: true
482
+ rope_theta: 1e+06
483
+ scale_logits: false
484
+ system_prompt_kind: demo_or_style
485
+ tokenizer:
486
+ identifier: Qwen/Qwen2-7B
487
+ tokenizer_dir: null
488
+ use_col_tokens: true
489
+ use_position_ids: true
490
+ use_proprio: true
491
+ vision_backbone:
492
+ attention_dropout: 0
493
+ fsdp_wrap: false
494
+ image_default_input_size:
495
+ - 336
496
+ - 336
497
+ image_dropout_rate: 0
498
+ image_emb_dim: 1024
499
+ image_head_dim: 64
500
+ image_mlp_activations: quick_gelu
501
+ image_mlp_dim: 4096
502
+ image_model_type: openai
503
+ image_norm_eps: 1e-05
504
+ image_num_heads: 16
505
+ image_num_key_value_heads: 16
506
+ image_num_layers: 23
507
+ image_num_pos: 577
508
+ image_patch_size: 14
509
+ image_pos_patch_size: 14
510
+ initializer_range: 0.02
511
+ residual_dropout: 0
512
+ resize_mode: default
513
+ vit_layers:
514
+ - -2
515
+ - -9
516
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
517
+ vocab_size: 152064
518
+ weight_tying: false
519
+ multi_component_grad_norm:
520
+ value: true
521
+ no_pre_train_checkpoint:
522
+ value: true
523
+ optimizer:
524
+ value:
525
+ betas:
526
+ - 0.9
527
+ - 0.95
528
+ connector_betas:
529
+ - 0.9
530
+ - 0.95
531
+ connector_eps: 1e-06
532
+ connector_learning_rate: 0.0002
533
+ connector_weight_decay: 0
534
+ eps: 1e-05
535
+ learning_rate: 0.0001
536
+ llm_betas:
537
+ - 0.9
538
+ - 0.95
539
+ llm_eps: 1e-06
540
+ llm_learning_rate: 5e-05
541
+ llm_weight_decay: 0
542
+ metrics_log_interval: 20
543
+ name: adamw
544
+ vit_betas:
545
+ - 0.9
546
+ - 0.95
547
+ vit_eps: 1e-06
548
+ vit_learning_rate: 6e-06
549
+ vit_weight_decay: 0
550
+ weight_decay: 0.01
551
+ precision:
552
+ value: amp_bf16
553
+ python_profiling:
554
+ value: false
555
+ remote_save_folder:
556
+ value: null
557
+ reset_dataloader_state:
558
+ value: false
559
+ reset_optimizer_state:
560
+ value: false
561
+ reset_trainer_state:
562
+ value: false
563
+ restore_dataloader:
564
+ value: true
565
+ run_name:
566
+ value: cleandesk50_20251008_163748
567
+ save_dataloader_state:
568
+ value: false
569
+ save_folder:
570
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_l1_regression
571
+ save_interval:
572
+ value: 500
573
+ save_interval_action_head:
574
+ value: 500
575
+ save_interval_ephemeral:
576
+ value: null
577
+ save_interval_unsharded:
578
+ value: 500
579
+ save_num_action_head_checkpoints_to_keep:
580
+ value: 2
581
+ save_num_checkpoints_to_keep:
582
+ value: 1
583
+ save_num_unsharded_checkpoints_to_keep:
584
+ value: 1
585
+ save_overwrite:
586
+ value: true
587
+ scheduler:
588
+ value:
589
+ alpha_f: 0.1
590
+ connector_t_warmup: 200
591
+ grad_clip_warmup_factor: null
592
+ grad_clip_warmup_steps: null
593
+ llm_t_warmup: 2000
594
+ name: multimodal
595
+ t_max: null
596
+ t_warmup: 100
597
+ units: steps
598
+ vit_t_warmup: 2000
599
+ warmup_min_lr: 0
600
+ seed:
601
+ value: 6198
602
+ sharded_checkpointer:
603
+ value: torch_legacy
604
+ softmax_auxiliary_loss:
605
+ value: true
606
+ softmax_auxiliary_loss_scale:
607
+ value: 0.0001
608
+ speed_monitor:
609
+ value:
610
+ gpu_flops_available: null
611
+ window_size: 20
612
+ stop_after:
613
+ value: null
614
+ stop_at:
615
+ value: 500000
616
+ time_limit:
617
+ value: null
618
+ torch_profiling:
619
+ value: false
620
+ train_exit_random_layer:
621
+ value: false
622
+ use_lora:
623
+ value: false
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/output.log ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 10/08 [16:38:33] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': None, 'path': None, 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk50', 8, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 8, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 8, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ ****** Skip RLDS main; path not found: None
10
+ ****** start build LeRobot main...
11
+ build_tokenizer, cache_dir None tokenizer_dir None
12
+ 10/08 [16:38:35] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:130
13
+ 10/08 [16:38:36] INFO | >> Loading train dataset: vla_dataset_realworld/train __init__.py:436
14
+ ****** before LeRobot dataset...
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk50
16
+ ****** length of the dataset: 27906
17
+ 10/08 [16:38:38] INFO | >> build_rlds_train_dataset: Loading train dataset: vla_dataset_realworld/train __init__.py:519
18
+ ****** Import RLDSBatchTransform, RLDSDataset successfully.
19
+ ****** before RLDS dataset...
20
+ ****** data_config.rlds_dataset_name: a1_real_world
21
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/zhangjian/datasets/OXE
22
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f46144c0>
23
+ INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
24
+ 100%|██████████| 87212/87212 [00:44<00:00, 1952.50it/s]
25
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f46cf3d0>
26
+ 10/08 [16:39:27] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
27
+ /vast/users/xiaodan/zhangjian/datasets/OXE/jaco_play/0.1.0/dataset_statistics_e081d4716a3da95df91c79d661ae59fa26a43da49db4bf8d716b622b56
28
+ 3b0ea3.json.
29
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f483f160>
30
+ 10/08 [16:39:28] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
31
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_cable_routing/0.1.0/dataset_statistics_08cb4c5b7c5e6c035fc84ea85b2d54c0c46ad608a8763
32
+ 4ebb18374088d23cd76.json.
33
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0c6674130>
34
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
35
+ /vast/users/xiaodan/zhangjian/datasets/OXE/viola/0.1.0/dataset_statistics_2415d8f7de73c8761fedd7c2a9590667fb0d3fdd26664bf4c100222e5cdb89
36
+ b9.json.
37
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f46239d0>
38
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
39
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_autolab_ur5/0.1.0/dataset_statistics_1b798b015e7b2c4e4396719e3aa4d43a2f400b2edf5dbb0
40
+ 820cb3df6943d8ddc.json.
41
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0c6f6e2c0>
42
+ 10/08 [16:39:29] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
43
+ /vast/users/xiaodan/zhangjian/datasets/OXE/austin_buds_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_ccecde24cc01793b221
44
+ 4eb0c4c5d7cc0e3ccc623db99bd892b83552b20decfb7.json.
45
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0c6f4e620>
46
+ INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
47
+ 100%|██████████| 456/456 [00:25<00:00, 17.70it/s]
48
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cdaae350>
49
+ 10/08 [16:39:57] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
50
+ 100%|██████████| 5100/5100 [01:00<00:00, 84.00it/s]
51
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dcaff3d0>
52
+ 10/08 [16:41:03] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
53
+ 100%|██████████| 240/240 [00:05<00:00, 46.96it/s]
54
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dc35bdc0>
55
+ 10/08 [16:41:09] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
56
+ /vast/users/xiaodan/zhangjian/datasets/OXE/austin_sirius_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_cb2e0273f80029a19
57
+ dc3dbb3a3a4118a5598e7bff3ff0245891255825b04b42a.json.
58
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dcb6e380>
59
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
60
+ /vast/users/xiaodan/zhangjian/datasets/OXE/dlr_edan_shared_control_converted_externally_to_rlds/0.1.0/dataset_statistics_b8984563fc3e7ea
61
+ c0803c667ef58c9deaf2e747683568306ea1d83505d532a76.json.
62
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dcb83eb0>
63
+ 10/08 [16:41:10] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
64
+ 100%|██████████| 1500/1500 [00:02<00:00, 544.82it/s]
65
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cd485600>
66
+ 10/08 [16:41:14] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
67
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_fanuc_manipulation/0.1.0/dataset_statistics_a98d349d0364668095ea3ca38c6785e94f35e5e5
68
+ 8e234c88fac83775a923b0d0.json.
69
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0c6f6e560>
70
+ 10/08 [16:41:15] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
71
+ 100%|██████████| 43264/43264 [00:51<00:00, 836.91it/s]
72
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0c66743d0>
73
+ 10/08 [16:42:10] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
74
+ /vast/users/xiaodan/zhangjian/datasets/OXE/roboturk/0.1.0/dataset_statistics_3aa821e17a2937f941d4102cfadcb1154853cb45dcec07ccc66893b01f6
75
+ f1b40.json.
76
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cd4037c0>
77
+ 10/08 [16:42:11] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
78
+ /vast/users/xiaodan/zhangjian/datasets/OXE/toto/0.1.0/dataset_statistics_505a51eb76e85fe0969e8e70e45fb8c9ae5d3b1fae2851c7899bea91f74b979
79
+ 0.json.
80
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0e40fda20>
81
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
82
+ /vast/users/xiaodan/zhangjian/datasets/OXE/ucsd_kitchen_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_1f1a5f310a2d5a6edc
83
+ 0e217370e135c8c8598290f11f57025037adcb0d033926.json.
84
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f55c19c0>
85
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
86
+ /vast/users/xiaodan/zhangjian/datasets/OXE/iamlab_cmu_pickup_insert_converted_externally_to_rlds/0.1.0/dataset_statistics_698a1f892f8866
87
+ af9cb4bd5a23611fa44d8c7d9d816f9b3049d2fc3b62442079.json.
88
+
89
+ ######################################################################################
90
+ # Loading the following 18 datasets (incl. sampling weight): #
91
+ # fractal20220817_data: ====================================================0.529250 #
92
+ # jaco_play: ===============================================================0.010898 #
93
+ # berkeley_cable_routing: ==================================================0.005916 #
94
+ # viola: ===================================================================0.021337 #
95
+ # berkeley_autolab_ur5: ====================================================0.027379 #
96
+ # austin_buds_dataset_converted_externally_to_rlds: ========================0.004768 #
97
+ # nyu_franka_play_dataset_converted_externally_to_rlds: ====================0.018817 #
98
+ # furniture_bench_dataset_converted_externally_to_rlds: ====================0.055185 #
99
+ # austin_sailor_dataset_converted_externally_to_rlds: ======================0.049354 #
100
+ # austin_sirius_dataset_converted_externally_to_rlds: ======================0.039129 #
101
+ # dlr_edan_shared_control_converted_externally_to_rlds: ====================0.001248 #
102
+ # utaustin_mutex: ==========================================================0.050583 #
103
+ # berkeley_fanuc_manipulation: =============================================0.017504 #
104
+ # bc_z: ====================================================================0.168166 #
105
+ # roboturk: ================================================================0.000131 #
106
+ # toto: ====================================================================0.000228 #
107
+ # ucsd_kitchen_dataset_converted_externally_to_rlds: =======================0.000006 #
108
+ # iamlab_cmu_pickup_insert_converted_externally_to_rlds: ===================0.000102 #
109
+ ######################################################################################
110
+
111
+ INFO | >> [*] Threads per Dataset: [14 1 1 1 1 1 1 2 1 1 1 1 1 5 1 1 1 1] dataset.py:563
112
+ INFO | >> [*] Reads per Dataset: [14 1 1 1 1 1 1 2 1 1 1 1 1 5 1 1 1 1] dataset.py:564
113
+ INFO | >> [*] Constructing datasets... dataset.py:567
114
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f4855030>
115
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cd54ed40>
116
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0c6c9ef50>
117
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cadc44c0>
118
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0d57f5210>
119
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dc535f90>
120
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0ec109ff0>
121
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0e41cfbe0>
122
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f4965120>
123
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cd403e80>
124
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dc335330>
125
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0f46cde10>
126
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cd486440>
127
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dcb6ffd0>
128
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dcad3340>
129
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0dcbbba60>
130
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0cd0179d0>
131
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7fb0caf39840>
132
+ 10/08 [16:42:16] INFO | >> [*] Applying frame transforms on dataset... dataset.py:607
133
+ ****** after RLDSDataset initialization!
134
+ ****** length of the dataset: 7154275
135
+ ****** Build rlds train dataset: IterableDatasetWrapper successfully.
136
+ ****** path: None
137
+ ****** Skip AgiBotWorld-Alpha open-source-real-world; path not found: None
138
+ ****** After build vla train dataset...
139
+ ****** iterable_sources: [<olmo.data.dataset.IterableDatasetWrapper object at 0x7fb0f4675060>, <olmo.data.dataset.IterableDatasetWrapper object at 0x7fb0f4615db0>]
140
+ ****** Before build mixed iterable dataset...
141
+ ****** Build vla train dataloader successfully!
142
+ ************************* Build train_dataloader successful!
143
+ ************************* Before build_inf_evaluators
144
+ 10/08 [16:42:17] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
145
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
146
+ warnings.warn( # warn only once
147
+
148
+ ************************* Build evaluators successful!
149
+ ************************* Early exit flags: early_exit=False
150
+ PROPRIO_DIM 16 does not match ACTION_DIM 16 for AffordVLA
151
+ ************************* Initialize model successful!
152
+ ***** state_dict_path: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924/model.pt
153
+ ***** Load checkpoint successful!
154
+ missing keys: ['action_head.model.layer_norm1.weight', 'action_head.model.layer_norm1.bias', 'action_head.model.fc1.weight', 'action_head.model.fc1.bias', 'action_head.model.mlp_resnet_blocks.0.ffn.0.weight', 'action_head.model.mlp_resnet_blocks.0.ffn.0.bias', 'action_head.model.mlp_resnet_blocks.0.ffn.1.weight', 'action_head.model.mlp_resnet_blocks.0.ffn.1.bias', 'action_head.model.mlp_resnet_blocks.1.ffn.0.weight', 'action_head.model.mlp_resnet_blocks.1.ffn.0.bias', 'action_head.model.mlp_resnet_blocks.1.ffn.1.weight', 'action_head.model.mlp_resnet_blocks.1.ffn.1.bias', 'action_head.model.layer_norm2.weight', 'action_head.model.layer_norm2.bias', 'action_head.model.fc2.weight', 'action_head.model.fc2.bias', 'proprio_projector.fc1.weight', 'proprio_projector.fc1.bias', 'proprio_projector.fc2.weight', 'proprio_projector.fc2.bias']
155
+ unexpected keys: []
156
+ ************************* Initialize model successful!
157
+ ************************* Before FSDP model wrapping
158
+ ************************* FSDP model wrapping successful!
159
+ ************************* Before building optimizer and scheduler
160
+ 10/08 [16:44:19] INFO | >> Constructing optimizer with 2 param groups optim.py:1283
161
+ **************************************************
162
+ After building optimizer and scheduler and model, before training, peak GPU memory (MB): 35614
163
+ ************************* VLATrainer initialized successfully!
164
+ ************************* Before trainer.fit()
165
+ Pre-train system metrics
166
+ System/Peak GPU Memory (MB)=35,614
167
+ 10/08 [16:44:31] WARNING | >> /vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py:200: UserWarning: To copy construct from a tensor, it is recommended to use warnings.py:109
168
+ sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
169
+ timestep_list = [torch.tensor(ex["timestep"], dtype=torch.int64) for ex in batch]
170
+
171
+ !!!Training failed: stack expects each tensor to be equal size, but got [] at entry 0 and [1] at entry 1
172
+ Traceback (most recent call last):
173
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 593, in main
174
+ trainer.fit()
175
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/train.py", line 2284, in fit
176
+ for batch in self.train_loader:
177
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
178
+ data = self._next_data()
179
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 790, in _next_data
180
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
181
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 43, in fetch
182
+ return self.collate_fn(data)
183
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py", line 201, in __call__
184
+ out['timestep'] = torch.stack(timestep_list, dim=0)
185
+ RuntimeError: stack expects each tensor to be equal size, but got [] at entry 0 and [1] at entry 1
186
+ wandb: WARNING The `quiet` argument to `wandb.run.finish()` is deprecated, use `wandb.Settings(quiet=...)` to set this instead.
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/requirements.txt ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ termcolor==3.1.0
15
+ Werkzeug==3.1.3
16
+ Brotli==1.1.0
17
+ Farama-Notifications==0.0.4
18
+ MarkupSafe==2.1.5
19
+ PyYAML==6.0.2
20
+ absl-py==2.3.1
21
+ accelerate==1.10.1
22
+ ai2-molmo==0.0.0
23
+ aiofiles==24.1.0
24
+ aiohappyeyeballs==2.6.1
25
+ aiohttp==3.12.15
26
+ aiosignal==1.4.0
27
+ annotated-types==0.7.0
28
+ antlr4-python3-runtime==4.9.3
29
+ anyio==4.10.0
30
+ array_record==0.8.1
31
+ async-timeout==5.0.1
32
+ attrs==25.3.0
33
+ av==15.1.0
34
+ backports.tarfile==1.2.0
35
+ beaker-gantry==3.2.0
36
+ beaker-py==2.5.0
37
+ black==23.12.1
38
+ blinker==1.9.0
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ et_xmlfile==2.0.0
72
+ etils==1.13.0
73
+ evdev==1.9.2
74
+ exceptiongroup==1.3.0
75
+ face==24.0.0
76
+ fastapi==0.116.2
77
+ ffmpy==0.6.1
78
+ fiddle==0.3.0
79
+ filelock==3.13.1
80
+ Flask==3.1.2
81
+ fonttools==4.60.0
82
+ frozenlist==1.7.0
83
+ fsspec==2023.9.2
84
+ ftfy==6.3.1
85
+ gcsfs==2023.9.2
86
+ gitdb==4.0.12
87
+ GitPython==3.1.45
88
+ glom==24.11.0
89
+ google-api-core==2.25.1
90
+ google-auth==2.40.3
91
+ google-auth-oauthlib==1.2.2
92
+ google-cloud-core==2.4.3
93
+ google-cloud-storage==2.19.0
94
+ google-crc32c==1.7.1
95
+ google-resumable-media==2.7.2
96
+ googleapis-common-protos==1.70.0
97
+ gradio==5.46.0
98
+ gradio_client==1.13.0
99
+ graphviz==0.21
100
+ groovy==0.1.2
101
+ grpcio==1.75.0
102
+ gymnasium==0.29.1
103
+ h11==0.16.0
104
+ hf_transfer==0.1.9
105
+ hf-xet==1.1.10
106
+ httpcore==1.0.9
107
+ httpx==0.28.1
108
+ huggingface-hub==0.35.0
109
+ id==1.5.0
110
+ idna==3.10
111
+ imageio==2.37.0
112
+ imageio-ffmpeg==0.6.0
113
+ importlib_metadata==8.7.0
114
+ importlib_resources==6.5.2
115
+ iniconfig==2.1.0
116
+ inquirerpy==0.3.4
117
+ isort==5.12.0
118
+ itsdangerous==2.2.0
119
+ jaraco.classes==3.4.0
120
+ jaraco.context==6.0.1
121
+ jaraco.functools==4.3.0
122
+ jeepney==0.9.0
123
+ Jinja2==3.1.4
124
+ jiter==0.11.0
125
+ jmespath==1.0.1
126
+ joblib==1.5.2
127
+ jsonlines==4.0.0
128
+ keras==2.15.0
129
+ keyring==25.6.0
130
+ kiwisolver==1.4.9
131
+ latex2sympy2_extended==1.10.2
132
+ lerobot==0.3.4
133
+ Levenshtein==0.27.1
134
+ libcst==1.8.4
135
+ lightning-utilities==0.15.2
136
+ markdown-it-py==4.0.0
137
+ math-verify==0.8.0
138
+ matplotlib==3.10.6
139
+ mdurl==0.1.2
140
+ mergedeep==1.3.4
141
+ ml-dtypes==0.2.0
142
+ ml_dtypes==0.5.3
143
+ more-itertools==10.8.0
144
+ mpmath==1.3.0
145
+ msgspec==0.19.0
146
+ multidict==6.6.4
147
+ multiprocess==0.70.16
148
+ mypy==1.3.0
149
+ mypy_extensions==1.1.0
150
+ necessary==0.4.3
151
+ networkx==3.3
152
+ nh3==0.3.0
153
+ nltk==3.9.1
154
+ numpy==1.26.4
155
+ oauthlib==3.3.1
156
+ omegaconf==2.3.0
157
+ openai==1.108.0
158
+ opencv-python-headless==4.12.0.88
159
+ OpenEXR==3.4.0
160
+ openpyxl==3.1.5
161
+ orderly-set==5.5.0
162
+ orjson==3.11.3
163
+ packaging==25.0
164
+ pandas==2.3.2
165
+ pathspec==0.12.1
166
+ petname==2.6
167
+ pfzy==0.3.4
168
+ pillow==11.0.0
169
+ pip==25.2
170
+ platformdirs==4.4.0
171
+ pluggy==1.6.0
172
+ promise==2.3
173
+ prompt_toolkit==3.0.52
174
+ propcache==0.3.2
175
+ proto-plus==1.26.1
176
+ protobuf==4.21.12
177
+ protobuf==6.32.1
178
+ psutil==7.1.0
179
+ pyarrow==21.0.0
180
+ pyasn1==0.6.1
181
+ pyasn1_modules==0.4.2
182
+ pycparser==2.23
183
+ pydantic==2.11.9
184
+ pydantic_core==2.33.2
185
+ pydub==0.25.1
186
+ Pygments==2.19.2
187
+ pynput==1.8.1
188
+ pyparsing==3.2.4
189
+ pyproject_hooks==1.2.0
190
+ pyserial==3.5
191
+ pytest==8.4.2
192
+ pytest-sphinx==0.6.3
193
+ python-dateutil==2.9.0.post0
194
+ python-Levenshtein==0.27.1
195
+ python-multipart==0.0.20
196
+ python-xlib==0.33
197
+ pytorch-triton-rocm==3.4.0
198
+ pytz==2025.2
199
+ pyyaml-include==1.4.1
200
+ RapidFuzz==3.14.1
201
+ readme_renderer==44.0
202
+ regex==2025.9.1
203
+ requests==2.32.5
204
+ requests-oauthlib==2.0.0
205
+ requests-toolbelt==1.0.0
206
+ requirements-parser==0.13.0
207
+ rerun-sdk==0.22.1
208
+ rfc3986==2.0.0
209
+ rich==13.9.4
210
+ rsa==4.9.1
211
+ ruff==0.13.0
212
+ s3transfer==0.14.0
213
+ safehttpx==0.1.6
214
+ safetensors==0.6.2
215
+ scikit-learn==1.7.2
216
+ scipy==1.15.3
217
+ SecretStorage==3.4.0
218
+ semantic-version==2.10.0
219
+ sentencepiece==0.2.1
220
+ sentry-sdk==2.38.0
221
+ setuptools==78.1.1
222
+ shellingham==1.5.4
223
+ six==1.17.0
224
+ smart_open==7.3.1
225
+ smashed==0.21.5
226
+ smmap==5.0.2
227
+ sniffio==1.3.1
228
+ starlette==0.48.0
229
+ sympy==1.13.3
230
+ tensorboard==2.15.2
231
+ tensorboard==2.19.0
232
+ tensorflow==2.15.0
233
+ tensorflow-addons==0.23.0
234
+ tensorflow-datasets==4.9.3
235
+ tensorflow-estimator==2.15.0
236
+ tensorflow-graphics==2021.12.3
237
+ tensorflow-metadata==1.17.2
238
+ threadpoolctl==3.6.0
239
+ timm==1.0.19
240
+ tokenizers==0.22.0
241
+ toml==0.10.2
242
+ tomli==2.2.1
243
+ tomlkit==0.13.3
244
+ torch==2.8.0+rocm6.4
245
+ torchcodec==0.5
246
+ torchmetrics==1.8.2
247
+ torchvision==0.23.0+rocm6.4
248
+ tqdm==4.67.1
249
+ transformers==4.56.1
250
+ trimesh==4.8.2
251
+ trouting==0.3.3
252
+ twine==6.2.0
253
+ typeguard==2.13.3
254
+ typer==0.17.4
255
+ typing_extensions==4.15.0
256
+ typing-inspect==0.9.0
257
+ typing-inspection==0.4.1
258
+ tzdata==2025.2
259
+ urllib3==2.5.0
260
+ uvicorn==0.35.0
261
+ wandb==0.21.4
262
+ wcwidth==0.2.13
263
+ websockets==15.0.1
264
+ wheel==0.45.1
265
+ wrapt==1.14.2
266
+ xxhash==3.5.0
267
+ yarl==1.20.1
268
+ zipp==3.23.0
269
+ lerobot==0.3.4
270
+ minLoRA==0.1.0
271
+ autocommand==2.2.2
272
+ backports.tarfile==1.2.0
273
+ importlib_metadata==8.0.0
274
+ inflect==7.3.1
275
+ jaraco.collections==5.1.0
276
+ jaraco.context==5.3.0
277
+ jaraco.functools==4.0.1
278
+ jaraco.text==3.12.1
279
+ more-itertools==10.3.0
280
+ packaging==24.2
281
+ platformdirs==4.2.2
282
+ tomli==2.0.1
283
+ typeguard==4.3.0
284
+ typing_extensions==4.12.2
285
+ wheel==0.45.1
286
+ zipp==3.19.2
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-10-08T16:38:31.458924Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_l1_regression",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "l1_regression",
12
+ "--seq_len",
13
+ "1600",
14
+ "--ft_llm",
15
+ "--checkpoint",
16
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
17
+ "--device_train_microbatch_size",
18
+ "16",
19
+ "--global_batch_size",
20
+ "126",
21
+ "--dataset",
22
+ "vla_dataset_realworld",
23
+ "--llm_learning_rate",
24
+ "5e-5",
25
+ "--wandb_entity",
26
+ "henryeap",
27
+ "--wandb_project",
28
+ "a1-realworld",
29
+ "--wandb_run_name",
30
+ "cleandesk50",
31
+ "--real_world_vla_config_path",
32
+ "vla_config_realworld/vla_config_cleandesk50.yaml",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "49712a42d21a8c739a16ba5eeaec4a0d7b29ab80"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk50_l1_regression/wandb",
44
+ "host": "auh7-1b-gpu-188",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "52340371456"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606964736"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "7",
62
+ "uniqueId": "0x21a2e88d06c419dc",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "4",
75
+ "uniqueId": "0xa515afd8ced1d39d",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "2",
88
+ "uniqueId": "0x399226d2b2bfa544",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "6",
101
+ "uniqueId": "0xfa8b85a4625b04f",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "3",
114
+ "uniqueId": "0xf61ec17df11883bd",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "1",
127
+ "uniqueId": "0x9b5c1c302c8129f8",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "0",
140
+ "uniqueId": "0x3558c3014c813fdb",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "5",
153
+ "uniqueId": "0x137c9ede1bb1518e",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1760200645",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "2284",
177
+ "job_name": "mh_cleandesk50_l1_regression",
178
+ "job_nodelist": "auh7-1b-gpu-188",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1759941445",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "2284",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-188",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "2621518",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-188",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "da16lrefa3ue6fcq4audbkzfio2vskvf"
204
+ }
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_step":0,"_wandb":{"runtime":361},"_runtime":361.82454539,"System/Peak GPU Memory (MB)":35614.78125,"_timestamp":1.7599418591639297e+09}
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:31.69984144Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpzewm22rr/port-2621708.txt","pid":2621708,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-10-08T16:38:31.702272712Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2621708}
3
+ {"time":"2025-10-08T16:38:31.702674102Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2621708-2621877-1682313073/socket","Net":"unix"}}
4
+ {"time":"2025-10-08T16:38:31.710784598Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-10-08T16:38:31.724491738Z","level":"INFO","msg":"handleInformInit: received","streamId":"fqdwkc8m","id":"1(@)"}
6
+ {"time":"2025-10-08T16:38:33.081443256Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"fqdwkc8m","id":"1(@)"}
7
+ {"time":"2025-10-08T16:44:37.411729039Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"fqdwkc8m","id":"1(@)"}
8
+ {"time":"2025-10-08T16:44:37.41421139Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"fqdwkc8m","id":"1(@)"}
9
+ {"time":"2025-10-08T16:44:37.461669803Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-10-08T16:44:37.461695793Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-10-08T16:44:37.461702223Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-10-08T16:44:37.461709833Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-10-08T16:44:37.461743853Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2025-10-08T16:44:37.461747193Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2025-10-08T16:44:37.461760803Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-2621708-2621877-1682313073/socket","Net":"unix"}}
16
+ {"time":"2025-10-08T16:44:37.461786643Z","level":"INFO","msg":"server is closed"}
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:31.726288089Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-08T16:38:33.081399086Z","level":"INFO","msg":"stream: created new stream","id":"fqdwkc8m"}
3
+ {"time":"2025-10-08T16:38:33.081437966Z","level":"INFO","msg":"stream: started","id":"fqdwkc8m"}
4
+ {"time":"2025-10-08T16:38:33.081464946Z","level":"INFO","msg":"sender: started","stream_id":"fqdwkc8m"}
5
+ {"time":"2025-10-08T16:38:33.081464936Z","level":"INFO","msg":"writer: started","stream_id":"fqdwkc8m"}
6
+ {"time":"2025-10-08T16:38:33.081488306Z","level":"INFO","msg":"handler: started","stream_id":"fqdwkc8m"}
7
+ {"time":"2025-10-08T16:44:35.233389442Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.00059375}],"total_operations":1}}
8
+ {"time":"2025-10-08T16:44:37.058187164Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-10-08T16:44:37.412033949Z","level":"INFO","msg":"stream: closing","id":"fqdwkc8m"}
10
+ {"time":"2025-10-08T16:44:37.412048989Z","level":"INFO","msg":"handler: closed","stream_id":"fqdwkc8m"}
11
+ {"time":"2025-10-08T16:44:37.4130748Z","level":"INFO","msg":"sender: closed","stream_id":"fqdwkc8m"}
12
+ {"time":"2025-10-08T16:44:37.41308173Z","level":"INFO","msg":"stream: closed","id":"fqdwkc8m"}
cleandesk50_l1_regression/wandb/wandb/run-20251008_163831-fqdwkc8m/logs/debug.log ADDED
File without changes
cleandesk_flow_matching/step11500-action-head/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb72b6306ce04d1beb20bb289509f00c39a40845ff7c4b36bf4deb4e83fe82a
3
+ size 1331
cleandesk_flow_matching/step12000-unsharded/config.yaml ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: cleandesk_20251005_163721
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ ff_out_size: null
64
+ additional_vocab_size: 128
65
+ new_embedding_init_range: 0.02
66
+ weight_tying: false
67
+ init_device: null
68
+ init_fn: normal
69
+ init_std: 0.02
70
+ init_cutoff_factor: null
71
+ norm_after: false
72
+ precision: amp_bf16
73
+ max_crops: 12
74
+ crop_mode: overlap-and-resize-c2
75
+ use_col_tokens: true
76
+ prompt_type: uber_model
77
+ system_prompt_kind: demo_or_style
78
+ message_formatting: role
79
+ always_start_with_space: true
80
+ multi_annotation_weighting: root_subsegments
81
+ default_inference_len: 65
82
+ overlap_margins:
83
+ - 4
84
+ - 4
85
+ pad_value: 0.0
86
+ image_padding_embed: pad_and_partial_pad
87
+ fix_image_padding: true
88
+ vit_layers:
89
+ - -2
90
+ - -9
91
+ image_pooling_h: 2
92
+ image_pooling_w: 2
93
+ image_pooling_2d: attention_meanq
94
+ image_projector: mlp
95
+ image_feature_dropout: 0.0
96
+ initializer_range: 0.02
97
+ normalize_input_embeds: false
98
+ use_position_ids: true
99
+ head_dim: null
100
+ action_tokenizer:
101
+ identifier: physical-intelligence/fast
102
+ tokenizer_dir: null
103
+ action_dim: 7
104
+ horizon: 8
105
+ tokenizer:
106
+ identifier: Qwen/Qwen2-7B
107
+ tokenizer_dir: null
108
+ pad_tokenizer: true
109
+ moe_num_experts: 8
110
+ moe_top_k: 2
111
+ moe_mlp_impl: sparse
112
+ moe_log_expert_assignment: false
113
+ moe_shared_expert: false
114
+ moe_lbl_in_fp32: false
115
+ moe_interleave: false
116
+ moe_loss_weight: 0.1
117
+ moe_zloss_weight: null
118
+ moe_dropless: true
119
+ moe_capacity_factor: 1.25
120
+ action_head: flow_matching
121
+ num_diffusion_steps: 1000
122
+ num_diffusion_inference_steps: 30
123
+ use_proprio: true
124
+ action_head_dit_hidden_size: 1152
125
+ action_head_dit_depth: 28
126
+ action_head_dit_num_heads: 16
127
+ llm_causal_attention: false
128
+ action_use_left_eef: true
129
+ action_use_mobile_base: false
130
+ allow_resume: false
131
+ ft_llm: true
132
+ ft_vit: false
133
+ ft_connector: false
134
+ ft_embedding: lm_head
135
+ lora: false
136
+ use_lora: true
137
+ lora_rank: 8
138
+ lora_llm: false
139
+ lora_vit: false
140
+ lora_connector: false
141
+ early_exit: false
142
+ train_exit_random_layer: false
143
+ optimizer:
144
+ name: adamw
145
+ learning_rate: 0.0001
146
+ weight_decay: 0.01
147
+ betas:
148
+ - 0.9
149
+ - 0.95
150
+ eps: 1.0e-05
151
+ connector_learning_rate: 0.0002
152
+ vit_learning_rate: 6.0e-06
153
+ llm_learning_rate: 5.0e-05
154
+ connector_weight_decay: 0.0
155
+ vit_weight_decay: 0.0
156
+ llm_weight_decay: 0.0
157
+ connector_betas:
158
+ - 0.9
159
+ - 0.95
160
+ vit_betas:
161
+ - 0.9
162
+ - 0.95
163
+ llm_betas:
164
+ - 0.9
165
+ - 0.95
166
+ connector_eps: 1.0e-06
167
+ vit_eps: 1.0e-06
168
+ llm_eps: 1.0e-06
169
+ metrics_log_interval: 20
170
+ scheduler:
171
+ name: multimodal
172
+ units: steps
173
+ t_warmup: 100
174
+ t_max: null
175
+ alpha_f: 0.1
176
+ connector_t_warmup: 200
177
+ vit_t_warmup: 2000
178
+ llm_t_warmup: 2000
179
+ grad_clip_warmup_steps: null
180
+ grad_clip_warmup_factor: null
181
+ warmup_min_lr: 0.0
182
+ data:
183
+ dataset: vla_dataset_realworld
184
+ mixture: null
185
+ root_size_mixture: null
186
+ split: train
187
+ seed: 95818
188
+ shuffle_messages: false
189
+ pad: to_max
190
+ sequence_length: 1600
191
+ shuffle: true
192
+ for_inference: false
193
+ multi_modal: torch
194
+ num_workers: 0
195
+ drop_last: true
196
+ pin_memory: true
197
+ prefetch_factor: null
198
+ persistent_workers: false
199
+ timeout: 0
200
+ rlds_dataset_name: libero_4_task_suites_no_noops
201
+ rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk
202
+ use_wrist_image: true
203
+ use_proprio: true
204
+ rlds_shuffle_buffer_size: 100000
205
+ rlds_traj_threads: 8
206
+ rlds_read_threads: 8
207
+ lerobot_episode_index_start: null
208
+ lerobot_episode_index_end: null
209
+ restore_dataloader: true
210
+ fast_forward_batches: null
211
+ evaluators:
212
+ - label: val
213
+ data:
214
+ dataset: vla_dataset_realworld
215
+ mixture: null
216
+ root_size_mixture: null
217
+ split: validation
218
+ seed: null
219
+ shuffle_messages: false
220
+ pad: to_max
221
+ sequence_length: 1600
222
+ shuffle: false
223
+ for_inference: false
224
+ multi_modal: torch
225
+ num_workers: 0
226
+ drop_last: true
227
+ pin_memory: true
228
+ prefetch_factor: null
229
+ persistent_workers: true
230
+ timeout: 0
231
+ rlds_dataset_name: libero_4_task_suites_no_noops
232
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
233
+ use_wrist_image: true
234
+ use_proprio: true
235
+ rlds_shuffle_buffer_size: 256000
236
+ rlds_traj_threads: 8
237
+ rlds_read_threads: 8
238
+ lerobot_episode_index_start: 353
239
+ lerobot_episode_index_end: 765
240
+ device_eval_batch_size: null
241
+ subset_num_batches: 64
242
+ max_examples: null
243
+ max_new_tokens: 448
244
+ mm_evaluator: null
245
+ save_dir: null
246
+ save_to_checkpoint_dir: false
247
+ eval_name: null
248
+ skip_if_metrics_cached: true
249
+ eval_interval: 0
250
+ inf_eval_interval: -1
251
+ inf_evaluators: []
252
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_flow_matching
253
+ remote_save_folder: null
254
+ canceled_check_interval: 50
255
+ save_interval: 500
256
+ save_interval_unsharded: 500
257
+ save_interval_ephemeral: null
258
+ save_interval_action_head: 500
259
+ save_num_checkpoints_to_keep: 1
260
+ save_num_unsharded_checkpoints_to_keep: 1
261
+ save_num_action_head_checkpoints_to_keep: 2
262
+ save_overwrite: true
263
+ force_save_unsharded: false
264
+ no_pre_train_checkpoint: true
265
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
266
+ load_model_config: null
267
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
268
+ load_path: null
269
+ load_path_sharded_checkpointer: null
270
+ reset_optimizer_state: false
271
+ reset_trainer_state: false
272
+ save_dataloader_state: false
273
+ reset_dataloader_state: false
274
+ keep_lr_on_load: true
275
+ sharded_checkpointer: torch_legacy
276
+ max_duration: 500000
277
+ global_train_batch_size: 126
278
+ device_train_batch_size: 15
279
+ device_train_microbatch_size: 16
280
+ device_eval_batch_size: 4
281
+ eval_subset_num_batches: -1
282
+ eval_on_load: false
283
+ device_inf_eval_batch_size: 16
284
+ inf_eval_subset_num_batches: -1
285
+ device_train_grad_accum: 0
286
+ max_grad_norm: 1.0
287
+ multi_component_grad_norm: true
288
+ batch_divisor: global_batch
289
+ max_grad_norm_ratio: null
290
+ precision: amp_bf16
291
+ wandb:
292
+ project: a1-realworld
293
+ entity: henryeap
294
+ group: null
295
+ name: cleandesk_20251005_163721
296
+ tags:
297
+ - watching
298
+ log_artifacts: false
299
+ rank_zero_only: true
300
+ log_interval: 1
301
+ speed_monitor:
302
+ window_size: 20
303
+ gpu_flops_available: null
304
+ console_log_interval: 1
305
+ gen1_gc_interval: 1
306
+ compile: null
307
+ fsdp:
308
+ use_orig_params: true
309
+ sharding_strategy: FULL_SHARD
310
+ wrapping_strategy: by_block_and_size
311
+ precision: float
312
+ hybrid_sharding_num_model_replicas: null
313
+ softmax_auxiliary_loss: true
314
+ softmax_auxiliary_loss_scale: 0.0001
315
+ time_limit: null
316
+ extra_steps_after_cancel: 10
317
+ python_profiling: false
318
+ torch_profiling: false
319
+ stop_at: 500000
320
+ stop_after: null
321
+ activation_checkpointing: whole_layer
322
+ fused_loss: null
cleandesk_flow_matching/step12000/config.yaml ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: cleandesk_20251005_163721
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ ff_out_size: null
64
+ additional_vocab_size: 128
65
+ new_embedding_init_range: 0.02
66
+ weight_tying: false
67
+ init_device: null
68
+ init_fn: normal
69
+ init_std: 0.02
70
+ init_cutoff_factor: null
71
+ norm_after: false
72
+ precision: amp_bf16
73
+ max_crops: 12
74
+ crop_mode: overlap-and-resize-c2
75
+ use_col_tokens: true
76
+ prompt_type: uber_model
77
+ system_prompt_kind: demo_or_style
78
+ message_formatting: role
79
+ always_start_with_space: true
80
+ multi_annotation_weighting: root_subsegments
81
+ default_inference_len: 65
82
+ overlap_margins:
83
+ - 4
84
+ - 4
85
+ pad_value: 0.0
86
+ image_padding_embed: pad_and_partial_pad
87
+ fix_image_padding: true
88
+ vit_layers:
89
+ - -2
90
+ - -9
91
+ image_pooling_h: 2
92
+ image_pooling_w: 2
93
+ image_pooling_2d: attention_meanq
94
+ image_projector: mlp
95
+ image_feature_dropout: 0.0
96
+ initializer_range: 0.02
97
+ normalize_input_embeds: false
98
+ use_position_ids: true
99
+ head_dim: null
100
+ action_tokenizer:
101
+ identifier: physical-intelligence/fast
102
+ tokenizer_dir: null
103
+ action_dim: 7
104
+ horizon: 8
105
+ tokenizer:
106
+ identifier: Qwen/Qwen2-7B
107
+ tokenizer_dir: null
108
+ pad_tokenizer: true
109
+ moe_num_experts: 8
110
+ moe_top_k: 2
111
+ moe_mlp_impl: sparse
112
+ moe_log_expert_assignment: false
113
+ moe_shared_expert: false
114
+ moe_lbl_in_fp32: false
115
+ moe_interleave: false
116
+ moe_loss_weight: 0.1
117
+ moe_zloss_weight: null
118
+ moe_dropless: true
119
+ moe_capacity_factor: 1.25
120
+ action_head: flow_matching
121
+ num_diffusion_steps: 1000
122
+ num_diffusion_inference_steps: 30
123
+ use_proprio: true
124
+ action_head_dit_hidden_size: 1152
125
+ action_head_dit_depth: 28
126
+ action_head_dit_num_heads: 16
127
+ llm_causal_attention: false
128
+ action_use_left_eef: true
129
+ action_use_mobile_base: false
130
+ allow_resume: false
131
+ ft_llm: true
132
+ ft_vit: false
133
+ ft_connector: false
134
+ ft_embedding: lm_head
135
+ lora: false
136
+ use_lora: true
137
+ lora_rank: 8
138
+ lora_llm: false
139
+ lora_vit: false
140
+ lora_connector: false
141
+ early_exit: false
142
+ train_exit_random_layer: false
143
+ optimizer:
144
+ name: adamw
145
+ learning_rate: 0.0001
146
+ weight_decay: 0.01
147
+ betas:
148
+ - 0.9
149
+ - 0.95
150
+ eps: 1.0e-05
151
+ connector_learning_rate: 0.0002
152
+ vit_learning_rate: 6.0e-06
153
+ llm_learning_rate: 5.0e-05
154
+ connector_weight_decay: 0.0
155
+ vit_weight_decay: 0.0
156
+ llm_weight_decay: 0.0
157
+ connector_betas:
158
+ - 0.9
159
+ - 0.95
160
+ vit_betas:
161
+ - 0.9
162
+ - 0.95
163
+ llm_betas:
164
+ - 0.9
165
+ - 0.95
166
+ connector_eps: 1.0e-06
167
+ vit_eps: 1.0e-06
168
+ llm_eps: 1.0e-06
169
+ metrics_log_interval: 20
170
+ scheduler:
171
+ name: multimodal
172
+ units: steps
173
+ t_warmup: 100
174
+ t_max: null
175
+ alpha_f: 0.1
176
+ connector_t_warmup: 200
177
+ vit_t_warmup: 2000
178
+ llm_t_warmup: 2000
179
+ grad_clip_warmup_steps: null
180
+ grad_clip_warmup_factor: null
181
+ warmup_min_lr: 0.0
182
+ data:
183
+ dataset: vla_dataset_realworld
184
+ mixture: null
185
+ root_size_mixture: null
186
+ split: train
187
+ seed: 95818
188
+ shuffle_messages: false
189
+ pad: to_max
190
+ sequence_length: 1600
191
+ shuffle: true
192
+ for_inference: false
193
+ multi_modal: torch
194
+ num_workers: 0
195
+ drop_last: true
196
+ pin_memory: true
197
+ prefetch_factor: null
198
+ persistent_workers: false
199
+ timeout: 0
200
+ rlds_dataset_name: libero_4_task_suites_no_noops
201
+ rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk
202
+ use_wrist_image: true
203
+ use_proprio: true
204
+ rlds_shuffle_buffer_size: 100000
205
+ rlds_traj_threads: 8
206
+ rlds_read_threads: 8
207
+ lerobot_episode_index_start: null
208
+ lerobot_episode_index_end: null
209
+ restore_dataloader: true
210
+ fast_forward_batches: null
211
+ evaluators:
212
+ - label: val
213
+ data:
214
+ dataset: vla_dataset_realworld
215
+ mixture: null
216
+ root_size_mixture: null
217
+ split: validation
218
+ seed: null
219
+ shuffle_messages: false
220
+ pad: to_max
221
+ sequence_length: 1600
222
+ shuffle: false
223
+ for_inference: false
224
+ multi_modal: torch
225
+ num_workers: 0
226
+ drop_last: true
227
+ pin_memory: true
228
+ prefetch_factor: null
229
+ persistent_workers: true
230
+ timeout: 0
231
+ rlds_dataset_name: libero_4_task_suites_no_noops
232
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
233
+ use_wrist_image: true
234
+ use_proprio: true
235
+ rlds_shuffle_buffer_size: 256000
236
+ rlds_traj_threads: 8
237
+ rlds_read_threads: 8
238
+ lerobot_episode_index_start: 353
239
+ lerobot_episode_index_end: 765
240
+ device_eval_batch_size: null
241
+ subset_num_batches: 64
242
+ max_examples: null
243
+ max_new_tokens: 448
244
+ mm_evaluator: null
245
+ save_dir: null
246
+ save_to_checkpoint_dir: false
247
+ eval_name: null
248
+ skip_if_metrics_cached: true
249
+ eval_interval: 0
250
+ inf_eval_interval: -1
251
+ inf_evaluators: []
252
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_flow_matching
253
+ remote_save_folder: null
254
+ canceled_check_interval: 50
255
+ save_interval: 500
256
+ save_interval_unsharded: 500
257
+ save_interval_ephemeral: null
258
+ save_interval_action_head: 500
259
+ save_num_checkpoints_to_keep: 1
260
+ save_num_unsharded_checkpoints_to_keep: 1
261
+ save_num_action_head_checkpoints_to_keep: 2
262
+ save_overwrite: true
263
+ force_save_unsharded: false
264
+ no_pre_train_checkpoint: true
265
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
266
+ load_model_config: null
267
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
268
+ load_path: null
269
+ load_path_sharded_checkpointer: null
270
+ reset_optimizer_state: false
271
+ reset_trainer_state: false
272
+ save_dataloader_state: false
273
+ reset_dataloader_state: false
274
+ keep_lr_on_load: true
275
+ sharded_checkpointer: torch_legacy
276
+ max_duration: 500000
277
+ global_train_batch_size: 126
278
+ device_train_batch_size: 15
279
+ device_train_microbatch_size: 16
280
+ device_eval_batch_size: 4
281
+ eval_subset_num_batches: -1
282
+ eval_on_load: false
283
+ device_inf_eval_batch_size: 16
284
+ inf_eval_subset_num_batches: -1
285
+ device_train_grad_accum: 0
286
+ max_grad_norm: 1.0
287
+ multi_component_grad_norm: true
288
+ batch_divisor: global_batch
289
+ max_grad_norm_ratio: null
290
+ precision: amp_bf16
291
+ wandb:
292
+ project: a1-realworld
293
+ entity: henryeap
294
+ group: null
295
+ name: cleandesk_20251005_163721
296
+ tags:
297
+ - watching
298
+ log_artifacts: false
299
+ rank_zero_only: true
300
+ log_interval: 1
301
+ speed_monitor:
302
+ window_size: 20
303
+ gpu_flops_available: null
304
+ console_log_interval: 1
305
+ gen1_gc_interval: 1
306
+ compile: null
307
+ fsdp:
308
+ use_orig_params: true
309
+ sharding_strategy: FULL_SHARD
310
+ wrapping_strategy: by_block_and_size
311
+ precision: float
312
+ hybrid_sharding_num_model_replicas: null
313
+ softmax_auxiliary_loss: true
314
+ softmax_auxiliary_loss_scale: 0.0001
315
+ time_limit: null
316
+ extra_steps_after_cancel: 10
317
+ python_profiling: false
318
+ torch_profiling: false
319
+ stop_at: 500000
320
+ stop_after: null
321
+ activation_checkpointing: whole_layer
322
+ fused_loss: null
cleandesk_flow_matching/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-05T16:38:02.602917026Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-05T16:38:03.762826737Z","level":"INFO","msg":"stream: created new stream","id":"gqyapbwp"}
3
+ {"time":"2025-10-05T16:38:03.762885338Z","level":"INFO","msg":"stream: started","id":"gqyapbwp"}
4
+ {"time":"2025-10-05T16:38:03.762906828Z","level":"INFO","msg":"writer: started","stream_id":"gqyapbwp"}
5
+ {"time":"2025-10-05T16:38:03.762906838Z","level":"INFO","msg":"sender: started","stream_id":"gqyapbwp"}
6
+ {"time":"2025-10-05T16:38:03.762920708Z","level":"INFO","msg":"handler: started","stream_id":"gqyapbwp"}
7
+ {"time":"2025-10-06T19:04:19.555419176Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
8
+ {"time":"2025-10-06T20:03:34.950654374Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-10-07T15:02:38.499153299Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
10
+ {"time":"2025-10-07T21:28:37.643147942Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-10-07T22:22:44.986859439Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-10-07T23:26:33.122893273Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/henryeap/a1-realworld/gqyapbwp/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
cleandesk_flow_matching/wandb/wandb/debug.log ADDED
File without changes
cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/files/requirements.txt ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ termcolor==3.1.0
15
+ Werkzeug==3.1.3
16
+ Brotli==1.1.0
17
+ Farama-Notifications==0.0.4
18
+ MarkupSafe==2.1.5
19
+ PyYAML==6.0.2
20
+ absl-py==2.3.1
21
+ accelerate==1.10.1
22
+ ai2-molmo==0.0.0
23
+ aiofiles==24.1.0
24
+ aiohappyeyeballs==2.6.1
25
+ aiohttp==3.12.15
26
+ aiosignal==1.4.0
27
+ annotated-types==0.7.0
28
+ antlr4-python3-runtime==4.9.3
29
+ anyio==4.10.0
30
+ array_record==0.8.1
31
+ async-timeout==5.0.1
32
+ attrs==25.3.0
33
+ av==15.1.0
34
+ backports.tarfile==1.2.0
35
+ beaker-gantry==3.2.0
36
+ beaker-py==2.5.0
37
+ black==23.12.1
38
+ blinker==1.9.0
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ et_xmlfile==2.0.0
72
+ etils==1.13.0
73
+ evdev==1.9.2
74
+ exceptiongroup==1.3.0
75
+ face==24.0.0
76
+ fastapi==0.116.2
77
+ ffmpy==0.6.1
78
+ fiddle==0.3.0
79
+ filelock==3.13.1
80
+ Flask==3.1.2
81
+ fonttools==4.60.0
82
+ frozenlist==1.7.0
83
+ fsspec==2023.9.2
84
+ ftfy==6.3.1
85
+ gcsfs==2023.9.2
86
+ gitdb==4.0.12
87
+ GitPython==3.1.45
88
+ glom==24.11.0
89
+ google-api-core==2.25.1
90
+ google-auth==2.40.3
91
+ google-auth-oauthlib==1.2.2
92
+ google-cloud-core==2.4.3
93
+ google-cloud-storage==2.19.0
94
+ google-crc32c==1.7.1
95
+ google-resumable-media==2.7.2
96
+ googleapis-common-protos==1.70.0
97
+ gradio==5.46.0
98
+ gradio_client==1.13.0
99
+ graphviz==0.21
100
+ groovy==0.1.2
101
+ grpcio==1.75.0
102
+ gymnasium==0.29.1
103
+ h11==0.16.0
104
+ hf_transfer==0.1.9
105
+ hf-xet==1.1.10
106
+ httpcore==1.0.9
107
+ httpx==0.28.1
108
+ huggingface-hub==0.35.0
109
+ id==1.5.0
110
+ idna==3.10
111
+ imageio==2.37.0
112
+ imageio-ffmpeg==0.6.0
113
+ importlib_metadata==8.7.0
114
+ importlib_resources==6.5.2
115
+ iniconfig==2.1.0
116
+ inquirerpy==0.3.4
117
+ isort==5.12.0
118
+ itsdangerous==2.2.0
119
+ jaraco.classes==3.4.0
120
+ jaraco.context==6.0.1
121
+ jaraco.functools==4.3.0
122
+ jeepney==0.9.0
123
+ Jinja2==3.1.4
124
+ jiter==0.11.0
125
+ jmespath==1.0.1
126
+ joblib==1.5.2
127
+ jsonlines==4.0.0
128
+ keras==2.15.0
129
+ keyring==25.6.0
130
+ kiwisolver==1.4.9
131
+ latex2sympy2_extended==1.10.2
132
+ lerobot==0.3.4
133
+ Levenshtein==0.27.1
134
+ libcst==1.8.4
135
+ lightning-utilities==0.15.2
136
+ markdown-it-py==4.0.0
137
+ math-verify==0.8.0
138
+ matplotlib==3.10.6
139
+ mdurl==0.1.2
140
+ mergedeep==1.3.4
141
+ ml-dtypes==0.2.0
142
+ ml_dtypes==0.5.3
143
+ more-itertools==10.8.0
144
+ mpmath==1.3.0
145
+ msgspec==0.19.0
146
+ multidict==6.6.4
147
+ multiprocess==0.70.16
148
+ mypy==1.3.0
149
+ mypy_extensions==1.1.0
150
+ necessary==0.4.3
151
+ networkx==3.3
152
+ nh3==0.3.0
153
+ nltk==3.9.1
154
+ numpy==1.26.4
155
+ oauthlib==3.3.1
156
+ omegaconf==2.3.0
157
+ openai==1.108.0
158
+ opencv-python-headless==4.12.0.88
159
+ OpenEXR==3.4.0
160
+ openpyxl==3.1.5
161
+ orderly-set==5.5.0
162
+ orjson==3.11.3
163
+ packaging==25.0
164
+ pandas==2.3.2
165
+ pathspec==0.12.1
166
+ petname==2.6
167
+ pfzy==0.3.4
168
+ pillow==11.0.0
169
+ pip==25.2
170
+ platformdirs==4.4.0
171
+ pluggy==1.6.0
172
+ promise==2.3
173
+ prompt_toolkit==3.0.52
174
+ propcache==0.3.2
175
+ proto-plus==1.26.1
176
+ protobuf==4.21.12
177
+ protobuf==6.32.1
178
+ psutil==7.1.0
179
+ pyarrow==21.0.0
180
+ pyasn1==0.6.1
181
+ pyasn1_modules==0.4.2
182
+ pycparser==2.23
183
+ pydantic==2.11.9
184
+ pydantic_core==2.33.2
185
+ pydub==0.25.1
186
+ Pygments==2.19.2
187
+ pynput==1.8.1
188
+ pyparsing==3.2.4
189
+ pyproject_hooks==1.2.0
190
+ pyserial==3.5
191
+ pytest==8.4.2
192
+ pytest-sphinx==0.6.3
193
+ python-dateutil==2.9.0.post0
194
+ python-Levenshtein==0.27.1
195
+ python-multipart==0.0.20
196
+ python-xlib==0.33
197
+ pytorch-triton-rocm==3.4.0
198
+ pytz==2025.2
199
+ pyyaml-include==1.4.1
200
+ RapidFuzz==3.14.1
201
+ readme_renderer==44.0
202
+ regex==2025.9.1
203
+ requests==2.32.5
204
+ requests-oauthlib==2.0.0
205
+ requests-toolbelt==1.0.0
206
+ requirements-parser==0.13.0
207
+ rerun-sdk==0.22.1
208
+ rfc3986==2.0.0
209
+ rich==13.9.4
210
+ rsa==4.9.1
211
+ ruff==0.13.0
212
+ s3transfer==0.14.0
213
+ safehttpx==0.1.6
214
+ safetensors==0.6.2
215
+ scikit-learn==1.7.2
216
+ scipy==1.15.3
217
+ SecretStorage==3.4.0
218
+ semantic-version==2.10.0
219
+ sentencepiece==0.2.1
220
+ sentry-sdk==2.38.0
221
+ setuptools==78.1.1
222
+ shellingham==1.5.4
223
+ six==1.17.0
224
+ smart_open==7.3.1
225
+ smashed==0.21.5
226
+ smmap==5.0.2
227
+ sniffio==1.3.1
228
+ starlette==0.48.0
229
+ sympy==1.13.3
230
+ tensorboard==2.15.2
231
+ tensorboard==2.19.0
232
+ tensorflow==2.15.0
233
+ tensorflow-addons==0.23.0
234
+ tensorflow-datasets==4.9.3
235
+ tensorflow-estimator==2.15.0
236
+ tensorflow-graphics==2021.12.3
237
+ tensorflow-metadata==1.17.2
238
+ threadpoolctl==3.6.0
239
+ timm==1.0.19
240
+ tokenizers==0.22.0
241
+ toml==0.10.2
242
+ tomli==2.2.1
243
+ tomlkit==0.13.3
244
+ torch==2.8.0+rocm6.4
245
+ torchcodec==0.5
246
+ torchmetrics==1.8.2
247
+ torchvision==0.23.0+rocm6.4
248
+ tqdm==4.67.1
249
+ transformers==4.56.1
250
+ trimesh==4.8.2
251
+ trouting==0.3.3
252
+ twine==6.2.0
253
+ typeguard==2.13.3
254
+ typer==0.17.4
255
+ typing_extensions==4.15.0
256
+ typing-inspect==0.9.0
257
+ typing-inspection==0.4.1
258
+ tzdata==2025.2
259
+ urllib3==2.5.0
260
+ uvicorn==0.35.0
261
+ wandb==0.21.4
262
+ wcwidth==0.2.13
263
+ websockets==15.0.1
264
+ wheel==0.45.1
265
+ wrapt==1.14.2
266
+ xxhash==3.5.0
267
+ yarl==1.20.1
268
+ zipp==3.23.0
269
+ lerobot==0.3.4
270
+ minLoRA==0.1.0
271
+ autocommand==2.2.2
272
+ backports.tarfile==1.2.0
273
+ importlib_metadata==8.0.0
274
+ inflect==7.3.1
275
+ jaraco.collections==5.1.0
276
+ jaraco.context==5.3.0
277
+ jaraco.functools==4.0.1
278
+ jaraco.text==3.12.1
279
+ more-itertools==10.3.0
280
+ packaging==24.2
281
+ platformdirs==4.2.2
282
+ tomli==2.0.1
283
+ typeguard==4.3.0
284
+ typing_extensions==4.12.2
285
+ wheel==0.45.1
286
+ zipp==3.19.2
cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-10-05T16:38:02.136539Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_flow_matching",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "flow_matching",
12
+ "--seq_len",
13
+ "1600",
14
+ "--ft_llm",
15
+ "--checkpoint",
16
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
17
+ "--device_train_microbatch_size",
18
+ "16",
19
+ "--global_batch_size",
20
+ "126",
21
+ "--dataset",
22
+ "vla_dataset_realworld",
23
+ "--llm_learning_rate",
24
+ "5e-5",
25
+ "--wandb_entity",
26
+ "henryeap",
27
+ "--wandb_project",
28
+ "a1-realworld",
29
+ "--wandb_run_name",
30
+ "cleandesk",
31
+ "--real_world_vla_config_path",
32
+ "vla_config_realworld/vla_config_cleandesk.yaml",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "5071f59d87c6a976691323cbac66d7a988b0b4e7"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_flow_matching/wandb",
44
+ "host": "auh7-1b-gpu-320",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "3778763694080",
53
+ "used": "55512412160"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606968832"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "0",
62
+ "uniqueId": "0xdc567fc68d1a0c91",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "1",
75
+ "uniqueId": "0xc976bbc2ad247ea6",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "5",
88
+ "uniqueId": "0xc7fbd07780c2f202",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "7",
101
+ "uniqueId": "0xe56b0d719426d5a8",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "3",
114
+ "uniqueId": "0xa7a04689129eefa4",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "4",
127
+ "uniqueId": "0xec560c9e435b50ba",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "2",
140
+ "uniqueId": "0xcbcb7103099a436c",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "6",
153
+ "uniqueId": "0x5bb2d7fad259574f",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1759941420",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "2281",
177
+ "job_name": "mh_cleandesk_flow_matching",
178
+ "job_nodelist": "auh7-1b-gpu-320",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1759682220",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "2281",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-320",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "561699",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-320",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "cg68x37yky6rbl9tr7pshd5fx8s61qiy"
204
+ }
cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/logs/debug-core.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2025-10-05T16:38:02.386747526Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpn27ektbq/port-561890.txt","pid":561890,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-10-05T16:38:02.388360677Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":561890}
3
+ {"time":"2025-10-05T16:38:02.389267188Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-561890-562080-1724540830/socket","Net":"unix"}}
4
+ {"time":"2025-10-05T16:38:02.586570534Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-10-05T16:38:02.600696794Z","level":"INFO","msg":"handleInformInit: received","streamId":"gqyapbwp","id":"1(@)"}
6
+ {"time":"2025-10-05T16:38:03.762891268Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"gqyapbwp","id":"1(@)"}
cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-05T16:38:02.602917026Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-05T16:38:03.762826737Z","level":"INFO","msg":"stream: created new stream","id":"gqyapbwp"}
3
+ {"time":"2025-10-05T16:38:03.762885338Z","level":"INFO","msg":"stream: started","id":"gqyapbwp"}
4
+ {"time":"2025-10-05T16:38:03.762906828Z","level":"INFO","msg":"writer: started","stream_id":"gqyapbwp"}
5
+ {"time":"2025-10-05T16:38:03.762906838Z","level":"INFO","msg":"sender: started","stream_id":"gqyapbwp"}
6
+ {"time":"2025-10-05T16:38:03.762920708Z","level":"INFO","msg":"handler: started","stream_id":"gqyapbwp"}
7
+ {"time":"2025-10-06T19:04:19.555419176Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
8
+ {"time":"2025-10-06T20:03:34.950654374Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-10-07T15:02:38.499153299Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
10
+ {"time":"2025-10-07T21:28:37.643147942Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-10-07T22:22:44.986859439Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-10-07T23:26:33.122893273Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/henryeap/a1-realworld/gqyapbwp/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
cleandesk_flow_matching/wandb/wandb/run-20251005_163802-gqyapbwp/logs/debug.log ADDED
File without changes
cleandesk_l1_regression/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:32.19998745Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-08T16:38:33.339827981Z","level":"INFO","msg":"stream: created new stream","id":"76mxu43t"}
3
+ {"time":"2025-10-08T16:38:33.339874102Z","level":"INFO","msg":"stream: started","id":"76mxu43t"}
4
+ {"time":"2025-10-08T16:38:33.339905492Z","level":"INFO","msg":"handler: started","stream_id":"76mxu43t"}
5
+ {"time":"2025-10-08T16:38:33.339893552Z","level":"INFO","msg":"writer: started","stream_id":"76mxu43t"}
6
+ {"time":"2025-10-08T16:38:33.339947873Z","level":"INFO","msg":"sender: started","stream_id":"76mxu43t"}
7
+ {"time":"2025-10-08T16:43:58.756754711Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.006574606}],"total_operations":1}}
8
+ {"time":"2025-10-08T16:43:59.766243448Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-10-08T16:44:00.127335826Z","level":"INFO","msg":"stream: closing","id":"76mxu43t"}
10
+ {"time":"2025-10-08T16:44:00.127349836Z","level":"INFO","msg":"handler: closed","stream_id":"76mxu43t"}
11
+ {"time":"2025-10-08T16:44:00.128408003Z","level":"INFO","msg":"sender: closed","stream_id":"76mxu43t"}
12
+ {"time":"2025-10-08T16:44:00.128424754Z","level":"INFO","msg":"stream: closed","id":"76mxu43t"}
cleandesk_l1_regression/wandb/wandb/debug.log ADDED
File without changes
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/config.yaml ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.21.4
4
+ e:
5
+ 9zghejqbkg668a368vduhoyzhbv4wgq6:
6
+ args:
7
+ - qwen2_7b
8
+ - save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_l1_regression
9
+ - --vision_backbone
10
+ - openai
11
+ - --action_head
12
+ - l1_regression
13
+ - --seq_len
14
+ - "1600"
15
+ - --ft_llm
16
+ - --checkpoint
17
+ - /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
18
+ - --device_train_microbatch_size
19
+ - "16"
20
+ - --global_batch_size
21
+ - "126"
22
+ - --dataset
23
+ - vla_dataset_realworld
24
+ - --llm_learning_rate
25
+ - "5e-5"
26
+ - --wandb_entity
27
+ - henryeap
28
+ - --wandb_project
29
+ - a1-realworld
30
+ - --wandb_run_name
31
+ - cleandesk
32
+ - --real_world_vla_config_path
33
+ - vla_config_realworld/vla_config_cleandesk.yaml
34
+ - --save_overwrite
35
+ codePath: launch_scripts/train_vla.py
36
+ codePathLocal: launch_scripts/train_vla.py
37
+ cpu_count: 64
38
+ cpu_count_logical: 128
39
+ disk:
40
+ /:
41
+ total: "470343073792"
42
+ used: "50668195840"
43
+ email: ihenrykwok@outlook.com
44
+ executable: /vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10
45
+ git:
46
+ commit: 49712a42d21a8c739a16ba5eeaec4a0d7b29ab80
47
+ remote: https://github.com/Spatialtemporal-AI/A1.git
48
+ gpu: Instinct MI210
49
+ gpu_amd:
50
+ - id: "3"
51
+ maxPower: "300.0"
52
+ mclkRange: 400Mhz - 1600Mhz
53
+ model: "0x740f"
54
+ performanceLevel: auto
55
+ sclkRange: 500Mhz - 1700Mhz
56
+ series: Instinct MI210
57
+ sku: D67301V
58
+ uniqueId: "0x62b25d667064a7ff"
59
+ vbiosVersion: 113-D67301V-073
60
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
61
+ - id: "6"
62
+ maxPower: "300.0"
63
+ mclkRange: 400Mhz - 1600Mhz
64
+ model: "0x740f"
65
+ performanceLevel: auto
66
+ sclkRange: 500Mhz - 1700Mhz
67
+ series: Instinct MI210
68
+ sku: D67301V
69
+ uniqueId: "0xbdb93fac1aa97618"
70
+ vbiosVersion: 113-D67301V-073
71
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
72
+ - id: "7"
73
+ maxPower: "300.0"
74
+ mclkRange: 400Mhz - 1600Mhz
75
+ model: "0x740f"
76
+ performanceLevel: auto
77
+ sclkRange: 500Mhz - 1700Mhz
78
+ series: Instinct MI210
79
+ sku: D67301V
80
+ uniqueId: "0x850c5a1ff5d005be"
81
+ vbiosVersion: 113-D67301V-073
82
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
83
+ - id: "2"
84
+ maxPower: "300.0"
85
+ mclkRange: 400Mhz - 1600Mhz
86
+ model: "0x740f"
87
+ performanceLevel: auto
88
+ sclkRange: 500Mhz - 1700Mhz
89
+ series: Instinct MI210
90
+ sku: D67301V
91
+ uniqueId: "0x7a3e2781f4182456"
92
+ vbiosVersion: 113-D67301V-073
93
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
94
+ - id: "4"
95
+ maxPower: "300.0"
96
+ mclkRange: 400Mhz - 1600Mhz
97
+ model: "0x740f"
98
+ performanceLevel: auto
99
+ sclkRange: 500Mhz - 1700Mhz
100
+ series: Instinct MI210
101
+ sku: D67301V
102
+ uniqueId: "0x4c21a2ad76408df6"
103
+ vbiosVersion: 113-D67301V-073
104
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
105
+ - id: "0"
106
+ maxPower: "300.0"
107
+ mclkRange: 400Mhz - 1600Mhz
108
+ model: "0x740f"
109
+ performanceLevel: auto
110
+ sclkRange: 500Mhz - 1700Mhz
111
+ series: Instinct MI210
112
+ sku: D67301V
113
+ uniqueId: "0xa8d2c33980704bf2"
114
+ vbiosVersion: 113-D67301V-073
115
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
116
+ - id: "1"
117
+ maxPower: "300.0"
118
+ mclkRange: 400Mhz - 1600Mhz
119
+ model: "0x740f"
120
+ performanceLevel: auto
121
+ sclkRange: 500Mhz - 1700Mhz
122
+ series: Instinct MI210
123
+ sku: D67301V
124
+ uniqueId: "0xd13265721a117b54"
125
+ vbiosVersion: 113-D67301V-073
126
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
127
+ - id: "5"
128
+ maxPower: "300.0"
129
+ mclkRange: 400Mhz - 1600Mhz
130
+ model: "0x740f"
131
+ performanceLevel: auto
132
+ sclkRange: 500Mhz - 1700Mhz
133
+ series: Instinct MI210
134
+ sku: D67301V
135
+ uniqueId: "0x19ee82506963794b"
136
+ vbiosVersion: 113-D67301V-073
137
+ vendor: Advanced Micro Devices, Inc. [AMD/ATI]
138
+ gpu_count: 8
139
+ host: auh7-1b-gpu-316
140
+ memory:
141
+ total: "2434606952448"
142
+ os: Linux-5.15.0-140-generic-x86_64-with-glibc2.35
143
+ program: /vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py
144
+ python: CPython 3.10.18
145
+ root: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_l1_regression/wandb
146
+ slurm:
147
+ cluster_name: ai-04r
148
+ conf: /etc/slurm/slurm.conf
149
+ cpus_on_node: "128"
150
+ gpus_on_node: "8"
151
+ gtids: "0"
152
+ job_account: faculty-acc
153
+ job_cpus_per_node: "128"
154
+ job_end_time: "1760200645"
155
+ job_gid: "2000"
156
+ job_gpus: 0,1,2,3,4,5,6,7
157
+ job_id: "2282"
158
+ job_name: mh_cleandesk_l1_regression
159
+ job_nodelist: auh7-1b-gpu-316
160
+ job_num_nodes: "1"
161
+ job_partition: faculty
162
+ job_qos: xdqos
163
+ job_start_time: "1759941445"
164
+ job_uid: "2013"
165
+ job_user: xiaodan
166
+ jobid: "2282"
167
+ localid: "0"
168
+ nnodes: "1"
169
+ nodeid: "0"
170
+ nodelist: auh7-1b-gpu-316
171
+ nprocs: "1"
172
+ ntasks: "1"
173
+ ntasks_per_node: "1"
174
+ oom_kill_step: "0"
175
+ prio_process: "0"
176
+ procid: "0"
177
+ submit_dir: /vast/users/xiaodan/zhangjian/A1/launch_scripts
178
+ submit_host: auh-1b-cpu-login-001
179
+ task_pid: "1925818"
180
+ tasks_per_node: "1"
181
+ topology_addr: auh7-1b-gpu-316
182
+ topology_addr_pattern: node
183
+ startedAt: "2025-10-08T16:38:31.938958Z"
184
+ writerId: 9zghejqbkg668a368vduhoyzhbv4wgq6
185
+ m: []
186
+ python_version: 3.10.18
187
+ t:
188
+ "1":
189
+ - 1
190
+ - 2
191
+ - 3
192
+ - 5
193
+ - 11
194
+ - 41
195
+ - 49
196
+ - 51
197
+ - 53
198
+ - 63
199
+ - 71
200
+ - 83
201
+ - 95
202
+ - 105
203
+ "2":
204
+ - 1
205
+ - 2
206
+ - 3
207
+ - 5
208
+ - 11
209
+ - 41
210
+ - 49
211
+ - 51
212
+ - 53
213
+ - 63
214
+ - 71
215
+ - 83
216
+ - 95
217
+ - 105
218
+ "3":
219
+ - 2
220
+ - 13
221
+ - 15
222
+ - 16
223
+ - 61
224
+ "4": 3.10.18
225
+ "5": 0.21.4
226
+ "6": 4.56.1
227
+ "10":
228
+ - 19
229
+ "12": 0.21.4
230
+ "13": linux-x86_64
231
+ activation_checkpointing:
232
+ value: whole_layer
233
+ allow_resume:
234
+ value: false
235
+ batch_divisor:
236
+ value: global_batch
237
+ canceled_check_interval:
238
+ value: 50
239
+ checkpoint_dir:
240
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
241
+ compile:
242
+ value: null
243
+ console_log_interval:
244
+ value: 1
245
+ data:
246
+ value:
247
+ dataset: vla_dataset_realworld
248
+ drop_last: true
249
+ for_inference: false
250
+ lerobot_episode_index_end: null
251
+ lerobot_episode_index_start: null
252
+ mixture: null
253
+ multi_modal: torch
254
+ num_workers: 0
255
+ pad: to_max
256
+ persistent_workers: false
257
+ pin_memory: true
258
+ prefetch_factor: null
259
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
260
+ rlds_dataset_name: libero_4_task_suites_no_noops
261
+ rlds_read_threads: 8
262
+ rlds_shuffle_buffer_size: 100000
263
+ rlds_traj_threads: 8
264
+ root_size_mixture: null
265
+ seed: 95818
266
+ sequence_length: 1600
267
+ shuffle: true
268
+ shuffle_messages: false
269
+ split: train
270
+ timeout: 0
271
+ use_proprio: true
272
+ use_wrist_image: true
273
+ device_eval_batch_size:
274
+ value: 4
275
+ device_inf_eval_batch_size:
276
+ value: 16
277
+ device_train_batch_size:
278
+ value: 15
279
+ device_train_grad_accum:
280
+ value: 0
281
+ device_train_microbatch_size:
282
+ value: 16
283
+ dry_run:
284
+ value: false
285
+ early_exit:
286
+ value: false
287
+ epoch:
288
+ value: null
289
+ eval_interval:
290
+ value: 0
291
+ eval_on_load:
292
+ value: false
293
+ eval_subset_num_batches:
294
+ value: -1
295
+ evaluators:
296
+ value:
297
+ - data:
298
+ dataset: vla_dataset_realworld
299
+ drop_last: true
300
+ for_inference: false
301
+ lerobot_episode_index_end: 765
302
+ lerobot_episode_index_start: 353
303
+ mixture: null
304
+ multi_modal: torch
305
+ num_workers: 0
306
+ pad: to_max
307
+ persistent_workers: true
308
+ pin_memory: true
309
+ prefetch_factor: null
310
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
311
+ rlds_dataset_name: libero_4_task_suites_no_noops
312
+ rlds_read_threads: 8
313
+ rlds_shuffle_buffer_size: 256000
314
+ rlds_traj_threads: 8
315
+ root_size_mixture: null
316
+ seed: null
317
+ sequence_length: 1600
318
+ shuffle: false
319
+ shuffle_messages: false
320
+ split: validation
321
+ timeout: 0
322
+ use_proprio: true
323
+ use_wrist_image: true
324
+ device_eval_batch_size: null
325
+ eval_name: null
326
+ label: val
327
+ max_examples: null
328
+ max_new_tokens: 448
329
+ mm_evaluator: null
330
+ save_dir: null
331
+ save_to_checkpoint_dir: false
332
+ skip_if_metrics_cached: true
333
+ subset_num_batches: 64
334
+ extra_steps_after_cancel:
335
+ value: 10
336
+ fast_forward_batches:
337
+ value: null
338
+ force_save_unsharded:
339
+ value: false
340
+ fsdp:
341
+ value:
342
+ hybrid_sharding_num_model_replicas: null
343
+ precision: float
344
+ sharding_strategy: FULL_SHARD
345
+ use_orig_params: true
346
+ wrapping_strategy: by_block_and_size
347
+ ft_connector:
348
+ value: false
349
+ ft_embedding:
350
+ value: lm_head
351
+ ft_llm:
352
+ value: true
353
+ ft_vit:
354
+ value: false
355
+ fused_loss:
356
+ value: null
357
+ gen1_gc_interval:
358
+ value: 1
359
+ global_train_batch_size:
360
+ value: 126
361
+ inf_eval_interval:
362
+ value: -1
363
+ inf_eval_subset_num_batches:
364
+ value: -1
365
+ inf_evaluators:
366
+ value: []
367
+ initial_model_checkpoint:
368
+ value: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
369
+ keep_lr_on_load:
370
+ value: true
371
+ load_model_config:
372
+ value: null
373
+ load_path:
374
+ value: null
375
+ load_path_sharded_checkpointer:
376
+ value: null
377
+ lora:
378
+ value: false
379
+ lora_connector:
380
+ value: false
381
+ lora_llm:
382
+ value: false
383
+ lora_rank:
384
+ value: 8
385
+ lora_vit:
386
+ value: false
387
+ max_duration:
388
+ value: 500000
389
+ max_grad_norm:
390
+ value: 1
391
+ max_grad_norm_ratio:
392
+ value: null
393
+ model:
394
+ value:
395
+ action_dim: 7
396
+ action_head: l1_regression
397
+ action_head_dit_depth: 28
398
+ action_head_dit_hidden_size: 1152
399
+ action_head_dit_num_heads: 16
400
+ action_tokenizer:
401
+ identifier: physical-intelligence/fast
402
+ tokenizer_dir: null
403
+ action_use_left_eef: true
404
+ action_use_mobile_base: false
405
+ activation_type: swiglu
406
+ additional_vocab_size: 128
407
+ always_start_with_space: true
408
+ attention_dropout: 0
409
+ attention_layer_norm: false
410
+ attention_layer_norm_with_affine: true
411
+ attention_type: sdpa
412
+ bias_for_layer_norm: null
413
+ block_group_size: 1
414
+ block_type: sequential
415
+ clip_qkv: null
416
+ crop_mode: overlap-and-resize-c2
417
+ d_model: 3584
418
+ default_inference_len: 65
419
+ embedding_dropout: 0
420
+ embedding_size: 152064
421
+ ff_out_size: null
422
+ fix_image_padding: true
423
+ float32_attention: true
424
+ head_dim: null
425
+ horizon: 8
426
+ image_feature_dropout: 0
427
+ image_padding_embed: pad_and_partial_pad
428
+ image_pooling_2d: attention_meanq
429
+ image_pooling_h: 2
430
+ image_pooling_w: 2
431
+ image_projector: mlp
432
+ include_bias: false
433
+ init_cutoff_factor: null
434
+ init_device: null
435
+ init_fn: normal
436
+ init_std: 0.02
437
+ initializer_range: 0.02
438
+ layer_norm_eps: 1e-06
439
+ layer_norm_type: rms
440
+ layer_norm_with_affine: true
441
+ llm_causal_attention: false
442
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
443
+ low_cpu_fsdp: true
444
+ max_crops: 12
445
+ max_position_embeddings: null
446
+ max_sequence_length: 4096
447
+ message_formatting: role
448
+ mlp_hidden_size: 37888
449
+ mlp_ratio: 4
450
+ moe_capacity_factor: 1.25
451
+ moe_dropless: true
452
+ moe_interleave: false
453
+ moe_lbl_in_fp32: false
454
+ moe_log_expert_assignment: false
455
+ moe_loss_weight: 0.1
456
+ moe_mlp_impl: sparse
457
+ moe_num_experts: 8
458
+ moe_shared_expert: false
459
+ moe_top_k: 2
460
+ moe_zloss_weight: null
461
+ multi_annotation_weighting: root_subsegments
462
+ n_heads: 28
463
+ n_kv_heads: 4
464
+ n_layers: 28
465
+ new_embedding_init_range: 0.02
466
+ norm_after: false
467
+ normalize_input_embeds: false
468
+ num_diffusion_inference_steps: 30
469
+ num_diffusion_steps: 1000
470
+ overlap_margins:
471
+ - 4
472
+ - 4
473
+ pad_tokenizer: true
474
+ pad_value: 0
475
+ precision: amp_bf16
476
+ prompt_type: uber_model
477
+ qkv_bias: true
478
+ residual_dropout: 0.1
479
+ response_residual_dropout: 0
480
+ rope: true
481
+ rope_full_precision: true
482
+ rope_theta: 1e+06
483
+ scale_logits: false
484
+ system_prompt_kind: demo_or_style
485
+ tokenizer:
486
+ identifier: Qwen/Qwen2-7B
487
+ tokenizer_dir: null
488
+ use_col_tokens: true
489
+ use_position_ids: true
490
+ use_proprio: true
491
+ vision_backbone:
492
+ attention_dropout: 0
493
+ fsdp_wrap: false
494
+ image_default_input_size:
495
+ - 336
496
+ - 336
497
+ image_dropout_rate: 0
498
+ image_emb_dim: 1024
499
+ image_head_dim: 64
500
+ image_mlp_activations: quick_gelu
501
+ image_mlp_dim: 4096
502
+ image_model_type: openai
503
+ image_norm_eps: 1e-05
504
+ image_num_heads: 16
505
+ image_num_key_value_heads: 16
506
+ image_num_layers: 23
507
+ image_num_pos: 577
508
+ image_patch_size: 14
509
+ image_pos_patch_size: 14
510
+ initializer_range: 0.02
511
+ residual_dropout: 0
512
+ resize_mode: default
513
+ vit_layers:
514
+ - -2
515
+ - -9
516
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
517
+ vocab_size: 152064
518
+ weight_tying: false
519
+ multi_component_grad_norm:
520
+ value: true
521
+ no_pre_train_checkpoint:
522
+ value: true
523
+ optimizer:
524
+ value:
525
+ betas:
526
+ - 0.9
527
+ - 0.95
528
+ connector_betas:
529
+ - 0.9
530
+ - 0.95
531
+ connector_eps: 1e-06
532
+ connector_learning_rate: 0.0002
533
+ connector_weight_decay: 0
534
+ eps: 1e-05
535
+ learning_rate: 0.0001
536
+ llm_betas:
537
+ - 0.9
538
+ - 0.95
539
+ llm_eps: 1e-06
540
+ llm_learning_rate: 5e-05
541
+ llm_weight_decay: 0
542
+ metrics_log_interval: 20
543
+ name: adamw
544
+ vit_betas:
545
+ - 0.9
546
+ - 0.95
547
+ vit_eps: 1e-06
548
+ vit_learning_rate: 6e-06
549
+ vit_weight_decay: 0
550
+ weight_decay: 0.01
551
+ precision:
552
+ value: amp_bf16
553
+ python_profiling:
554
+ value: false
555
+ remote_save_folder:
556
+ value: null
557
+ reset_dataloader_state:
558
+ value: false
559
+ reset_optimizer_state:
560
+ value: false
561
+ reset_trainer_state:
562
+ value: false
563
+ restore_dataloader:
564
+ value: true
565
+ run_name:
566
+ value: cleandesk_20251008_163754
567
+ save_dataloader_state:
568
+ value: false
569
+ save_folder:
570
+ value: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_l1_regression
571
+ save_interval:
572
+ value: 500
573
+ save_interval_action_head:
574
+ value: 500
575
+ save_interval_ephemeral:
576
+ value: null
577
+ save_interval_unsharded:
578
+ value: 500
579
+ save_num_action_head_checkpoints_to_keep:
580
+ value: 2
581
+ save_num_checkpoints_to_keep:
582
+ value: 1
583
+ save_num_unsharded_checkpoints_to_keep:
584
+ value: 1
585
+ save_overwrite:
586
+ value: true
587
+ scheduler:
588
+ value:
589
+ alpha_f: 0.1
590
+ connector_t_warmup: 200
591
+ grad_clip_warmup_factor: null
592
+ grad_clip_warmup_steps: null
593
+ llm_t_warmup: 2000
594
+ name: multimodal
595
+ t_max: null
596
+ t_warmup: 100
597
+ units: steps
598
+ vit_t_warmup: 2000
599
+ warmup_min_lr: 0
600
+ seed:
601
+ value: 6198
602
+ sharded_checkpointer:
603
+ value: torch_legacy
604
+ softmax_auxiliary_loss:
605
+ value: true
606
+ softmax_auxiliary_loss_scale:
607
+ value: 0.0001
608
+ speed_monitor:
609
+ value:
610
+ gpu_flops_available: null
611
+ window_size: 20
612
+ stop_after:
613
+ value: null
614
+ stop_at:
615
+ value: 500000
616
+ time_limit:
617
+ value: null
618
+ torch_profiling:
619
+ value: false
620
+ train_exit_random_layer:
621
+ value: false
622
+ use_lora:
623
+ value: false
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/output.log ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb: Detected [openai] in use.
2
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
4
+ 10/08 [16:38:34] WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
5
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
6
+ warnings.warn( # warn only once
7
+
8
+ ****** vla_cfg: {'datasets': {'rlds': {'name': None, 'path': None, 'weight': 1.0, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [['/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk', 8, 'bounds']], 'open-source-real-world': {'rlds': {'name': 'a1_real_world', 'path': '/vast/users/xiaodan/zhangjian/datasets/OXE', 'weight': 8, 'action_proprio_normalization_type': 'bounds_q99', 'image_augmentation': False}, 'lerobot': [], 'agibot': {'path': None, 'weight': 8, 'action_proprio_normalization_type': None}}}, 'model': {'action_head': {'action_dim': 16, 'proprio_dim': 16, 'num_actions_chunk': 8, 'action_tokens_mapping': {'left_end_effector': 8, 'right_end_effector': 8}, 'use_left_eef': True, 'use_mobile_base': False}}}
9
+ ****** Skip RLDS main; path not found: None
10
+ ****** start build LeRobot main...
11
+ build_tokenizer, cache_dir None tokenizer_dir None
12
+ 10/08 [16:38:35] INFO | >> Padding tokenizer with 418 tokens tokenizer.py:130
13
+ 10/08 [16:38:36] INFO | >> Loading train dataset: vla_dataset_realworld/train __init__.py:436
14
+ ****** before LeRobot dataset...
15
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/dataset/realworld/Lerobot_CleanDesk
16
+ ****** length of the dataset: 72641
17
+ 10/08 [16:38:42] INFO | >> build_rlds_train_dataset: Loading train dataset: vla_dataset_realworld/train __init__.py:519
18
+ ****** Import RLDSBatchTransform, RLDSDataset successfully.
19
+ ****** before RLDS dataset...
20
+ ****** data_config.rlds_dataset_name: a1_real_world
21
+ ****** data_config.rlds_data_root_dir: /vast/users/xiaodan/zhangjian/datasets/OXE
22
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc287856f0>
23
+ INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
24
+ 100%|██████████| 87212/87212 [00:43<00:00, 2004.76it/s]
25
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc2271f640>
26
+ 10/08 [16:39:30] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
27
+ /vast/users/xiaodan/zhangjian/datasets/OXE/jaco_play/0.1.0/dataset_statistics_e081d4716a3da95df91c79d661ae59fa26a43da49db4bf8d716b622b56
28
+ 3b0ea3.json.
29
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc27894400>
30
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
31
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_cable_routing/0.1.0/dataset_statistics_08cb4c5b7c5e6c035fc84ea85b2d54c0c46ad608a8763
32
+ 4ebb18374088d23cd76.json.
33
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc2050d510>
34
+ 10/08 [16:39:31] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
35
+ /vast/users/xiaodan/zhangjian/datasets/OXE/viola/0.1.0/dataset_statistics_2415d8f7de73c8761fedd7c2a9590667fb0d3fdd26664bf4c100222e5cdb89
36
+ b9.json.
37
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc2769a050>
38
+ INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
39
+ 100%|██████████| 1000/1000 [00:05<00:00, 184.79it/s]
40
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc27703af0>
41
+ 10/08 [16:39:38] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
42
+ /vast/users/xiaodan/zhangjian/datasets/OXE/austin_buds_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_ccecde24cc01793b221
43
+ 4eb0c4c5d7cc0e3ccc623db99bd892b83552b20decfb7.json.
44
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc227130d0>
45
+ 10/08 [16:39:39] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
46
+ 100%|██████████| 456/456 [00:24<00:00, 18.55it/s]
47
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc107ec1c0>
48
+ 10/08 [16:40:06] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
49
+ 100%|██████████| 5100/5100 [00:57<00:00, 88.36it/s]
50
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc2051ccd0>
51
+ 10/08 [16:41:08] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
52
+ 100%|██████████| 240/240 [00:08<00:00, 29.80it/s]
53
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc00979a20>
54
+ 10/08 [16:41:20] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
55
+ /vast/users/xiaodan/zhangjian/datasets/OXE/austin_sirius_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_cb2e0273f80029a19
56
+ dc3dbb3a3a4118a5598e7bff3ff0245891255825b04b42a.json.
57
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf9b9c7f0>
58
+ INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
59
+ /vast/users/xiaodan/zhangjian/datasets/OXE/dlr_edan_shared_control_converted_externally_to_rlds/0.1.0/dataset_statistics_b8984563fc3e7ea
60
+ c0803c667ef58c9deaf2e747683568306ea1d83505d532a76.json.
61
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc101d3430>
62
+ 10/08 [16:41:21] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
63
+ 100%|██████████| 1500/1500 [00:02<00:00, 664.65it/s]
64
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc008400d0>
65
+ 10/08 [16:41:24] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
66
+ /vast/users/xiaodan/zhangjian/datasets/OXE/berkeley_fanuc_manipulation/0.1.0/dataset_statistics_a98d349d0364668095ea3ca38c6785e94f35e5e5
67
+ 8e234c88fac83775a923b0d0.json.
68
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf95ee740>
69
+ 10/08 [16:41:25] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
70
+ 100%|██████████| 43264/43264 [00:32<00:00, 1321.74it/s]
71
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc0092a200>
72
+ 10/08 [16:42:02] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
73
+ /vast/users/xiaodan/zhangjian/datasets/OXE/roboturk/0.1.0/dataset_statistics_3aa821e17a2937f941d4102cfadcb1154853cb45dcec07ccc66893b01f6
74
+ f1b40.json.
75
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc1013b7c0>
76
+ 10/08 [16:42:03] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
77
+ 100%|██████████| 1003/1003 [00:00<00:00, 1714.08it/s]
78
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf9480880>
79
+ 10/08 [16:42:04] INFO | >> [*] Loading existing dataset statistics from data_utils.py:200
80
+ /vast/users/xiaodan/zhangjian/datasets/OXE/ucsd_kitchen_dataset_converted_externally_to_rlds/0.1.0/dataset_statistics_1f1a5f310a2d5a6edc
81
+ 0e217370e135c8c8598290f11f57025037adcb0d033926.json.
82
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbfdb13af0>
83
+ 10/08 [16:42:05] INFO | >> [*] Computing dataset statistics. This may take a bit, but should only need to happen once. data_utils.py:227
84
+ 100%|██████████| 631/631 [00:00<00:00, 1276.86it/s]
85
+
86
+ ######################################################################################
87
+ # Loading the following 18 datasets (incl. sampling weight): #
88
+ # fractal20220817_data: ====================================================0.529250 #
89
+ # jaco_play: ===============================================================0.010898 #
90
+ # berkeley_cable_routing: ==================================================0.005916 #
91
+ # viola: ===================================================================0.021337 #
92
+ # berkeley_autolab_ur5: ====================================================0.027379 #
93
+ # austin_buds_dataset_converted_externally_to_rlds: ========================0.004768 #
94
+ # nyu_franka_play_dataset_converted_externally_to_rlds: ====================0.018817 #
95
+ # furniture_bench_dataset_converted_externally_to_rlds: ====================0.055185 #
96
+ # austin_sailor_dataset_converted_externally_to_rlds: ======================0.049354 #
97
+ # austin_sirius_dataset_converted_externally_to_rlds: ======================0.039129 #
98
+ # dlr_edan_shared_control_converted_externally_to_rlds: ====================0.001248 #
99
+ # utaustin_mutex: ==========================================================0.050583 #
100
+ # berkeley_fanuc_manipulation: =============================================0.017504 #
101
+ # bc_z: ====================================================================0.168166 #
102
+ # roboturk: ================================================================0.000131 #
103
+ # toto: ====================================================================0.000228 #
104
+ # ucsd_kitchen_dataset_converted_externally_to_rlds: =======================0.000006 #
105
+ # iamlab_cmu_pickup_insert_converted_externally_to_rlds: ===================0.000102 #
106
+ ######################################################################################
107
+
108
+ 10/08 [16:42:06] INFO | >> [*] Threads per Dataset: [14 1 1 1 1 1 1 2 1 1 1 1 1 5 1 1 1 1] dataset.py:563
109
+ INFO | >> [*] Reads per Dataset: [14 1 1 1 1 1 1 2 1 1 1 1 1 5 1 1 1 1] dataset.py:564
110
+ INFO | >> [*] Constructing datasets... dataset.py:567
111
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc00840040>
112
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc18089900>
113
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc27699c90>
114
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf9b33400>
115
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc27701660>
116
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc1003acb0>
117
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbfff5f9a0>
118
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc00373ac0>
119
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbffe0ad10>
120
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbfdd10bb0>
121
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf893b7c0>
122
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf81a7370>
123
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf80c0760>
124
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf3fd0bb0>
125
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf3e98a30>
126
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc27739960>
127
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efbf947d4e0>
128
+ 214************** <tensorflow_datasets.core.read_only_builder.ReadOnlyBuilder object at 0x7efc107bc850>
129
+ 10/08 [16:42:11] INFO | >> [*] Applying frame transforms on dataset... dataset.py:607
130
+ ****** after RLDSDataset initialization!
131
+ ****** length of the dataset: 7154275
132
+ ****** Build rlds train dataset: IterableDatasetWrapper successfully.
133
+ ****** path: None
134
+ ****** Skip AgiBotWorld-Alpha open-source-real-world; path not found: None
135
+ ****** After build vla train dataset...
136
+ ****** iterable_sources: [<olmo.data.dataset.IterableDatasetWrapper object at 0x7efc225c0040>, <olmo.data.dataset.IterableDatasetWrapper object at 0x7efc22713f70>]
137
+ ****** Before build mixed iterable dataset...
138
+ ****** Build vla train dataloader successfully!
139
+ ************************* Build train_dataloader successful!
140
+ ************************* Before build_inf_evaluators
141
+ WARNING | >> /vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No warnings.py:109
142
+ device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
143
+ warnings.warn( # warn only once
144
+
145
+ ************************* Build evaluators successful!
146
+ ************************* Early exit flags: early_exit=False
147
+ PROPRIO_DIM 16 does not match ACTION_DIM 16 for AffordVLA
148
+ ************************* Initialize model successful!
149
+ ***** state_dict_path: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924/model.pt
150
+ ***** Load checkpoint successful!
151
+ missing keys: ['action_head.model.layer_norm1.weight', 'action_head.model.layer_norm1.bias', 'action_head.model.fc1.weight', 'action_head.model.fc1.bias', 'action_head.model.mlp_resnet_blocks.0.ffn.0.weight', 'action_head.model.mlp_resnet_blocks.0.ffn.0.bias', 'action_head.model.mlp_resnet_blocks.0.ffn.1.weight', 'action_head.model.mlp_resnet_blocks.0.ffn.1.bias', 'action_head.model.mlp_resnet_blocks.1.ffn.0.weight', 'action_head.model.mlp_resnet_blocks.1.ffn.0.bias', 'action_head.model.mlp_resnet_blocks.1.ffn.1.weight', 'action_head.model.mlp_resnet_blocks.1.ffn.1.bias', 'action_head.model.layer_norm2.weight', 'action_head.model.layer_norm2.bias', 'action_head.model.fc2.weight', 'action_head.model.fc2.bias', 'proprio_projector.fc1.weight', 'proprio_projector.fc1.bias', 'proprio_projector.fc2.weight', 'proprio_projector.fc2.bias']
152
+ unexpected keys: []
153
+ ************************* Initialize model successful!
154
+ ************************* Before FSDP model wrapping
155
+ ************************* FSDP model wrapping successful!
156
+ ************************* Before building optimizer and scheduler
157
+ 10/08 [16:43:46] INFO | >> Constructing optimizer with 2 param groups optim.py:1283
158
+ **************************************************
159
+ After building optimizer and scheduler and model, before training, peak GPU memory (MB): 35614
160
+ ************************* VLATrainer initialized successfully!
161
+ ************************* Before trainer.fit()
162
+ Pre-train system metrics
163
+ System/Peak GPU Memory (MB)=35,614
164
+ 10/08 [16:43:56] WARNING | >> /vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py:200: UserWarning: To copy construct from a tensor, it is recommended to use warnings.py:109
165
+ sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
166
+ timestep_list = [torch.tensor(ex["timestep"], dtype=torch.int64) for ex in batch]
167
+
168
+ !!!Training failed: stack expects each tensor to be equal size, but got [] at entry 0 and [1] at entry 1
169
+ Traceback (most recent call last):
170
+ File "/vast/users/xiaodan/zhangjian/A1/scripts/train_for_action.py", line 593, in main
171
+ trainer.fit()
172
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/train.py", line 2284, in fit
173
+ for batch in self.train_loader:
174
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
175
+ data = self._next_data()
176
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 790, in _next_data
177
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
178
+ File "/vast/users/xiaodan/miniconda3/envs/a1/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 43, in fetch
179
+ return self.collate_fn(data)
180
+ File "/vast/users/xiaodan/zhangjian/A1/olmo/data/collator.py", line 201, in __call__
181
+ out['timestep'] = torch.stack(timestep_list, dim=0)
182
+ RuntimeError: stack expects each tensor to be equal size, but got [] at entry 0 and [1] at entry 1
183
+ wandb: WARNING The `quiet` argument to `wandb.run.finish()` is deprecated, use `wandb.Settings(quiet=...)` to set this instead.
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/requirements.txt ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai2-molmo==0.0.0
2
+ astunparse==1.6.3
3
+ flatbuffers==25.2.10
4
+ gast==0.6.0
5
+ google-pasta==0.2.0
6
+ h5py==3.14.0
7
+ libclang==18.1.1
8
+ Markdown==3.9
9
+ namex==0.1.0
10
+ opt_einsum==3.4.0
11
+ optree==0.17.0
12
+ tensorboard-data-server==0.7.2
13
+ tensorflow-io-gcs-filesystem==0.37.1
14
+ termcolor==3.1.0
15
+ Werkzeug==3.1.3
16
+ Brotli==1.1.0
17
+ Farama-Notifications==0.0.4
18
+ MarkupSafe==2.1.5
19
+ PyYAML==6.0.2
20
+ absl-py==2.3.1
21
+ accelerate==1.10.1
22
+ ai2-molmo==0.0.0
23
+ aiofiles==24.1.0
24
+ aiohappyeyeballs==2.6.1
25
+ aiohttp==3.12.15
26
+ aiosignal==1.4.0
27
+ annotated-types==0.7.0
28
+ antlr4-python3-runtime==4.9.3
29
+ anyio==4.10.0
30
+ array_record==0.8.1
31
+ async-timeout==5.0.1
32
+ attrs==25.3.0
33
+ av==15.1.0
34
+ backports.tarfile==1.2.0
35
+ beaker-gantry==3.2.0
36
+ beaker-py==2.5.0
37
+ black==23.12.1
38
+ blinker==1.9.0
39
+ boltons==25.0.0
40
+ boto3==1.40.33
41
+ botocore==1.40.33
42
+ build==1.3.0
43
+ cached_path==1.7.3
44
+ cached-property==2.0.1
45
+ cachetools==5.5.2
46
+ certifi==2025.8.3
47
+ cffi==2.0.0
48
+ charset-normalizer==3.4.3
49
+ click==8.2.1
50
+ click-help-colors==0.9.4
51
+ click-option-group==0.5.7
52
+ cloudpickle==3.1.1
53
+ cmake==4.1.0
54
+ contourpy==1.3.2
55
+ cryptography==46.0.1
56
+ cycler==0.12.1
57
+ dataclass-extensions==0.2.3
58
+ datasets==3.6.0
59
+ decorator==5.2.1
60
+ deepdiff==8.6.1
61
+ diffusers==0.35.1
62
+ dill==0.3.8
63
+ distro==1.9.0
64
+ dlimp==0.0.1
65
+ dm-tree==0.1.9
66
+ docutils==0.22.1
67
+ draccus==0.10.0
68
+ editdistance==0.8.1
69
+ einops==0.8.1
70
+ einops-exts==0.0.4
71
+ et_xmlfile==2.0.0
72
+ etils==1.13.0
73
+ evdev==1.9.2
74
+ exceptiongroup==1.3.0
75
+ face==24.0.0
76
+ fastapi==0.116.2
77
+ ffmpy==0.6.1
78
+ fiddle==0.3.0
79
+ filelock==3.13.1
80
+ Flask==3.1.2
81
+ fonttools==4.60.0
82
+ frozenlist==1.7.0
83
+ fsspec==2023.9.2
84
+ ftfy==6.3.1
85
+ gcsfs==2023.9.2
86
+ gitdb==4.0.12
87
+ GitPython==3.1.45
88
+ glom==24.11.0
89
+ google-api-core==2.25.1
90
+ google-auth==2.40.3
91
+ google-auth-oauthlib==1.2.2
92
+ google-cloud-core==2.4.3
93
+ google-cloud-storage==2.19.0
94
+ google-crc32c==1.7.1
95
+ google-resumable-media==2.7.2
96
+ googleapis-common-protos==1.70.0
97
+ gradio==5.46.0
98
+ gradio_client==1.13.0
99
+ graphviz==0.21
100
+ groovy==0.1.2
101
+ grpcio==1.75.0
102
+ gymnasium==0.29.1
103
+ h11==0.16.0
104
+ hf_transfer==0.1.9
105
+ hf-xet==1.1.10
106
+ httpcore==1.0.9
107
+ httpx==0.28.1
108
+ huggingface-hub==0.35.0
109
+ id==1.5.0
110
+ idna==3.10
111
+ imageio==2.37.0
112
+ imageio-ffmpeg==0.6.0
113
+ importlib_metadata==8.7.0
114
+ importlib_resources==6.5.2
115
+ iniconfig==2.1.0
116
+ inquirerpy==0.3.4
117
+ isort==5.12.0
118
+ itsdangerous==2.2.0
119
+ jaraco.classes==3.4.0
120
+ jaraco.context==6.0.1
121
+ jaraco.functools==4.3.0
122
+ jeepney==0.9.0
123
+ Jinja2==3.1.4
124
+ jiter==0.11.0
125
+ jmespath==1.0.1
126
+ joblib==1.5.2
127
+ jsonlines==4.0.0
128
+ keras==2.15.0
129
+ keyring==25.6.0
130
+ kiwisolver==1.4.9
131
+ latex2sympy2_extended==1.10.2
132
+ lerobot==0.3.4
133
+ Levenshtein==0.27.1
134
+ libcst==1.8.4
135
+ lightning-utilities==0.15.2
136
+ markdown-it-py==4.0.0
137
+ math-verify==0.8.0
138
+ matplotlib==3.10.6
139
+ mdurl==0.1.2
140
+ mergedeep==1.3.4
141
+ ml-dtypes==0.2.0
142
+ ml_dtypes==0.5.3
143
+ more-itertools==10.8.0
144
+ mpmath==1.3.0
145
+ msgspec==0.19.0
146
+ multidict==6.6.4
147
+ multiprocess==0.70.16
148
+ mypy==1.3.0
149
+ mypy_extensions==1.1.0
150
+ necessary==0.4.3
151
+ networkx==3.3
152
+ nh3==0.3.0
153
+ nltk==3.9.1
154
+ numpy==1.26.4
155
+ oauthlib==3.3.1
156
+ omegaconf==2.3.0
157
+ openai==1.108.0
158
+ opencv-python-headless==4.12.0.88
159
+ OpenEXR==3.4.0
160
+ openpyxl==3.1.5
161
+ orderly-set==5.5.0
162
+ orjson==3.11.3
163
+ packaging==25.0
164
+ pandas==2.3.2
165
+ pathspec==0.12.1
166
+ petname==2.6
167
+ pfzy==0.3.4
168
+ pillow==11.0.0
169
+ pip==25.2
170
+ platformdirs==4.4.0
171
+ pluggy==1.6.0
172
+ promise==2.3
173
+ prompt_toolkit==3.0.52
174
+ propcache==0.3.2
175
+ proto-plus==1.26.1
176
+ protobuf==4.21.12
177
+ protobuf==6.32.1
178
+ psutil==7.1.0
179
+ pyarrow==21.0.0
180
+ pyasn1==0.6.1
181
+ pyasn1_modules==0.4.2
182
+ pycparser==2.23
183
+ pydantic==2.11.9
184
+ pydantic_core==2.33.2
185
+ pydub==0.25.1
186
+ Pygments==2.19.2
187
+ pynput==1.8.1
188
+ pyparsing==3.2.4
189
+ pyproject_hooks==1.2.0
190
+ pyserial==3.5
191
+ pytest==8.4.2
192
+ pytest-sphinx==0.6.3
193
+ python-dateutil==2.9.0.post0
194
+ python-Levenshtein==0.27.1
195
+ python-multipart==0.0.20
196
+ python-xlib==0.33
197
+ pytorch-triton-rocm==3.4.0
198
+ pytz==2025.2
199
+ pyyaml-include==1.4.1
200
+ RapidFuzz==3.14.1
201
+ readme_renderer==44.0
202
+ regex==2025.9.1
203
+ requests==2.32.5
204
+ requests-oauthlib==2.0.0
205
+ requests-toolbelt==1.0.0
206
+ requirements-parser==0.13.0
207
+ rerun-sdk==0.22.1
208
+ rfc3986==2.0.0
209
+ rich==13.9.4
210
+ rsa==4.9.1
211
+ ruff==0.13.0
212
+ s3transfer==0.14.0
213
+ safehttpx==0.1.6
214
+ safetensors==0.6.2
215
+ scikit-learn==1.7.2
216
+ scipy==1.15.3
217
+ SecretStorage==3.4.0
218
+ semantic-version==2.10.0
219
+ sentencepiece==0.2.1
220
+ sentry-sdk==2.38.0
221
+ setuptools==78.1.1
222
+ shellingham==1.5.4
223
+ six==1.17.0
224
+ smart_open==7.3.1
225
+ smashed==0.21.5
226
+ smmap==5.0.2
227
+ sniffio==1.3.1
228
+ starlette==0.48.0
229
+ sympy==1.13.3
230
+ tensorboard==2.15.2
231
+ tensorboard==2.19.0
232
+ tensorflow==2.15.0
233
+ tensorflow-addons==0.23.0
234
+ tensorflow-datasets==4.9.3
235
+ tensorflow-estimator==2.15.0
236
+ tensorflow-graphics==2021.12.3
237
+ tensorflow-metadata==1.17.2
238
+ threadpoolctl==3.6.0
239
+ timm==1.0.19
240
+ tokenizers==0.22.0
241
+ toml==0.10.2
242
+ tomli==2.2.1
243
+ tomlkit==0.13.3
244
+ torch==2.8.0+rocm6.4
245
+ torchcodec==0.5
246
+ torchmetrics==1.8.2
247
+ torchvision==0.23.0+rocm6.4
248
+ tqdm==4.67.1
249
+ transformers==4.56.1
250
+ trimesh==4.8.2
251
+ trouting==0.3.3
252
+ twine==6.2.0
253
+ typeguard==2.13.3
254
+ typer==0.17.4
255
+ typing_extensions==4.15.0
256
+ typing-inspect==0.9.0
257
+ typing-inspection==0.4.1
258
+ tzdata==2025.2
259
+ urllib3==2.5.0
260
+ uvicorn==0.35.0
261
+ wandb==0.21.4
262
+ wcwidth==0.2.13
263
+ websockets==15.0.1
264
+ wheel==0.45.1
265
+ wrapt==1.14.2
266
+ xxhash==3.5.0
267
+ yarl==1.20.1
268
+ zipp==3.23.0
269
+ lerobot==0.3.4
270
+ minLoRA==0.1.0
271
+ autocommand==2.2.2
272
+ backports.tarfile==1.2.0
273
+ importlib_metadata==8.0.0
274
+ inflect==7.3.1
275
+ jaraco.collections==5.1.0
276
+ jaraco.context==5.3.0
277
+ jaraco.functools==4.0.1
278
+ jaraco.text==3.12.1
279
+ more-itertools==10.3.0
280
+ packaging==24.2
281
+ platformdirs==4.2.2
282
+ tomli==2.0.1
283
+ typeguard==4.3.0
284
+ typing_extensions==4.12.2
285
+ wheel==0.45.1
286
+ zipp==3.19.2
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/wandb-metadata.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-140-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.18",
4
+ "startedAt": "2025-10-08T16:38:31.938958Z",
5
+ "args": [
6
+ "qwen2_7b",
7
+ "save_folder=/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_l1_regression",
8
+ "--vision_backbone",
9
+ "openai",
10
+ "--action_head",
11
+ "l1_regression",
12
+ "--seq_len",
13
+ "1600",
14
+ "--ft_llm",
15
+ "--checkpoint",
16
+ "/vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924",
17
+ "--device_train_microbatch_size",
18
+ "16",
19
+ "--global_batch_size",
20
+ "126",
21
+ "--dataset",
22
+ "vla_dataset_realworld",
23
+ "--llm_learning_rate",
24
+ "5e-5",
25
+ "--wandb_entity",
26
+ "henryeap",
27
+ "--wandb_project",
28
+ "a1-realworld",
29
+ "--wandb_run_name",
30
+ "cleandesk",
31
+ "--real_world_vla_config_path",
32
+ "vla_config_realworld/vla_config_cleandesk.yaml",
33
+ "--save_overwrite"
34
+ ],
35
+ "program": "/vast/users/xiaodan/zhangjian/A1/launch_scripts/train_vla.py",
36
+ "codePath": "launch_scripts/train_vla.py",
37
+ "codePathLocal": "launch_scripts/train_vla.py",
38
+ "git": {
39
+ "remote": "https://github.com/Spatialtemporal-AI/A1.git",
40
+ "commit": "49712a42d21a8c739a16ba5eeaec4a0d7b29ab80"
41
+ },
42
+ "email": "ihenrykwok@outlook.com",
43
+ "root": "/vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/cleandesk_l1_regression/wandb",
44
+ "host": "auh7-1b-gpu-316",
45
+ "executable": "/vast/users/xiaodan/miniconda3/envs/a1/bin/python3.10",
46
+ "cpu_count": 64,
47
+ "cpu_count_logical": 128,
48
+ "gpu": "Instinct MI210",
49
+ "gpu_count": 8,
50
+ "disk": {
51
+ "/": {
52
+ "total": "470343073792",
53
+ "used": "50668195840"
54
+ }
55
+ },
56
+ "memory": {
57
+ "total": "2434606952448"
58
+ },
59
+ "gpu_amd": [
60
+ {
61
+ "id": "3",
62
+ "uniqueId": "0x62b25d667064a7ff",
63
+ "vbiosVersion": "113-D67301V-073",
64
+ "performanceLevel": "auto",
65
+ "maxPower": "300.0",
66
+ "series": "Instinct MI210",
67
+ "model": "0x740f",
68
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
69
+ "sku": "D67301V",
70
+ "sclkRange": "500Mhz - 1700Mhz",
71
+ "mclkRange": "400Mhz - 1600Mhz"
72
+ },
73
+ {
74
+ "id": "6",
75
+ "uniqueId": "0xbdb93fac1aa97618",
76
+ "vbiosVersion": "113-D67301V-073",
77
+ "performanceLevel": "auto",
78
+ "maxPower": "300.0",
79
+ "series": "Instinct MI210",
80
+ "model": "0x740f",
81
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
82
+ "sku": "D67301V",
83
+ "sclkRange": "500Mhz - 1700Mhz",
84
+ "mclkRange": "400Mhz - 1600Mhz"
85
+ },
86
+ {
87
+ "id": "7",
88
+ "uniqueId": "0x850c5a1ff5d005be",
89
+ "vbiosVersion": "113-D67301V-073",
90
+ "performanceLevel": "auto",
91
+ "maxPower": "300.0",
92
+ "series": "Instinct MI210",
93
+ "model": "0x740f",
94
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
95
+ "sku": "D67301V",
96
+ "sclkRange": "500Mhz - 1700Mhz",
97
+ "mclkRange": "400Mhz - 1600Mhz"
98
+ },
99
+ {
100
+ "id": "2",
101
+ "uniqueId": "0x7a3e2781f4182456",
102
+ "vbiosVersion": "113-D67301V-073",
103
+ "performanceLevel": "auto",
104
+ "maxPower": "300.0",
105
+ "series": "Instinct MI210",
106
+ "model": "0x740f",
107
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
108
+ "sku": "D67301V",
109
+ "sclkRange": "500Mhz - 1700Mhz",
110
+ "mclkRange": "400Mhz - 1600Mhz"
111
+ },
112
+ {
113
+ "id": "4",
114
+ "uniqueId": "0x4c21a2ad76408df6",
115
+ "vbiosVersion": "113-D67301V-073",
116
+ "performanceLevel": "auto",
117
+ "maxPower": "300.0",
118
+ "series": "Instinct MI210",
119
+ "model": "0x740f",
120
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
121
+ "sku": "D67301V",
122
+ "sclkRange": "500Mhz - 1700Mhz",
123
+ "mclkRange": "400Mhz - 1600Mhz"
124
+ },
125
+ {
126
+ "id": "0",
127
+ "uniqueId": "0xa8d2c33980704bf2",
128
+ "vbiosVersion": "113-D67301V-073",
129
+ "performanceLevel": "auto",
130
+ "maxPower": "300.0",
131
+ "series": "Instinct MI210",
132
+ "model": "0x740f",
133
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
134
+ "sku": "D67301V",
135
+ "sclkRange": "500Mhz - 1700Mhz",
136
+ "mclkRange": "400Mhz - 1600Mhz"
137
+ },
138
+ {
139
+ "id": "1",
140
+ "uniqueId": "0xd13265721a117b54",
141
+ "vbiosVersion": "113-D67301V-073",
142
+ "performanceLevel": "auto",
143
+ "maxPower": "300.0",
144
+ "series": "Instinct MI210",
145
+ "model": "0x740f",
146
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
147
+ "sku": "D67301V",
148
+ "sclkRange": "500Mhz - 1700Mhz",
149
+ "mclkRange": "400Mhz - 1600Mhz"
150
+ },
151
+ {
152
+ "id": "5",
153
+ "uniqueId": "0x19ee82506963794b",
154
+ "vbiosVersion": "113-D67301V-073",
155
+ "performanceLevel": "auto",
156
+ "maxPower": "300.0",
157
+ "series": "Instinct MI210",
158
+ "model": "0x740f",
159
+ "vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
160
+ "sku": "D67301V",
161
+ "sclkRange": "500Mhz - 1700Mhz",
162
+ "mclkRange": "400Mhz - 1600Mhz"
163
+ }
164
+ ],
165
+ "slurm": {
166
+ "cluster_name": "ai-04r",
167
+ "conf": "/etc/slurm/slurm.conf",
168
+ "cpus_on_node": "128",
169
+ "gpus_on_node": "8",
170
+ "gtids": "0",
171
+ "job_account": "faculty-acc",
172
+ "job_cpus_per_node": "128",
173
+ "job_end_time": "1760200645",
174
+ "job_gid": "2000",
175
+ "job_gpus": "0,1,2,3,4,5,6,7",
176
+ "job_id": "2282",
177
+ "job_name": "mh_cleandesk_l1_regression",
178
+ "job_nodelist": "auh7-1b-gpu-316",
179
+ "job_num_nodes": "1",
180
+ "job_partition": "faculty",
181
+ "job_qos": "xdqos",
182
+ "job_start_time": "1759941445",
183
+ "job_uid": "2013",
184
+ "job_user": "xiaodan",
185
+ "jobid": "2282",
186
+ "localid": "0",
187
+ "nnodes": "1",
188
+ "nodeid": "0",
189
+ "nodelist": "auh7-1b-gpu-316",
190
+ "nprocs": "1",
191
+ "ntasks": "1",
192
+ "ntasks_per_node": "1",
193
+ "oom_kill_step": "0",
194
+ "prio_process": "0",
195
+ "procid": "0",
196
+ "submit_dir": "/vast/users/xiaodan/zhangjian/A1/launch_scripts",
197
+ "submit_host": "auh-1b-cpu-login-001",
198
+ "task_pid": "1925818",
199
+ "tasks_per_node": "1",
200
+ "topology_addr": "auh7-1b-gpu-316",
201
+ "topology_addr_pattern": "node"
202
+ },
203
+ "writerId": "9zghejqbkg668a368vduhoyzhbv4wgq6"
204
+ }
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_step":0,"System/Peak GPU Memory (MB)":35614.78125,"_timestamp":1.7599418260827e+09,"_wandb":{"runtime":325},"_runtime":325.049768384}
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:32.182353704Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpl72x4viw/port-1926008.txt","pid":1926008,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-10-08T16:38:32.183512473Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1926008}
3
+ {"time":"2025-10-08T16:38:32.184652392Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1926008-1926185-2566029014/socket","Net":"unix"}}
4
+ {"time":"2025-10-08T16:38:32.187758092Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-10-08T16:38:32.198021978Z","level":"INFO","msg":"handleInformInit: received","streamId":"76mxu43t","id":"1(@)"}
6
+ {"time":"2025-10-08T16:38:33.339879992Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"76mxu43t","id":"1(@)"}
7
+ {"time":"2025-10-08T16:44:00.126902419Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"76mxu43t","id":"1(@)"}
8
+ {"time":"2025-10-08T16:44:00.130282224Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"76mxu43t","id":"1(@)"}
9
+ {"time":"2025-10-08T16:44:00.192589494Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-10-08T16:44:00.192627935Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-10-08T16:44:00.192634215Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-10-08T16:44:00.192643625Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-10-08T16:44:00.192695596Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2025-10-08T16:44:00.192701166Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2025-10-08T16:44:00.192695596Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-1926008-1926185-2566029014/socket","Net":"unix"}}
16
+ {"time":"2025-10-08T16:44:00.192730717Z","level":"INFO","msg":"server is closed"}
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-08T16:38:32.19998745Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-08T16:38:33.339827981Z","level":"INFO","msg":"stream: created new stream","id":"76mxu43t"}
3
+ {"time":"2025-10-08T16:38:33.339874102Z","level":"INFO","msg":"stream: started","id":"76mxu43t"}
4
+ {"time":"2025-10-08T16:38:33.339905492Z","level":"INFO","msg":"handler: started","stream_id":"76mxu43t"}
5
+ {"time":"2025-10-08T16:38:33.339893552Z","level":"INFO","msg":"writer: started","stream_id":"76mxu43t"}
6
+ {"time":"2025-10-08T16:38:33.339947873Z","level":"INFO","msg":"sender: started","stream_id":"76mxu43t"}
7
+ {"time":"2025-10-08T16:43:58.756754711Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.006574606}],"total_operations":1}}
8
+ {"time":"2025-10-08T16:43:59.766243448Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-10-08T16:44:00.127335826Z","level":"INFO","msg":"stream: closing","id":"76mxu43t"}
10
+ {"time":"2025-10-08T16:44:00.127349836Z","level":"INFO","msg":"handler: closed","stream_id":"76mxu43t"}
11
+ {"time":"2025-10-08T16:44:00.128408003Z","level":"INFO","msg":"sender: closed","stream_id":"76mxu43t"}
12
+ {"time":"2025-10-08T16:44:00.128424754Z","level":"INFO","msg":"stream: closed","id":"76mxu43t"}
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/logs/debug.log ADDED
File without changes
cleandesk_l1_regression/wandb/wandb/run-20251008_163831-76mxu43t/run-76mxu43t.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:592c2180c1f58861cad10bcce3042c15f3fa428cb470d5da0ac15865d108e860
3
+ size 205758
eraser_flow_matching/step11500-action-head/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb72b6306ce04d1beb20bb289509f00c39a40845ff7c4b36bf4deb4e83fe82a
3
+ size 1331
eraser_flow_matching/step12000-action-head/metadata.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:995307502120af3866f237cd0bc484fc848a652539d28e53cbea882abc16ba6b
3
+ size 1331
eraser_flow_matching/step12000-unsharded/config.yaml ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: eraser_20251011_163756
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: openai
22
+ image_default_input_size:
23
+ - 336
24
+ - 336
25
+ image_patch_size: 14
26
+ image_pos_patch_size: 14
27
+ image_emb_dim: 1024
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 23
31
+ image_head_dim: 64
32
+ image_mlp_dim: 4096
33
+ image_mlp_activations: quick_gelu
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 577
36
+ image_norm_eps: 1.0e-05
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: default
42
+ vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
43
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
44
+ low_cpu_fsdp: true
45
+ attention_type: sdpa
46
+ float32_attention: true
47
+ attention_dropout: 0.0
48
+ attention_layer_norm: false
49
+ residual_dropout: 0.1
50
+ response_residual_dropout: 0.0
51
+ embedding_dropout: 0.0
52
+ layer_norm_type: rms
53
+ layer_norm_with_affine: true
54
+ layer_norm_eps: 1.0e-06
55
+ attention_layer_norm_with_affine: true
56
+ max_sequence_length: 4096
57
+ max_position_embeddings: null
58
+ include_bias: false
59
+ bias_for_layer_norm: null
60
+ scale_logits: false
61
+ vocab_size: 152064
62
+ embedding_size: 152064
63
+ ff_out_size: null
64
+ additional_vocab_size: 128
65
+ new_embedding_init_range: 0.02
66
+ weight_tying: false
67
+ init_device: null
68
+ init_fn: normal
69
+ init_std: 0.02
70
+ init_cutoff_factor: null
71
+ norm_after: false
72
+ precision: amp_bf16
73
+ max_crops: 12
74
+ crop_mode: overlap-and-resize-c2
75
+ use_col_tokens: true
76
+ prompt_type: uber_model
77
+ system_prompt_kind: demo_or_style
78
+ message_formatting: role
79
+ always_start_with_space: true
80
+ multi_annotation_weighting: root_subsegments
81
+ default_inference_len: 65
82
+ overlap_margins:
83
+ - 4
84
+ - 4
85
+ pad_value: 0.0
86
+ image_padding_embed: pad_and_partial_pad
87
+ fix_image_padding: true
88
+ vit_layers:
89
+ - -2
90
+ - -9
91
+ image_pooling_h: 2
92
+ image_pooling_w: 2
93
+ image_pooling_2d: attention_meanq
94
+ image_projector: mlp
95
+ image_feature_dropout: 0.0
96
+ initializer_range: 0.02
97
+ normalize_input_embeds: false
98
+ use_position_ids: true
99
+ head_dim: null
100
+ action_tokenizer:
101
+ identifier: physical-intelligence/fast
102
+ tokenizer_dir: null
103
+ action_dim: 7
104
+ horizon: 8
105
+ tokenizer:
106
+ identifier: Qwen/Qwen2-7B
107
+ tokenizer_dir: null
108
+ pad_tokenizer: true
109
+ moe_num_experts: 8
110
+ moe_top_k: 2
111
+ moe_mlp_impl: sparse
112
+ moe_log_expert_assignment: false
113
+ moe_shared_expert: false
114
+ moe_lbl_in_fp32: false
115
+ moe_interleave: false
116
+ moe_loss_weight: 0.1
117
+ moe_zloss_weight: null
118
+ moe_dropless: true
119
+ moe_capacity_factor: 1.25
120
+ action_head: flow_matching
121
+ num_diffusion_steps: 1000
122
+ num_diffusion_inference_steps: 30
123
+ use_proprio: true
124
+ action_head_dit_hidden_size: 1152
125
+ action_head_dit_depth: 28
126
+ action_head_dit_num_heads: 16
127
+ llm_causal_attention: false
128
+ action_use_left_eef: true
129
+ action_use_mobile_base: false
130
+ allow_resume: false
131
+ ft_llm: true
132
+ ft_vit: false
133
+ ft_connector: false
134
+ ft_embedding: lm_head
135
+ lora: false
136
+ use_lora: false
137
+ lora_rank: 8
138
+ lora_llm: false
139
+ lora_vit: false
140
+ lora_connector: false
141
+ early_exit: false
142
+ train_exit_random_layer: false
143
+ optimizer:
144
+ name: adamw
145
+ learning_rate: 0.0001
146
+ weight_decay: 0.01
147
+ betas:
148
+ - 0.9
149
+ - 0.95
150
+ eps: 1.0e-05
151
+ connector_learning_rate: 0.0002
152
+ vit_learning_rate: 6.0e-06
153
+ llm_learning_rate: 5.0e-05
154
+ connector_weight_decay: 0.0
155
+ vit_weight_decay: 0.0
156
+ llm_weight_decay: 0.0
157
+ connector_betas:
158
+ - 0.9
159
+ - 0.95
160
+ vit_betas:
161
+ - 0.9
162
+ - 0.95
163
+ llm_betas:
164
+ - 0.9
165
+ - 0.95
166
+ connector_eps: 1.0e-06
167
+ vit_eps: 1.0e-06
168
+ llm_eps: 1.0e-06
169
+ metrics_log_interval: 20
170
+ scheduler:
171
+ name: multimodal
172
+ units: steps
173
+ t_warmup: 100
174
+ t_max: null
175
+ alpha_f: 0.1
176
+ connector_t_warmup: 200
177
+ vit_t_warmup: 2000
178
+ llm_t_warmup: 2000
179
+ grad_clip_warmup_steps: null
180
+ grad_clip_warmup_factor: null
181
+ warmup_min_lr: 0.0
182
+ data:
183
+ dataset: vla_dataset_realworld
184
+ mixture: null
185
+ root_size_mixture: null
186
+ split: train
187
+ seed: 95818
188
+ shuffle_messages: false
189
+ pad: to_max
190
+ sequence_length: 1600
191
+ shuffle: true
192
+ for_inference: false
193
+ multi_modal: torch
194
+ num_workers: 0
195
+ drop_last: true
196
+ pin_memory: true
197
+ prefetch_factor: null
198
+ persistent_workers: false
199
+ timeout: 0
200
+ rlds_dataset_name: a1_real_world
201
+ rlds_data_root_dir: /vast/users/xiaodan/zhangjian/datasets/OXE
202
+ use_wrist_image: true
203
+ use_proprio: true
204
+ rlds_shuffle_buffer_size: 100000
205
+ rlds_traj_threads: 8
206
+ rlds_read_threads: 8
207
+ lerobot_episode_index_start: null
208
+ lerobot_episode_index_end: null
209
+ restore_dataloader: true
210
+ fast_forward_batches: null
211
+ evaluators:
212
+ - label: val
213
+ data:
214
+ dataset: vla_dataset_realworld
215
+ mixture: null
216
+ root_size_mixture: null
217
+ split: validation
218
+ seed: null
219
+ shuffle_messages: false
220
+ pad: to_max
221
+ sequence_length: 1600
222
+ shuffle: false
223
+ for_inference: false
224
+ multi_modal: torch
225
+ num_workers: 0
226
+ drop_last: true
227
+ pin_memory: true
228
+ prefetch_factor: null
229
+ persistent_workers: true
230
+ timeout: 0
231
+ rlds_dataset_name: libero_4_task_suites_no_noops
232
+ rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
233
+ use_wrist_image: true
234
+ use_proprio: true
235
+ rlds_shuffle_buffer_size: 256000
236
+ rlds_traj_threads: 8
237
+ rlds_read_threads: 8
238
+ lerobot_episode_index_start: 353
239
+ lerobot_episode_index_end: 765
240
+ device_eval_batch_size: null
241
+ subset_num_batches: 64
242
+ max_examples: null
243
+ max_new_tokens: 448
244
+ mm_evaluator: null
245
+ save_dir: null
246
+ save_to_checkpoint_dir: false
247
+ eval_name: null
248
+ skip_if_metrics_cached: true
249
+ eval_interval: 0
250
+ inf_eval_interval: -1
251
+ inf_evaluators: []
252
+ save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/eraser_flow_matching
253
+ remote_save_folder: null
254
+ canceled_check_interval: 50
255
+ save_interval: 500
256
+ save_interval_unsharded: 500
257
+ save_interval_ephemeral: null
258
+ save_interval_action_head: 500
259
+ save_num_checkpoints_to_keep: 1
260
+ save_num_unsharded_checkpoints_to_keep: 1
261
+ save_num_action_head_checkpoints_to_keep: 2
262
+ save_overwrite: true
263
+ force_save_unsharded: false
264
+ no_pre_train_checkpoint: true
265
+ initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
266
+ load_model_config: null
267
+ checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
268
+ load_path: null
269
+ load_path_sharded_checkpointer: null
270
+ reset_optimizer_state: false
271
+ reset_trainer_state: false
272
+ save_dataloader_state: false
273
+ reset_dataloader_state: false
274
+ keep_lr_on_load: true
275
+ sharded_checkpointer: torch_legacy
276
+ max_duration: 500000
277
+ global_train_batch_size: 126
278
+ device_train_batch_size: 15
279
+ device_train_microbatch_size: 16
280
+ device_eval_batch_size: 4
281
+ eval_subset_num_batches: -1
282
+ eval_on_load: false
283
+ device_inf_eval_batch_size: 16
284
+ inf_eval_subset_num_batches: -1
285
+ device_train_grad_accum: 0
286
+ max_grad_norm: 1.0
287
+ multi_component_grad_norm: true
288
+ batch_divisor: global_batch
289
+ max_grad_norm_ratio: null
290
+ precision: amp_bf16
291
+ wandb:
292
+ project: a1-realworld
293
+ entity: henryeap
294
+ group: null
295
+ name: eraser_20251011_163756
296
+ tags:
297
+ - watching
298
+ log_artifacts: false
299
+ rank_zero_only: true
300
+ log_interval: 1
301
+ speed_monitor:
302
+ window_size: 20
303
+ gpu_flops_available: null
304
+ console_log_interval: 1
305
+ gen1_gc_interval: 1
306
+ compile: null
307
+ fsdp:
308
+ use_orig_params: true
309
+ sharding_strategy: FULL_SHARD
310
+ wrapping_strategy: by_block_and_size
311
+ precision: float
312
+ hybrid_sharding_num_model_replicas: null
313
+ softmax_auxiliary_loss: true
314
+ softmax_auxiliary_loss_scale: 0.0001
315
+ time_limit: null
316
+ extra_steps_after_cancel: 10
317
+ python_profiling: false
318
+ torch_profiling: false
319
+ stop_at: 500000
320
+ stop_after: null
321
+ activation_checkpointing: whole_layer
322
+ fused_loss: null
eraser_flow_matching/step12000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27b7944a86ee4e879d03cec156d2128c480602778b651762977174d39e5f94ab
3
+ size 15061
eraser_flow_matching/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-10-11T16:38:32.587302182Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
2
+ {"time":"2025-10-11T16:38:33.732885593Z","level":"INFO","msg":"stream: created new stream","id":"yqnt28c8"}
3
+ {"time":"2025-10-11T16:38:33.732932354Z","level":"INFO","msg":"stream: started","id":"yqnt28c8"}
4
+ {"time":"2025-10-11T16:38:33.732959824Z","level":"INFO","msg":"writer: started","stream_id":"yqnt28c8"}
5
+ {"time":"2025-10-11T16:38:33.732985135Z","level":"INFO","msg":"handler: started","stream_id":"yqnt28c8"}
6
+ {"time":"2025-10-11T16:38:33.732961384Z","level":"INFO","msg":"sender: started","stream_id":"yqnt28c8"}
7
+ {"time":"2025-10-13T08:15:22.219814038Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-10-13T20:01:35.03758236Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
eraser_flow_matching/wandb/wandb/debug.log ADDED
File without changes