Dongkkka commited on
Commit
654fde4
·
verified ·
1 Parent(s): edea723

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 40,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": false,
5
+ "architectures": [
6
+ "Gr00tN1d7"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_trainable_params_fp32": true,
12
+ "color_jitter_params": {
13
+ "brightness": 0.3,
14
+ "contrast": 0.4,
15
+ "hue": 0.08,
16
+ "saturation": 0.5
17
+ },
18
+ "crop_fraction": 0.95,
19
+ "diffusion_model_cfg": {
20
+ "attention_head_dim": 48,
21
+ "dropout": 0.2,
22
+ "final_dropout": true,
23
+ "interleave_self_attention": true,
24
+ "norm_type": "ada_norm",
25
+ "num_attention_heads": 32,
26
+ "num_layers": 32,
27
+ "output_dim": 1024,
28
+ "positional_embeddings": null
29
+ },
30
+ "dtype": "float32",
31
+ "exclude_state": false,
32
+ "formalize_language": true,
33
+ "hidden_size": 1024,
34
+ "image_crop_size": [
35
+ 230,
36
+ 230
37
+ ],
38
+ "image_target_size": [
39
+ 256,
40
+ 256
41
+ ],
42
+ "letter_box_transform": false,
43
+ "load_bf16": false,
44
+ "max_action_dim": 132,
45
+ "max_num_embodiments": 32,
46
+ "max_seq_len": 1024,
47
+ "max_state_dim": 132,
48
+ "model_dtype": "bfloat16",
49
+ "model_name": "nvidia/Cosmos-Reason2-2B",
50
+ "model_type": "Gr00tN1d7",
51
+ "noise_beta_alpha": 1.5,
52
+ "noise_beta_beta": 1.0,
53
+ "noise_s": 0.999,
54
+ "num_inference_timesteps": 4,
55
+ "num_timestep_buckets": 1000,
56
+ "random_history_crop": true,
57
+ "random_rotation_angle": 0,
58
+ "reproject_vision": false,
59
+ "rtc_ramp_rate": 6.0,
60
+ "select_layer": 16,
61
+ "shortest_image_edge": 256,
62
+ "state_dropout_prob": 0.2,
63
+ "state_gaussian_noise_std": 0.0,
64
+ "transformers_version": "4.57.3",
65
+ "tune_diffusion_model": true,
66
+ "tune_linear": true,
67
+ "tune_llm": false,
68
+ "tune_projector": true,
69
+ "tune_top_llm_layers": 0,
70
+ "tune_visual": false,
71
+ "tune_vlln": true,
72
+ "use_albumentations": true,
73
+ "use_alternate_vl_dit": true,
74
+ "use_flash_attention": true,
75
+ "use_future_tokens": false,
76
+ "use_mean_std": false,
77
+ "use_percentiles": true,
78
+ "use_vl_self_attention": true,
79
+ "use_vlln": true,
80
+ "vl_self_attention_cfg": {
81
+ "attention_head_dim": 64,
82
+ "dropout": 0.2,
83
+ "final_dropout": true,
84
+ "num_attention_heads": 32,
85
+ "num_layers": 4,
86
+ "positional_embeddings": null
87
+ }
88
+ }
embodiment_id.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "oxe_droid": 17,
4
+ "oxe_fractal": 18,
5
+ "oxe_language_table": 19,
6
+ "oxe_bridge": 20,
7
+ "unknown": 22,
8
+ "gr1_unified": 20,
9
+ "agibot": 26,
10
+ "sim_behavior_r1_pro": 23,
11
+ "xdof": 24,
12
+ "xdof_oss_data": 25,
13
+ "unitree_g1_full_body_with_waist_height_nav_cmd": 25,
14
+ "real_r1_pro_sharpa": 27,
15
+ "real_r1_pro_sharpa_add_view": 27,
16
+ "real_r1_pro_sharpa_relative_arm_joint": 26,
17
+ "real_r1_pro_sharpa_delta_eef": 26,
18
+ "real_r1_pro_sharpa_absolute_eef": 26,
19
+ "real_r1_pro_sharpa_meanstd": 26,
20
+ "real_r1_pro_sharpa_relative_eef": 26,
21
+ "real_r1_pro_sharpa_relative_eef_add_view": 26,
22
+ "real_r1_pro_sharpa_relative_eef_relative_hand": 26,
23
+ "real_r1_pro_sharpa_relative_eef_human": 26,
24
+ "real_r1_pro_sharpa_relative_eef_human_add_view": 26,
25
+ "real_r1_pro_sharpa_relative_eef_human_relative_hand": 26,
26
+ "real_r1_pro_sharpa_relative_eef_egodex": 26,
27
+ "real_r1_pro_sharpa_relative_eef_egodex_relative_hand": 26,
28
+ "real_r1_pro_sharpa_relative_eef_egodex_wrist_only": 26,
29
+ "real_r1_pro_sharpa_relative_eef_maxinsights": 26,
30
+ "real_r1_pro_sharpa_relative_eef_maxinsights_relative_hand": 26,
31
+ "real_r1_pro_sharpa_relative_eef_mecka": 26,
32
+ "real_r1_pro_sharpa_relative_eef_mecka_relative_hand": 26,
33
+ "real_g1_relative_eef_absolute_joints": 25,
34
+ "real_g1_relative_eef_absolute_joints_wrist_cam": 25,
35
+ "real_g1_relative_eef_relative_joints": 25,
36
+ "real_r1_pro_sharpa_relative_eef_relative_hand_relative_joint": 26,
37
+ "real_r1_pro_sharpa_relative_joint": 29,
38
+ "oxe_droid_relative_eef_relative_joint": 24,
39
+ "oxe_droid_relative_eef_relative_joint_swapped": 24,
40
+ "oxe_droid_relative_eef_relative_joint_upweight_z": 24,
41
+ "oxe_droid_relative_eef_relative_joint_upweight_z_swapped": 24,
42
+ "oxe_droid_relative_eef_relative_joint_3view": 24,
43
+ "oxe_droid_relative_eef_relative_joint_3view_swapped": 24,
44
+ "oxe_droid_relative_eef": 24,
45
+ "oxe_droid_joint_position_relative": 24,
46
+ "xdof_relative_eef_relative_joint": 27,
47
+ "xdof_relative_eef_relative_joint_subtask": 27,
48
+ "xdof_relative_eef": 27,
49
+ "xdof_relative_joint": 28,
50
+ "simpler_env_google": 0,
51
+ "simpler_env_widowx": 1,
52
+ "libero_sim": 2,
53
+ "droid_sim": 3,
54
+ "new_embodiment": 10
55
+ }
experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d7
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Cosmos-Reason2-2B
6
+ backbone_model_type: qwen
7
+ model_revision: null
8
+ tune_top_llm_layers: 0
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 12
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ backbone_trainable_params_fp32: true
17
+ image_crop_size:
18
+ - 230
19
+ - 230
20
+ image_target_size:
21
+ - 256
22
+ - 256
23
+ shortest_image_edge: null
24
+ crop_fraction: null
25
+ random_rotation_angle: null
26
+ color_jitter_params: null
27
+ use_albumentations_transforms: true
28
+ extra_augmentation_config: null
29
+ formalize_language: true
30
+ apply_sincos_state_encoding: false
31
+ use_percentiles: true
32
+ use_relative_action: true
33
+ max_state_dim: 132
34
+ max_action_dim: 132
35
+ action_horizon: 40
36
+ hidden_size: 1024
37
+ input_embedding_dim: 1536
38
+ state_history_length: 1
39
+ add_pos_embed: true
40
+ attn_dropout: 0.2
41
+ use_vlln: true
42
+ max_seq_len: 1024
43
+ use_alternate_vl_dit: true
44
+ attend_text_every_n_blocks: 2
45
+ diffusion_model_cfg:
46
+ positional_embeddings: null
47
+ num_layers: 16
48
+ num_attention_heads: 32
49
+ attention_head_dim: 48
50
+ norm_type: ada_norm
51
+ dropout: 0.2
52
+ final_dropout: true
53
+ output_dim: 1024
54
+ interleave_self_attention: true
55
+ num_inference_timesteps: 4
56
+ noise_beta_alpha: 1.5
57
+ noise_beta_beta: 1.0
58
+ noise_s: 0.999
59
+ num_timestep_buckets: 1000
60
+ tune_projector: true
61
+ tune_diffusion_model: true
62
+ tune_vlln: true
63
+ state_dropout_prob: 0.2
64
+ exclude_state: false
65
+ use_mean_std: false
66
+ max_num_embodiments: 32
67
+ data:
68
+ datasets:
69
+ - dataset_paths:
70
+ - /data/datasets/Task_0012_CleanTableTrash_primitives
71
+ embodiment_tag: new_embodiment
72
+ mix_ratio: 1.0
73
+ dataset_type: physical_embodiment
74
+ val_dataset_path: null
75
+ modality_configs:
76
+ new_embodiment:
77
+ video:
78
+ delta_indices:
79
+ - 0
80
+ modality_keys:
81
+ - cam_left_head
82
+ - cam_left_wrist
83
+ - cam_right_wrist
84
+ sin_cos_embedding_keys: null
85
+ mean_std_embedding_keys: null
86
+ action_configs: null
87
+ state:
88
+ delta_indices:
89
+ - 0
90
+ modality_keys:
91
+ - arm_left
92
+ - arm_right
93
+ - head
94
+ - lift
95
+ - odometry
96
+ sin_cos_embedding_keys: null
97
+ mean_std_embedding_keys: null
98
+ action_configs: null
99
+ action:
100
+ delta_indices:
101
+ - 0
102
+ - 1
103
+ - 2
104
+ - 3
105
+ - 4
106
+ - 5
107
+ - 6
108
+ - 7
109
+ - 8
110
+ - 9
111
+ - 10
112
+ - 11
113
+ - 12
114
+ - 13
115
+ - 14
116
+ - 15
117
+ modality_keys:
118
+ - arm_left
119
+ - arm_right
120
+ - head
121
+ - lift
122
+ - odometry
123
+ sin_cos_embedding_keys: null
124
+ mean_std_embedding_keys: null
125
+ action_configs:
126
+ - rep: ABSOLUTE
127
+ type: NON_EEF
128
+ format: DEFAULT
129
+ state_key: null
130
+ - rep: ABSOLUTE
131
+ type: NON_EEF
132
+ format: DEFAULT
133
+ state_key: null
134
+ - rep: ABSOLUTE
135
+ type: NON_EEF
136
+ format: DEFAULT
137
+ state_key: null
138
+ - rep: ABSOLUTE
139
+ type: NON_EEF
140
+ format: DEFAULT
141
+ state_key: null
142
+ - rep: ABSOLUTE
143
+ type: NON_EEF
144
+ format: DEFAULT
145
+ state_key: null
146
+ language:
147
+ delta_indices:
148
+ - 0
149
+ modality_keys:
150
+ - annotation.human.primitive_instruction
151
+ sin_cos_embedding_keys: null
152
+ mean_std_embedding_keys: null
153
+ action_configs: null
154
+ download_cache: false
155
+ shard_size: 1024
156
+ episode_sampling_rate: 0.1
157
+ num_shards_per_epoch: 100000
158
+ override_pretraining_statistics: true
159
+ mode: single_turn
160
+ random_chop: 0.0
161
+ mock_dataset_mode: false
162
+ shuffle: true
163
+ seed: 42
164
+ multiprocessing_context: fork
165
+ allow_padding: false
166
+ subsample_ratio: 1.0
167
+ image_crop_size:
168
+ - 244
169
+ - 244
170
+ image_target_size:
171
+ - 224
172
+ - 224
173
+ video_backend: torchcodec
174
+ training:
175
+ output_dir: /data/checkpoints
176
+ experiment_name: ffw_sg2_0504_0444
177
+ max_steps: 100000
178
+ global_batch_size: 4
179
+ batch_size: null
180
+ gradient_accumulation_steps: 1
181
+ learning_rate: 0.0001
182
+ lr_scheduler_type: cosine
183
+ weight_decay: 1.0e-05
184
+ warmup_ratio: 0.05
185
+ warmup_steps: 0
186
+ max_grad_norm: 1.0
187
+ optim: adamw_torch
188
+ start_from_checkpoint: nvidia/GR00T-N1.7-3B
189
+ skip_weight_loading: false
190
+ tf32: true
191
+ fp16: false
192
+ bf16: true
193
+ eval_bf16: true
194
+ logging_steps: 10
195
+ save_steps: 5000
196
+ save_total_limit: 2
197
+ save_vl_model: false
198
+ save_only_model: false
199
+ upload_checkpoints: false
200
+ upload_every: 1000
201
+ upload_last_n_checkpoints: 5
202
+ max_concurrent_uploads: 2
203
+ eval_strategy: 'no'
204
+ eval_steps: 500
205
+ eval_set_split_ratio: 0.1
206
+ eval_batch_size: 2
207
+ save_best_eval_metric_name: ''
208
+ save_best_eval_metric_greater_is_better: true
209
+ deepspeed_stage: 2
210
+ gradient_checkpointing: false
211
+ transformers_trust_remote_code: true
212
+ transformers_local_files_only: false
213
+ transformers_cache_dir: null
214
+ transformers_access_token: null
215
+ use_ddp: false
216
+ ddp_bucket_cap_mb: 100
217
+ num_gpus: 1
218
+ dataloader_num_workers: 4
219
+ remove_unused_columns: false
220
+ use_wandb: false
221
+ wandb_project: finetune-gr00t-n1d7
222
+ enable_profiling: false
223
+ max_retries: 3
224
+ assert_loss_less_than: null
225
+ add_rl_callback: false
226
+ enable_open_loop_eval: false
227
+ open_loop_eval_traj_ids:
228
+ - 0
229
+ open_loop_eval_steps_per_traj: 100
230
+ open_loop_eval_plot_indices: null
231
+ max_steps: 100000
232
+ save_steps: 5000
experiment_cfg/config.yaml ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /data/datasets/Task_0012_CleanTableTrash_primitives
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ - !!python/object:gr00t.data.types.ActionConfig
39
+ format: *id001
40
+ rep: *id002
41
+ state_key: null
42
+ type: *id003
43
+ - !!python/object:gr00t.data.types.ActionConfig
44
+ format: *id001
45
+ rep: *id002
46
+ state_key: null
47
+ type: *id003
48
+ - !!python/object:gr00t.data.types.ActionConfig
49
+ format: *id001
50
+ rep: *id002
51
+ state_key: null
52
+ type: *id003
53
+ delta_indices:
54
+ - 0
55
+ - 1
56
+ - 2
57
+ - 3
58
+ - 4
59
+ - 5
60
+ - 6
61
+ - 7
62
+ - 8
63
+ - 9
64
+ - 10
65
+ - 11
66
+ - 12
67
+ - 13
68
+ - 14
69
+ - 15
70
+ mean_std_embedding_keys: null
71
+ modality_keys:
72
+ - arm_left
73
+ - arm_right
74
+ - head
75
+ - lift
76
+ - odometry
77
+ sin_cos_embedding_keys: null
78
+ language: !!python/object:gr00t.data.types.ModalityConfig
79
+ action_configs: null
80
+ delta_indices:
81
+ - 0
82
+ mean_std_embedding_keys: null
83
+ modality_keys:
84
+ - annotation.human.primitive_instruction
85
+ sin_cos_embedding_keys: null
86
+ state: !!python/object:gr00t.data.types.ModalityConfig
87
+ action_configs: null
88
+ delta_indices:
89
+ - 0
90
+ mean_std_embedding_keys: null
91
+ modality_keys:
92
+ - arm_left
93
+ - arm_right
94
+ - head
95
+ - lift
96
+ - odometry
97
+ sin_cos_embedding_keys: null
98
+ video: !!python/object:gr00t.data.types.ModalityConfig
99
+ action_configs: null
100
+ delta_indices:
101
+ - 0
102
+ mean_std_embedding_keys: null
103
+ modality_keys:
104
+ - cam_left_head
105
+ - cam_left_wrist
106
+ - cam_right_wrist
107
+ sin_cos_embedding_keys: null
108
+ mode: single_turn
109
+ multiprocessing_context: fork
110
+ num_shards_per_epoch: 100000
111
+ override_pretraining_statistics: true
112
+ random_chop: 0.0
113
+ seed: 42
114
+ shard_size: 1024
115
+ shuffle: true
116
+ subsample_ratio: 1.0
117
+ video_backend: torchcodec
118
+ load_config_path: null
119
+ model: !!python/object:gr00t.configs.model.gr00t_n1d7.Gr00tN1d7Config
120
+ _attn_implementation_internal: null
121
+ _commit_hash: null
122
+ _name_or_path: ''
123
+ _output_attentions: false
124
+ add_cross_attention: false
125
+ architectures: null
126
+ backbone_trainable_params_fp32: true
127
+ bad_words_ids: null
128
+ begin_suppress_tokens: null
129
+ bos_token_id: null
130
+ chunk_size_feed_forward: 0
131
+ color_jitter_params: null
132
+ cross_attention_hidden_size: null
133
+ decoder_start_token_id: null
134
+ diffusion_model_cfg:
135
+ attention_head_dim: 48
136
+ dropout: 0.2
137
+ final_dropout: true
138
+ interleave_self_attention: true
139
+ norm_type: ada_norm
140
+ num_attention_heads: 32
141
+ num_layers: 16
142
+ output_dim: 1024
143
+ positional_embeddings: null
144
+ diversity_penalty: 0.0
145
+ do_sample: false
146
+ dtype: null
147
+ early_stopping: false
148
+ encoder_no_repeat_ngram_size: 0
149
+ eos_token_id: null
150
+ exponential_decay_length_penalty: null
151
+ extra_augmentation_config: null
152
+ finetuning_task: null
153
+ forced_bos_token_id: null
154
+ forced_eos_token_id: null
155
+ id2label:
156
+ 0: LABEL_0
157
+ 1: LABEL_1
158
+ is_decoder: false
159
+ is_encoder_decoder: false
160
+ label2id:
161
+ LABEL_0: 0
162
+ LABEL_1: 1
163
+ length_penalty: 1.0
164
+ load_bf16: false
165
+ max_length: 20
166
+ min_length: 0
167
+ model_name: nvidia/Cosmos-Reason2-2B
168
+ no_repeat_ngram_size: 0
169
+ num_beam_groups: 1
170
+ num_beams: 1
171
+ num_return_sequences: 1
172
+ output_hidden_states: false
173
+ output_scores: false
174
+ pad_token_id: null
175
+ prefix: null
176
+ problem_type: null
177
+ pruned_heads: {}
178
+ random_rotation_angle: null
179
+ remove_invalid_values: false
180
+ repetition_penalty: 1.0
181
+ reproject_vision: false
182
+ return_dict: true
183
+ return_dict_in_generate: false
184
+ sep_token_id: null
185
+ state_dropout_prob: 0.2
186
+ suppress_tokens: null
187
+ task_specific_params: null
188
+ temperature: 1.0
189
+ tf_legacy_loss: false
190
+ tie_encoder_decoder: false
191
+ tie_word_embeddings: true
192
+ tokenizer_class: null
193
+ top_k: 50
194
+ top_p: 1.0
195
+ torchscript: false
196
+ transformers_version: null
197
+ tune_diffusion_model: true
198
+ tune_llm: false
199
+ tune_projector: true
200
+ tune_visual: false
201
+ typical_p: 1.0
202
+ use_bfloat16: false
203
+ use_relative_action: true
204
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
205
+ add_rl_callback: false
206
+ assert_loss_less_than: null
207
+ batch_size: null
208
+ bf16: true
209
+ dataloader_num_workers: 4
210
+ ddp_bucket_cap_mb: 100
211
+ deepspeed_stage: 2
212
+ enable_open_loop_eval: false
213
+ enable_profiling: false
214
+ eval_batch_size: 2
215
+ eval_bf16: true
216
+ eval_set_split_ratio: 0.1
217
+ eval_steps: 500
218
+ eval_strategy: 'no'
219
+ experiment_name: ffw_sg2_0504_0444
220
+ fp16: false
221
+ global_batch_size: 4
222
+ gradient_accumulation_steps: 1
223
+ gradient_checkpointing: false
224
+ learning_rate: 0.0001
225
+ logging_steps: 10
226
+ lr_scheduler_type: cosine
227
+ max_concurrent_uploads: 2
228
+ max_grad_norm: 1.0
229
+ max_retries: 3
230
+ max_steps: 100000
231
+ num_gpus: 1
232
+ open_loop_eval_plot_indices: null
233
+ open_loop_eval_steps_per_traj: 100
234
+ open_loop_eval_traj_ids:
235
+ - 0
236
+ optim: adamw_torch
237
+ output_dir: /data/checkpoints
238
+ remove_unused_columns: false
239
+ save_best_eval_metric_greater_is_better: true
240
+ save_best_eval_metric_name: ''
241
+ save_only_model: false
242
+ save_steps: 5000
243
+ save_total_limit: 2
244
+ save_vl_model: false
245
+ skip_weight_loading: false
246
+ start_from_checkpoint: nvidia/GR00T-N1.7-3B
247
+ tf32: true
248
+ transformers_access_token: null
249
+ transformers_cache_dir: null
250
+ transformers_local_files_only: false
251
+ transformers_trust_remote_code: true
252
+ upload_checkpoints: false
253
+ upload_every: 1000
254
+ upload_last_n_checkpoints: 5
255
+ use_ddp: false
256
+ use_wandb: false
257
+ wandb_project: finetune-gr00t-n1d7
258
+ warmup_ratio: 0.05
259
+ warmup_steps: 0
260
+ weight_decay: 1.0e-05
experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm_left": {
5
+ "min": [
6
+ -0.4248647391796112,
7
+ 0.0291815884411335,
8
+ -1.2500265836715698,
9
+ -2.3462235927581787,
10
+ -1.330955982208252,
11
+ -1.0271918773651123,
12
+ -1.4225958585739136,
13
+ 0.0
14
+ ],
15
+ "max": [
16
+ 1.060687780380249,
17
+ 0.9034667611122131,
18
+ 0.5867117047309875,
19
+ -0.5952804088592529,
20
+ 0.8997636437416077,
21
+ 1.0742899179458618,
22
+ 1.0339692831039429,
23
+ 1.0139613151550293
24
+ ],
25
+ "mean": [
26
+ 0.10058338940143585,
27
+ 0.27535390853881836,
28
+ 0.05904501676559448,
29
+ -1.7770419120788574,
30
+ -0.12517733871936798,
31
+ 0.571687638759613,
32
+ -0.9227365851402283,
33
+ 0.28754305839538574
34
+ ],
35
+ "std": [
36
+ 0.26586952805519104,
37
+ 0.10868094861507416,
38
+ 0.30604055523872375,
39
+ 0.2976902425289154,
40
+ 0.36704811453819275,
41
+ 0.386583536863327,
42
+ 0.6600474119186401,
43
+ 0.3211342692375183
44
+ ],
45
+ "q01": [
46
+ -0.24966759622097015,
47
+ 0.09792309999465942,
48
+ -1.007131450176239,
49
+ -2.053797607421875,
50
+ -1.012241334915161,
51
+ -0.4580974680185318,
52
+ -1.422006368637085,
53
+ 0.05215534567832947
54
+ ],
55
+ "q99": [
56
+ 0.8234109246730792,
57
+ 0.7166465771198269,
58
+ 0.289665434360504,
59
+ -0.7995661628246308,
60
+ 0.7091967177391052,
61
+ 0.8360075354576111,
62
+ 0.8503720080852498,
63
+ 1.0108933448791504
64
+ ]
65
+ },
66
+ "arm_right": {
67
+ "min": [
68
+ -0.13799834251403809,
69
+ -0.8309382200241089,
70
+ -0.6199079751968384,
71
+ -2.6787378787994385,
72
+ -0.6936229467391968,
73
+ -1.0333278179168701,
74
+ -0.0980280339717865,
75
+ 0.013805827125906944
76
+ ],
77
+ "max": [
78
+ 1.2520039081573486,
79
+ -0.16753946244716644,
80
+ 1.225195288658142,
81
+ -1.0728877782821655,
82
+ 0.6754189133644104,
83
+ 1.031781792640686,
84
+ 1.364180326461792,
85
+ 0.5967185497283936
86
+ ],
87
+ "mean": [
88
+ 0.37771376967430115,
89
+ -0.333850234746933,
90
+ -0.16629818081855774,
91
+ -2.1338770389556885,
92
+ 0.23977188766002655,
93
+ 0.45303064584732056,
94
+ 1.213043451309204,
95
+ 0.11304953694343567
96
+ ],
97
+ "std": [
98
+ 0.2763381600379944,
99
+ 0.12889380753040314,
100
+ 0.31046074628829956,
101
+ 0.24189542233943892,
102
+ 0.18755820393562317,
103
+ 0.5512157082557678,
104
+ 0.3186982572078705,
105
+ 0.10911875218153
106
+ ],
107
+ "q01": [
108
+ 0.011019254736602318,
109
+ -0.7038246548175812,
110
+ -0.3635774254798889,
111
+ -2.4432670545578,
112
+ -0.385044270157814,
113
+ -0.4924531447887421,
114
+ 0.20742268860340118,
115
+ 0.058291271328926086
116
+ ],
117
+ "q99": [
118
+ 0.817494571208953,
119
+ -0.2223912626504898,
120
+ 1.050566117763519,
121
+ -1.2690210890769968,
122
+ 0.4533466857671731,
123
+ 0.9372382760047913,
124
+ 1.3637149333953857,
125
+ 0.5338253378868103
126
+ ]
127
+ },
128
+ "head": {
129
+ "min": [
130
+ 0.8881748914718628,
131
+ -0.34821364283561707
132
+ ],
133
+ "max": [
134
+ 0.8989127278327942,
135
+ 0.3497476279735565
136
+ ],
137
+ "mean": [
138
+ 0.8951596021652222,
139
+ 0.01902701146900654
140
+ ],
141
+ "std": [
142
+ 0.0015153294661875158,
143
+ 0.16179992258548737
144
+ ],
145
+ "q01": [
146
+ 0.8912428617477417,
147
+ -0.34514567255973816
148
+ ],
149
+ "q99": [
150
+ 0.8973787426948547,
151
+ 0.3466796576976776
152
+ ]
153
+ },
154
+ "lift": {
155
+ "min": [
156
+ -0.16487999260425568
157
+ ],
158
+ "max": [
159
+ -0.08263999968767166
160
+ ],
161
+ "mean": [
162
+ -0.11997351795434952
163
+ ],
164
+ "std": [
165
+ 0.02227330580353737
166
+ ],
167
+ "q01": [
168
+ -0.16436000168323517
169
+ ],
170
+ "q99": [
171
+ -0.08314000070095062
172
+ ]
173
+ },
174
+ "odometry": {
175
+ "min": [
176
+ -0.34160396456718445,
177
+ -0.023111142218112946,
178
+ -0.5214874148368835
179
+ ],
180
+ "max": [
181
+ 0.3456443250179291,
182
+ 0.3591441512107849,
183
+ 0.43391692638397217
184
+ ],
185
+ "mean": [
186
+ 0.014666633680462837,
187
+ 0.004867780953645706,
188
+ -0.03149914741516113
189
+ ],
190
+ "std": [
191
+ 0.07054363191127777,
192
+ 0.03703222796320915,
193
+ 0.11796847730875015
194
+ ],
195
+ "q01": [
196
+ -0.009144261851906775,
197
+ -0.0012328855972737072,
198
+ -0.49353188574314116
199
+ ],
200
+ "q99": [
201
+ 0.3272102481126785,
202
+ 0.29953320503234626,
203
+ 0.008057233486324535
204
+ ]
205
+ }
206
+ },
207
+ "action": {
208
+ "arm_left": {
209
+ "min": [
210
+ -0.424912691116333,
211
+ 0.029145635664463043,
212
+ -1.2501943111419678,
213
+ -2.348524570465088,
214
+ -1.3314952850341797,
215
+ -1.0312517881393433,
216
+ -1.421995997428894,
217
+ 0.0
218
+ ],
219
+ "max": [
220
+ 1.0643813610076904,
221
+ 0.905048668384552,
222
+ 0.587591826915741,
223
+ -0.5951845645904541,
224
+ 0.9004467129707336,
225
+ 1.0753204822540283,
226
+ 1.0339030027389526,
227
+ 1.100000023841858
228
+ ],
229
+ "mean": [
230
+ 0.10058759897947311,
231
+ 0.27536019682884216,
232
+ 0.05905205011367798,
233
+ -1.7770514488220215,
234
+ -0.12516692280769348,
235
+ 0.5716906189918518,
236
+ -0.922753095626831,
237
+ 0.3007752001285553
238
+ ],
239
+ "std": [
240
+ 0.266132652759552,
241
+ 0.10879254341125488,
242
+ 0.3062238395214081,
243
+ 0.29803964495658875,
244
+ 0.36735913157463074,
245
+ 0.38676244020462036,
246
+ 0.6600767970085144,
247
+ 0.3554460406303406
248
+ ],
249
+ "q01": [
250
+ -0.2504376983642578,
251
+ 0.09817477315664291,
252
+ -1.0066902256011963,
253
+ -2.0540003776550293,
254
+ -1.0147589874267577,
255
+ -0.45599114060401913,
256
+ -1.421995997428894,
257
+ 0.04175168164074424
258
+ ],
259
+ "q99": [
260
+ 0.8252816796302795,
261
+ 0.7163690328598022,
262
+ 0.2883884012699127,
263
+ -0.7992039918899536,
264
+ 0.7102330923080444,
265
+ 0.8360075354576111,
266
+ 0.8486902332305883,
267
+ 1.100000023841858
268
+ ]
269
+ },
270
+ "arm_right": {
271
+ "min": [
272
+ -0.13805827498435974,
273
+ -0.832951545715332,
274
+ -0.6220361590385437,
275
+ -2.6798644065856934,
276
+ -0.701783299446106,
277
+ -1.0448626279830933,
278
+ -0.09854990243911743,
279
+ 0.0
280
+ ],
281
+ "max": [
282
+ 1.2550530433654785,
283
+ -0.1649588793516159,
284
+ 1.2271846532821655,
285
+ -1.0722525119781494,
286
+ 0.676485538482666,
287
+ 1.0323690176010132,
288
+ 1.3637045621871948,
289
+ 0.5994641184806824
290
+ ],
291
+ "mean": [
292
+ 0.37771889567375183,
293
+ -0.33395493030548096,
294
+ -0.1662960797548294,
295
+ -2.1338820457458496,
296
+ 0.23977705836296082,
297
+ 0.4530261754989624,
298
+ 1.2130540609359741,
299
+ 0.11309224367141724
300
+ ],
301
+ "std": [
302
+ 0.2764613926410675,
303
+ 0.12893807888031006,
304
+ 0.3105952739715576,
305
+ 0.24212746322154952,
306
+ 0.1876864731311798,
307
+ 0.5513920187950134,
308
+ 0.31870579719543457,
309
+ 0.10958357155323029
310
+ ],
311
+ "q01": [
312
+ 0.01073786523193121,
313
+ -0.7025632262229919,
314
+ -0.3635534346103668,
315
+ -2.443631410598755,
316
+ -0.3837906980514526,
317
+ -0.49240782856941223,
318
+ 0.2070874124765396,
319
+ 0.05612814798951149
320
+ ],
321
+ "q99": [
322
+ 0.8195445752143835,
323
+ -0.22241522371768951,
324
+ 1.0507768392562866,
325
+ -1.2686021327972412,
326
+ 0.45405831933021545,
327
+ 0.9372382760047913,
328
+ 1.3637045621871948,
329
+ 0.5353437662124634
330
+ ]
331
+ },
332
+ "head": {
333
+ "min": [
334
+ 0.6951000094413757,
335
+ -0.3499999940395355
336
+ ],
337
+ "max": [
338
+ 0.6951000094413757,
339
+ 0.3499999940395355
340
+ ],
341
+ "mean": [
342
+ 0.6952548027038574,
343
+ 0.02028741128742695
344
+ ],
345
+ "std": [
346
+ 0.00015479458666108788,
347
+ 0.16366322338581085
348
+ ],
349
+ "q01": [
350
+ 0.6951000094413757,
351
+ -0.3499999940395355
352
+ ],
353
+ "q99": [
354
+ 0.6951000094413757,
355
+ 0.3499999940395355
356
+ ]
357
+ },
358
+ "lift": {
359
+ "min": [
360
+ -0.16487999260425568
361
+ ],
362
+ "max": [
363
+ -0.08263999968767166
364
+ ],
365
+ "mean": [
366
+ -0.11997371912002563
367
+ ],
368
+ "std": [
369
+ 0.022273359820246696
370
+ ],
371
+ "q01": [
372
+ -0.16436000168323517
373
+ ],
374
+ "q99": [
375
+ -0.08314000070095062
376
+ ]
377
+ },
378
+ "odometry": {
379
+ "min": [
380
+ -0.3333333432674408,
381
+ 0.0,
382
+ -0.4951171875
383
+ ],
384
+ "max": [
385
+ 0.330078125,
386
+ 0.3333333432674408,
387
+ 0.5
388
+ ],
389
+ "mean": [
390
+ 0.016142169013619423,
391
+ 0.005697546526789665,
392
+ -0.03227844089269638
393
+ ],
394
+ "std": [
395
+ 0.07646200805902481,
396
+ 0.042645420879125595,
397
+ 0.1212419867515564
398
+ ],
399
+ "q01": [
400
+ 0.0,
401
+ 0.0,
402
+ -0.4951171875
403
+ ],
404
+ "q99": [
405
+ 0.330078125,
406
+ 0.3268229067325592,
407
+ 0.0
408
+ ]
409
+ }
410
+ },
411
+ "relative_action": {}
412
+ }
413
+ }
experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d7",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Cosmos-Reason2-2B",
5
+ "backbone_model_type": "qwen",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 0,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": false,
15
+ "backbone_trainable_params_fp32": true,
16
+ "extra_augmentation_config": null,
17
+ "apply_sincos_state_encoding": false,
18
+ "use_percentiles": true,
19
+ "use_relative_action": false,
20
+ "max_state_dim": 132,
21
+ "max_action_dim": 132,
22
+ "action_horizon": 40,
23
+ "hidden_size": 1024,
24
+ "input_embedding_dim": 1536,
25
+ "state_history_length": 1,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.2,
52
+ "exclude_state": false,
53
+ "use_mean_std": false,
54
+ "max_num_embodiments": 32
55
+ }
experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbcaea5ee88f1e0f1465043920a2647c67e7de17d24adfd1c477742a6168edec
3
+ size 4986649584
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c61334594e211c7e84207f7f266450be4fef76b68894cd46574132495f5d1cf
3
+ size 4970792616
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67c9068698ddafb96135ebf50dbbe0383a8f99b2c1607e5e546d5065841ed0f5
3
+ size 2618758696
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor_config.json ADDED
@@ -0,0 +1,1148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d7Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "real_g1_relative_eef_relative_joints": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -20,
9
+ 0
10
+ ],
11
+ "modality_keys": [
12
+ "ego_view"
13
+ ],
14
+ "sin_cos_embedding_keys": null,
15
+ "mean_std_embedding_keys": null,
16
+ "action_configs": null
17
+ },
18
+ "state": {
19
+ "delta_indices": [
20
+ 0
21
+ ],
22
+ "modality_keys": [
23
+ "left_wrist_eef_9d",
24
+ "right_wrist_eef_9d",
25
+ "left_hand",
26
+ "right_hand",
27
+ "left_arm",
28
+ "right_arm",
29
+ "waist"
30
+ ],
31
+ "sin_cos_embedding_keys": null,
32
+ "mean_std_embedding_keys": null,
33
+ "action_configs": null
34
+ },
35
+ "action": {
36
+ "delta_indices": [
37
+ 0,
38
+ 1,
39
+ 2,
40
+ 3,
41
+ 4,
42
+ 5,
43
+ 6,
44
+ 7,
45
+ 8,
46
+ 9,
47
+ 10,
48
+ 11,
49
+ 12,
50
+ 13,
51
+ 14,
52
+ 15,
53
+ 16,
54
+ 17,
55
+ 18,
56
+ 19,
57
+ 20,
58
+ 21,
59
+ 22,
60
+ 23,
61
+ 24,
62
+ 25,
63
+ 26,
64
+ 27,
65
+ 28,
66
+ 29,
67
+ 30,
68
+ 31,
69
+ 32,
70
+ 33,
71
+ 34,
72
+ 35,
73
+ 36,
74
+ 37,
75
+ 38,
76
+ 39
77
+ ],
78
+ "modality_keys": [
79
+ "left_wrist_eef_9d",
80
+ "right_wrist_eef_9d",
81
+ "left_hand",
82
+ "right_hand",
83
+ "left_arm",
84
+ "right_arm",
85
+ "waist",
86
+ "base_height_command",
87
+ "navigate_command"
88
+ ],
89
+ "sin_cos_embedding_keys": null,
90
+ "mean_std_embedding_keys": null,
91
+ "action_configs": [
92
+ {
93
+ "rep": "RELATIVE",
94
+ "type": "EEF",
95
+ "format": "XYZ_ROT6D",
96
+ "state_key": "left_wrist_eef_9d"
97
+ },
98
+ {
99
+ "rep": "RELATIVE",
100
+ "type": "EEF",
101
+ "format": "XYZ_ROT6D",
102
+ "state_key": "right_wrist_eef_9d"
103
+ },
104
+ {
105
+ "rep": "ABSOLUTE",
106
+ "type": "NON_EEF",
107
+ "format": "DEFAULT",
108
+ "state_key": "left_hand"
109
+ },
110
+ {
111
+ "rep": "ABSOLUTE",
112
+ "type": "NON_EEF",
113
+ "format": "DEFAULT",
114
+ "state_key": "right_hand"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "left_arm"
121
+ },
122
+ {
123
+ "rep": "RELATIVE",
124
+ "type": "NON_EEF",
125
+ "format": "DEFAULT",
126
+ "state_key": "right_arm"
127
+ },
128
+ {
129
+ "rep": "ABSOLUTE",
130
+ "type": "NON_EEF",
131
+ "format": "DEFAULT",
132
+ "state_key": "waist"
133
+ },
134
+ {
135
+ "rep": "ABSOLUTE",
136
+ "type": "NON_EEF",
137
+ "format": "DEFAULT",
138
+ "state_key": "base_height_command"
139
+ },
140
+ {
141
+ "rep": "ABSOLUTE",
142
+ "type": "NON_EEF",
143
+ "format": "DEFAULT",
144
+ "state_key": "navigate_command"
145
+ }
146
+ ]
147
+ },
148
+ "language": {
149
+ "delta_indices": [
150
+ 0
151
+ ],
152
+ "modality_keys": [
153
+ "annotation.human.task_description"
154
+ ],
155
+ "sin_cos_embedding_keys": null,
156
+ "mean_std_embedding_keys": null,
157
+ "action_configs": null
158
+ }
159
+ },
160
+ "real_r1_pro_sharpa_relative_eef_mecka": {
161
+ "video": {
162
+ "delta_indices": [
163
+ -30,
164
+ 0
165
+ ],
166
+ "modality_keys": [
167
+ "ego_view_cropratio_res320x240_freq30"
168
+ ],
169
+ "sin_cos_embedding_keys": null,
170
+ "mean_std_embedding_keys": null,
171
+ "action_configs": null
172
+ },
173
+ "state": {
174
+ "delta_indices": [
175
+ 0
176
+ ],
177
+ "modality_keys": [
178
+ "left_wrist_eef",
179
+ "right_wrist_eef",
180
+ "left_hand_joints",
181
+ "right_hand_joints"
182
+ ],
183
+ "sin_cos_embedding_keys": null,
184
+ "mean_std_embedding_keys": null,
185
+ "action_configs": null
186
+ },
187
+ "action": {
188
+ "delta_indices": [
189
+ 0,
190
+ 1,
191
+ 2,
192
+ 3,
193
+ 4,
194
+ 5,
195
+ 6,
196
+ 7,
197
+ 8,
198
+ 9,
199
+ 10,
200
+ 11,
201
+ 12,
202
+ 13,
203
+ 14,
204
+ 15,
205
+ 16,
206
+ 17,
207
+ 18,
208
+ 19,
209
+ 20,
210
+ 21,
211
+ 22,
212
+ 23,
213
+ 24,
214
+ 25,
215
+ 26,
216
+ 27,
217
+ 28,
218
+ 29,
219
+ 30,
220
+ 31,
221
+ 32,
222
+ 33,
223
+ 34,
224
+ 35,
225
+ 36,
226
+ 37,
227
+ 38,
228
+ 39
229
+ ],
230
+ "modality_keys": [
231
+ "left_wrist_eef",
232
+ "right_wrist_eef",
233
+ "left_hand_joints",
234
+ "right_hand_joints"
235
+ ],
236
+ "sin_cos_embedding_keys": null,
237
+ "mean_std_embedding_keys": null,
238
+ "action_configs": [
239
+ {
240
+ "rep": "RELATIVE",
241
+ "type": "EEF",
242
+ "format": "XYZ_ROT6D",
243
+ "state_key": "left_wrist_eef"
244
+ },
245
+ {
246
+ "rep": "RELATIVE",
247
+ "type": "EEF",
248
+ "format": "XYZ_ROT6D",
249
+ "state_key": "right_wrist_eef"
250
+ },
251
+ {
252
+ "rep": "ABSOLUTE",
253
+ "type": "NON_EEF",
254
+ "format": "DEFAULT",
255
+ "state_key": "left_hand_joints"
256
+ },
257
+ {
258
+ "rep": "ABSOLUTE",
259
+ "type": "NON_EEF",
260
+ "format": "DEFAULT",
261
+ "state_key": "right_hand_joints"
262
+ }
263
+ ]
264
+ },
265
+ "language": {
266
+ "delta_indices": [
267
+ 0
268
+ ],
269
+ "modality_keys": [
270
+ "annotation.human.coarse_action"
271
+ ],
272
+ "sin_cos_embedding_keys": null,
273
+ "mean_std_embedding_keys": null,
274
+ "action_configs": null
275
+ }
276
+ },
277
+ "real_r1_pro_sharpa_relative_eef_human": {
278
+ "video": {
279
+ "delta_indices": [
280
+ -20,
281
+ 0
282
+ ],
283
+ "modality_keys": [
284
+ "ego_view_res320x240_freq20",
285
+ "left_wrist_view_res320x240_freq20",
286
+ "right_wrist_view_res320x240_freq20"
287
+ ],
288
+ "sin_cos_embedding_keys": null,
289
+ "mean_std_embedding_keys": null,
290
+ "action_configs": null
291
+ },
292
+ "state": {
293
+ "delta_indices": [
294
+ 0
295
+ ],
296
+ "modality_keys": [
297
+ "left_wrist_eef",
298
+ "right_wrist_eef",
299
+ "left_hand_joints",
300
+ "right_hand_joints"
301
+ ],
302
+ "sin_cos_embedding_keys": null,
303
+ "mean_std_embedding_keys": null,
304
+ "action_configs": null
305
+ },
306
+ "action": {
307
+ "delta_indices": [
308
+ 0,
309
+ 1,
310
+ 2,
311
+ 3,
312
+ 4,
313
+ 5,
314
+ 6,
315
+ 7,
316
+ 8,
317
+ 9,
318
+ 10,
319
+ 11,
320
+ 12,
321
+ 13,
322
+ 14,
323
+ 15,
324
+ 16,
325
+ 17,
326
+ 18,
327
+ 19,
328
+ 20,
329
+ 21,
330
+ 22,
331
+ 23,
332
+ 24,
333
+ 25,
334
+ 26,
335
+ 27,
336
+ 28,
337
+ 29,
338
+ 30,
339
+ 31,
340
+ 32,
341
+ 33,
342
+ 34,
343
+ 35,
344
+ 36,
345
+ 37,
346
+ 38,
347
+ 39
348
+ ],
349
+ "modality_keys": [
350
+ "left_wrist_eef",
351
+ "right_wrist_eef",
352
+ "left_hand_joints",
353
+ "right_hand_joints"
354
+ ],
355
+ "sin_cos_embedding_keys": null,
356
+ "mean_std_embedding_keys": null,
357
+ "action_configs": [
358
+ {
359
+ "rep": "RELATIVE",
360
+ "type": "EEF",
361
+ "format": "XYZ_ROT6D",
362
+ "state_key": "left_wrist_eef"
363
+ },
364
+ {
365
+ "rep": "RELATIVE",
366
+ "type": "EEF",
367
+ "format": "XYZ_ROT6D",
368
+ "state_key": "right_wrist_eef"
369
+ },
370
+ {
371
+ "rep": "ABSOLUTE",
372
+ "type": "NON_EEF",
373
+ "format": "DEFAULT",
374
+ "state_key": "left_hand_joints"
375
+ },
376
+ {
377
+ "rep": "ABSOLUTE",
378
+ "type": "NON_EEF",
379
+ "format": "DEFAULT",
380
+ "state_key": "right_hand_joints"
381
+ }
382
+ ]
383
+ },
384
+ "language": {
385
+ "delta_indices": [
386
+ 0
387
+ ],
388
+ "modality_keys": [
389
+ "annotation.human.coarse_action"
390
+ ],
391
+ "sin_cos_embedding_keys": null,
392
+ "mean_std_embedding_keys": null,
393
+ "action_configs": null
394
+ }
395
+ },
396
+ "real_r1_pro_sharpa_relative_eef": {
397
+ "video": {
398
+ "delta_indices": [
399
+ -20,
400
+ 0
401
+ ],
402
+ "modality_keys": [
403
+ "ego_view_res320x240_freq20",
404
+ "left_wrist_view_res320x240_freq20",
405
+ "right_wrist_view_res320x240_freq20"
406
+ ],
407
+ "sin_cos_embedding_keys": null,
408
+ "mean_std_embedding_keys": null,
409
+ "action_configs": null
410
+ },
411
+ "state": {
412
+ "delta_indices": [
413
+ 0
414
+ ],
415
+ "modality_keys": [
416
+ "left_wrist_eef",
417
+ "right_wrist_eef",
418
+ "left_hand_joints",
419
+ "right_hand_joints"
420
+ ],
421
+ "sin_cos_embedding_keys": null,
422
+ "mean_std_embedding_keys": null,
423
+ "action_configs": null
424
+ },
425
+ "action": {
426
+ "delta_indices": [
427
+ 0,
428
+ 1,
429
+ 2,
430
+ 3,
431
+ 4,
432
+ 5,
433
+ 6,
434
+ 7,
435
+ 8,
436
+ 9,
437
+ 10,
438
+ 11,
439
+ 12,
440
+ 13,
441
+ 14,
442
+ 15,
443
+ 16,
444
+ 17,
445
+ 18,
446
+ 19,
447
+ 20,
448
+ 21,
449
+ 22,
450
+ 23,
451
+ 24,
452
+ 25,
453
+ 26,
454
+ 27,
455
+ 28,
456
+ 29,
457
+ 30,
458
+ 31,
459
+ 32,
460
+ 33,
461
+ 34,
462
+ 35,
463
+ 36,
464
+ 37,
465
+ 38,
466
+ 39
467
+ ],
468
+ "modality_keys": [
469
+ "left_wrist_eef",
470
+ "right_wrist_eef",
471
+ "left_hand_joints",
472
+ "right_hand_joints"
473
+ ],
474
+ "sin_cos_embedding_keys": null,
475
+ "mean_std_embedding_keys": null,
476
+ "action_configs": [
477
+ {
478
+ "rep": "RELATIVE",
479
+ "type": "EEF",
480
+ "format": "XYZ_ROT6D",
481
+ "state_key": "left_wrist_eef"
482
+ },
483
+ {
484
+ "rep": "RELATIVE",
485
+ "type": "EEF",
486
+ "format": "XYZ_ROT6D",
487
+ "state_key": "right_wrist_eef"
488
+ },
489
+ {
490
+ "rep": "ABSOLUTE",
491
+ "type": "NON_EEF",
492
+ "format": "DEFAULT",
493
+ "state_key": "left_hand_joints"
494
+ },
495
+ {
496
+ "rep": "ABSOLUTE",
497
+ "type": "NON_EEF",
498
+ "format": "DEFAULT",
499
+ "state_key": "right_hand_joints"
500
+ }
501
+ ]
502
+ },
503
+ "language": {
504
+ "delta_indices": [
505
+ 0
506
+ ],
507
+ "modality_keys": [
508
+ "annotation.human.coarse_action"
509
+ ],
510
+ "sin_cos_embedding_keys": null,
511
+ "mean_std_embedding_keys": null,
512
+ "action_configs": null
513
+ }
514
+ },
515
+ "xdof_relative_eef_relative_joint": {
516
+ "video": {
517
+ "delta_indices": [
518
+ -30,
519
+ 0
520
+ ],
521
+ "modality_keys": [
522
+ "top_camera-images-rgb_320_240",
523
+ "left_camera-images-rgb_320_240",
524
+ "right_camera-images-rgb_320_240"
525
+ ],
526
+ "sin_cos_embedding_keys": null,
527
+ "mean_std_embedding_keys": null,
528
+ "action_configs": null
529
+ },
530
+ "state": {
531
+ "delta_indices": [
532
+ 0
533
+ ],
534
+ "modality_keys": [
535
+ "left_wrist_eef",
536
+ "right_wrist_eef",
537
+ "left_gripper_pos",
538
+ "right_gripper_pos",
539
+ "left_joint_pos",
540
+ "right_joint_pos"
541
+ ],
542
+ "sin_cos_embedding_keys": null,
543
+ "mean_std_embedding_keys": null,
544
+ "action_configs": null
545
+ },
546
+ "action": {
547
+ "delta_indices": [
548
+ 0,
549
+ 1,
550
+ 2,
551
+ 3,
552
+ 4,
553
+ 5,
554
+ 6,
555
+ 7,
556
+ 8,
557
+ 9,
558
+ 10,
559
+ 11,
560
+ 12,
561
+ 13,
562
+ 14,
563
+ 15,
564
+ 16,
565
+ 17,
566
+ 18,
567
+ 19,
568
+ 20,
569
+ 21,
570
+ 22,
571
+ 23,
572
+ 24,
573
+ 25,
574
+ 26,
575
+ 27,
576
+ 28,
577
+ 29,
578
+ 30,
579
+ 31,
580
+ 32,
581
+ 33,
582
+ 34,
583
+ 35,
584
+ 36,
585
+ 37,
586
+ 38,
587
+ 39
588
+ ],
589
+ "modality_keys": [
590
+ "left_wrist_eef",
591
+ "right_wrist_eef",
592
+ "left_gripper_pos",
593
+ "right_gripper_pos",
594
+ "left_joint_pos",
595
+ "right_joint_pos"
596
+ ],
597
+ "sin_cos_embedding_keys": null,
598
+ "mean_std_embedding_keys": null,
599
+ "action_configs": [
600
+ {
601
+ "rep": "RELATIVE",
602
+ "type": "EEF",
603
+ "format": "XYZ_ROT6D",
604
+ "state_key": "left_wrist_eef"
605
+ },
606
+ {
607
+ "rep": "RELATIVE",
608
+ "type": "EEF",
609
+ "format": "XYZ_ROT6D",
610
+ "state_key": "right_wrist_eef"
611
+ },
612
+ {
613
+ "rep": "ABSOLUTE",
614
+ "type": "NON_EEF",
615
+ "format": "DEFAULT",
616
+ "state_key": "left_gripper_pos"
617
+ },
618
+ {
619
+ "rep": "ABSOLUTE",
620
+ "type": "NON_EEF",
621
+ "format": "DEFAULT",
622
+ "state_key": "right_gripper_pos"
623
+ },
624
+ {
625
+ "rep": "RELATIVE",
626
+ "type": "NON_EEF",
627
+ "format": "DEFAULT",
628
+ "state_key": "left_joint_pos"
629
+ },
630
+ {
631
+ "rep": "RELATIVE",
632
+ "type": "NON_EEF",
633
+ "format": "DEFAULT",
634
+ "state_key": "right_joint_pos"
635
+ }
636
+ ]
637
+ },
638
+ "language": {
639
+ "delta_indices": [
640
+ 0
641
+ ],
642
+ "modality_keys": [
643
+ "annotation.task"
644
+ ],
645
+ "sin_cos_embedding_keys": null,
646
+ "mean_std_embedding_keys": null,
647
+ "action_configs": null
648
+ }
649
+ },
650
+ "real_r1_pro_sharpa_relative_eef_maxinsights": {
651
+ "video": {
652
+ "delta_indices": [
653
+ -30,
654
+ 0
655
+ ],
656
+ "modality_keys": [
657
+ "ego_view_cropratio_res320x240_freq30"
658
+ ],
659
+ "sin_cos_embedding_keys": null,
660
+ "mean_std_embedding_keys": null,
661
+ "action_configs": null
662
+ },
663
+ "state": {
664
+ "delta_indices": [
665
+ 0
666
+ ],
667
+ "modality_keys": [
668
+ "left_wrist_eef",
669
+ "right_wrist_eef",
670
+ "left_hand_joints",
671
+ "right_hand_joints"
672
+ ],
673
+ "sin_cos_embedding_keys": null,
674
+ "mean_std_embedding_keys": null,
675
+ "action_configs": null
676
+ },
677
+ "action": {
678
+ "delta_indices": [
679
+ 0,
680
+ 1,
681
+ 2,
682
+ 3,
683
+ 4,
684
+ 5,
685
+ 6,
686
+ 7,
687
+ 8,
688
+ 9,
689
+ 10,
690
+ 11,
691
+ 12,
692
+ 13,
693
+ 14,
694
+ 15,
695
+ 16,
696
+ 17,
697
+ 18,
698
+ 19,
699
+ 20,
700
+ 21,
701
+ 22,
702
+ 23,
703
+ 24,
704
+ 25,
705
+ 26,
706
+ 27,
707
+ 28,
708
+ 29,
709
+ 30,
710
+ 31,
711
+ 32,
712
+ 33,
713
+ 34,
714
+ 35,
715
+ 36,
716
+ 37,
717
+ 38,
718
+ 39
719
+ ],
720
+ "modality_keys": [
721
+ "left_wrist_eef",
722
+ "right_wrist_eef",
723
+ "left_hand_joints",
724
+ "right_hand_joints"
725
+ ],
726
+ "sin_cos_embedding_keys": null,
727
+ "mean_std_embedding_keys": null,
728
+ "action_configs": [
729
+ {
730
+ "rep": "RELATIVE",
731
+ "type": "EEF",
732
+ "format": "XYZ_ROT6D",
733
+ "state_key": "left_wrist_eef"
734
+ },
735
+ {
736
+ "rep": "RELATIVE",
737
+ "type": "EEF",
738
+ "format": "XYZ_ROT6D",
739
+ "state_key": "right_wrist_eef"
740
+ },
741
+ {
742
+ "rep": "ABSOLUTE",
743
+ "type": "NON_EEF",
744
+ "format": "DEFAULT",
745
+ "state_key": "left_hand_joints"
746
+ },
747
+ {
748
+ "rep": "ABSOLUTE",
749
+ "type": "NON_EEF",
750
+ "format": "DEFAULT",
751
+ "state_key": "right_hand_joints"
752
+ }
753
+ ]
754
+ },
755
+ "language": {
756
+ "delta_indices": [
757
+ 0
758
+ ],
759
+ "modality_keys": [
760
+ "annotation.human.coarse_action"
761
+ ],
762
+ "sin_cos_embedding_keys": null,
763
+ "mean_std_embedding_keys": null,
764
+ "action_configs": null
765
+ }
766
+ },
767
+ "xdof_relative_eef_relative_joint_subtask": {
768
+ "video": {
769
+ "delta_indices": [
770
+ -30,
771
+ 0
772
+ ],
773
+ "modality_keys": [
774
+ "top_camera-images-rgb_320_240",
775
+ "left_camera-images-rgb_320_240",
776
+ "right_camera-images-rgb_320_240"
777
+ ],
778
+ "sin_cos_embedding_keys": null,
779
+ "mean_std_embedding_keys": null,
780
+ "action_configs": null
781
+ },
782
+ "state": {
783
+ "delta_indices": [
784
+ 0
785
+ ],
786
+ "modality_keys": [
787
+ "left_wrist_eef",
788
+ "right_wrist_eef",
789
+ "left_gripper_pos",
790
+ "right_gripper_pos",
791
+ "left_joint_pos",
792
+ "right_joint_pos"
793
+ ],
794
+ "sin_cos_embedding_keys": null,
795
+ "mean_std_embedding_keys": null,
796
+ "action_configs": null
797
+ },
798
+ "action": {
799
+ "delta_indices": [
800
+ 0,
801
+ 1,
802
+ 2,
803
+ 3,
804
+ 4,
805
+ 5,
806
+ 6,
807
+ 7,
808
+ 8,
809
+ 9,
810
+ 10,
811
+ 11,
812
+ 12,
813
+ 13,
814
+ 14,
815
+ 15,
816
+ 16,
817
+ 17,
818
+ 18,
819
+ 19,
820
+ 20,
821
+ 21,
822
+ 22,
823
+ 23,
824
+ 24,
825
+ 25,
826
+ 26,
827
+ 27,
828
+ 28,
829
+ 29,
830
+ 30,
831
+ 31,
832
+ 32,
833
+ 33,
834
+ 34,
835
+ 35,
836
+ 36,
837
+ 37,
838
+ 38,
839
+ 39
840
+ ],
841
+ "modality_keys": [
842
+ "left_wrist_eef",
843
+ "right_wrist_eef",
844
+ "left_gripper_pos",
845
+ "right_gripper_pos",
846
+ "left_joint_pos",
847
+ "right_joint_pos"
848
+ ],
849
+ "sin_cos_embedding_keys": null,
850
+ "mean_std_embedding_keys": null,
851
+ "action_configs": [
852
+ {
853
+ "rep": "RELATIVE",
854
+ "type": "EEF",
855
+ "format": "XYZ_ROT6D",
856
+ "state_key": "left_wrist_eef"
857
+ },
858
+ {
859
+ "rep": "RELATIVE",
860
+ "type": "EEF",
861
+ "format": "XYZ_ROT6D",
862
+ "state_key": "right_wrist_eef"
863
+ },
864
+ {
865
+ "rep": "ABSOLUTE",
866
+ "type": "NON_EEF",
867
+ "format": "DEFAULT",
868
+ "state_key": "left_gripper_pos"
869
+ },
870
+ {
871
+ "rep": "ABSOLUTE",
872
+ "type": "NON_EEF",
873
+ "format": "DEFAULT",
874
+ "state_key": "right_gripper_pos"
875
+ },
876
+ {
877
+ "rep": "RELATIVE",
878
+ "type": "NON_EEF",
879
+ "format": "DEFAULT",
880
+ "state_key": "left_joint_pos"
881
+ },
882
+ {
883
+ "rep": "RELATIVE",
884
+ "type": "NON_EEF",
885
+ "format": "DEFAULT",
886
+ "state_key": "right_joint_pos"
887
+ }
888
+ ]
889
+ },
890
+ "language": {
891
+ "delta_indices": [
892
+ 0
893
+ ],
894
+ "modality_keys": [
895
+ "annotation.sub_task"
896
+ ],
897
+ "sin_cos_embedding_keys": null,
898
+ "mean_std_embedding_keys": null,
899
+ "action_configs": null
900
+ }
901
+ },
902
+ "oxe_droid_relative_eef_relative_joint": {
903
+ "video": {
904
+ "delta_indices": [
905
+ -15,
906
+ 0
907
+ ],
908
+ "modality_keys": [
909
+ "exterior_image_1_left",
910
+ "wrist_image_left"
911
+ ],
912
+ "sin_cos_embedding_keys": null,
913
+ "mean_std_embedding_keys": null,
914
+ "action_configs": null
915
+ },
916
+ "state": {
917
+ "delta_indices": [
918
+ 0
919
+ ],
920
+ "modality_keys": [
921
+ "eef_9d",
922
+ "gripper_position",
923
+ "joint_position"
924
+ ],
925
+ "sin_cos_embedding_keys": null,
926
+ "mean_std_embedding_keys": null,
927
+ "action_configs": null
928
+ },
929
+ "action": {
930
+ "delta_indices": [
931
+ 0,
932
+ 1,
933
+ 2,
934
+ 3,
935
+ 4,
936
+ 5,
937
+ 6,
938
+ 7,
939
+ 8,
940
+ 9,
941
+ 10,
942
+ 11,
943
+ 12,
944
+ 13,
945
+ 14,
946
+ 15,
947
+ 16,
948
+ 17,
949
+ 18,
950
+ 19,
951
+ 20,
952
+ 21,
953
+ 22,
954
+ 23,
955
+ 24,
956
+ 25,
957
+ 26,
958
+ 27,
959
+ 28,
960
+ 29,
961
+ 30,
962
+ 31,
963
+ 32,
964
+ 33,
965
+ 34,
966
+ 35,
967
+ 36,
968
+ 37,
969
+ 38,
970
+ 39
971
+ ],
972
+ "modality_keys": [
973
+ "eef_9d",
974
+ "gripper_position",
975
+ "joint_position"
976
+ ],
977
+ "sin_cos_embedding_keys": null,
978
+ "mean_std_embedding_keys": null,
979
+ "action_configs": [
980
+ {
981
+ "rep": "RELATIVE",
982
+ "type": "EEF",
983
+ "format": "XYZ_ROT6D",
984
+ "state_key": "eef_9d"
985
+ },
986
+ {
987
+ "rep": "ABSOLUTE",
988
+ "type": "NON_EEF",
989
+ "format": "DEFAULT",
990
+ "state_key": "gripper_position"
991
+ },
992
+ {
993
+ "rep": "RELATIVE",
994
+ "type": "NON_EEF",
995
+ "format": "DEFAULT",
996
+ "state_key": "joint_position"
997
+ }
998
+ ]
999
+ },
1000
+ "language": {
1001
+ "delta_indices": [
1002
+ 0
1003
+ ],
1004
+ "modality_keys": [
1005
+ "annotation.language.language_instruction"
1006
+ ],
1007
+ "sin_cos_embedding_keys": null,
1008
+ "mean_std_embedding_keys": null,
1009
+ "action_configs": null
1010
+ }
1011
+ },
1012
+ "new_embodiment": {
1013
+ "video": {
1014
+ "delta_indices": [
1015
+ 0
1016
+ ],
1017
+ "modality_keys": [
1018
+ "cam_left_head",
1019
+ "cam_left_wrist",
1020
+ "cam_right_wrist"
1021
+ ],
1022
+ "sin_cos_embedding_keys": null,
1023
+ "mean_std_embedding_keys": null,
1024
+ "action_configs": null
1025
+ },
1026
+ "state": {
1027
+ "delta_indices": [
1028
+ 0
1029
+ ],
1030
+ "modality_keys": [
1031
+ "arm_left",
1032
+ "arm_right",
1033
+ "head",
1034
+ "lift",
1035
+ "odometry"
1036
+ ],
1037
+ "sin_cos_embedding_keys": null,
1038
+ "mean_std_embedding_keys": null,
1039
+ "action_configs": null
1040
+ },
1041
+ "action": {
1042
+ "delta_indices": [
1043
+ 0,
1044
+ 1,
1045
+ 2,
1046
+ 3,
1047
+ 4,
1048
+ 5,
1049
+ 6,
1050
+ 7,
1051
+ 8,
1052
+ 9,
1053
+ 10,
1054
+ 11,
1055
+ 12,
1056
+ 13,
1057
+ 14,
1058
+ 15
1059
+ ],
1060
+ "modality_keys": [
1061
+ "arm_left",
1062
+ "arm_right",
1063
+ "head",
1064
+ "lift",
1065
+ "odometry"
1066
+ ],
1067
+ "sin_cos_embedding_keys": null,
1068
+ "mean_std_embedding_keys": null,
1069
+ "action_configs": [
1070
+ {
1071
+ "rep": "ABSOLUTE",
1072
+ "type": "NON_EEF",
1073
+ "format": "DEFAULT",
1074
+ "state_key": null
1075
+ },
1076
+ {
1077
+ "rep": "ABSOLUTE",
1078
+ "type": "NON_EEF",
1079
+ "format": "DEFAULT",
1080
+ "state_key": null
1081
+ },
1082
+ {
1083
+ "rep": "ABSOLUTE",
1084
+ "type": "NON_EEF",
1085
+ "format": "DEFAULT",
1086
+ "state_key": null
1087
+ },
1088
+ {
1089
+ "rep": "ABSOLUTE",
1090
+ "type": "NON_EEF",
1091
+ "format": "DEFAULT",
1092
+ "state_key": null
1093
+ },
1094
+ {
1095
+ "rep": "ABSOLUTE",
1096
+ "type": "NON_EEF",
1097
+ "format": "DEFAULT",
1098
+ "state_key": null
1099
+ }
1100
+ ]
1101
+ },
1102
+ "language": {
1103
+ "delta_indices": [
1104
+ 0
1105
+ ],
1106
+ "modality_keys": [
1107
+ "annotation.human.primitive_instruction"
1108
+ ],
1109
+ "sin_cos_embedding_keys": null,
1110
+ "mean_std_embedding_keys": null,
1111
+ "action_configs": null
1112
+ }
1113
+ }
1114
+ },
1115
+ "image_crop_size": [
1116
+ 230,
1117
+ 230
1118
+ ],
1119
+ "image_target_size": [
1120
+ 256,
1121
+ 256
1122
+ ],
1123
+ "use_albumentations": true,
1124
+ "random_rotation_angle": 0,
1125
+ "color_jitter_params": {
1126
+ "brightness": 0.3,
1127
+ "contrast": 0.4,
1128
+ "saturation": 0.5,
1129
+ "hue": 0.08
1130
+ },
1131
+ "shortest_image_edge": 256,
1132
+ "crop_fraction": 0.95,
1133
+ "letter_box_transform": false,
1134
+ "model_name": "nvidia/Cosmos-Reason2-2B",
1135
+ "model_type": "qwen",
1136
+ "formalize_language": true,
1137
+ "max_state_dim": 132,
1138
+ "max_action_dim": 132,
1139
+ "max_action_horizon": 40,
1140
+ "use_percentiles": true,
1141
+ "use_mean_std": false,
1142
+ "clip_outliers": true,
1143
+ "apply_sincos_state_encoding": false,
1144
+ "use_relative_action": true,
1145
+ "exclude_state": false,
1146
+ "state_dropout_prob": 0.2
1147
+ }
1148
+ }
statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b19b514e5a7906c7a4a847ef7350601e3e73e7f2aa9c1c487bd28a4bc1b5a0ec
3
+ size 5841
wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d7", "run_id": "ffw_sg2_0504_0444"}