Robotics
LeRobot
Safetensors
act
NLTuan committed on
Commit
ad41278
·
verified ·
1 Parent(s): 3384fe3

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +4 -5
  2. config.json +28 -66
  3. model.safetensors +2 -2
  4. train_config.json +34 -81
README.md CHANGED
@@ -1,22 +1,21 @@
1
  ---
2
- base_model: lerobot/smolvla_base
3
  datasets: ThavT/red_block_in_tape
4
  library_name: lerobot
5
  license: apache-2.0
6
- model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
 
9
  - lerobot
10
  - robotics
11
- - smolvla
12
  ---
13
 
14
- # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
- [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
 
2
  datasets: ThavT/red_block_in_tape
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: act
6
  pipeline_tag: robotics
7
  tags:
8
+ - act
9
  - lerobot
10
  - robotics
 
11
  ---
12
 
13
+ # Model Card for act
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ [Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "type": "smolvla",
3
  "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
@@ -8,31 +8,15 @@
8
  6
9
  ]
10
  },
11
- "observation.images.camera1": {
12
  "type": "VISUAL",
13
  "shape": [
14
  3,
15
- 256,
16
- 256
17
- ]
18
- },
19
- "observation.images.camera2": {
20
- "type": "VISUAL",
21
- "shape": [
22
- 3,
23
- 256,
24
- 256
25
- ]
26
- },
27
- "observation.images.camera3": {
28
- "type": "VISUAL",
29
- "shape": [
30
- 3,
31
- 256,
32
- 256
33
  ]
34
  },
35
- "observation.images.empty_camera_0": {
36
  "type": "VISUAL",
37
  "shape": [
38
  3,
@@ -57,53 +41,31 @@
57
  "private": null,
58
  "tags": null,
59
  "license": null,
60
- "pretrained_path": "lerobot/smolvla_base",
61
- "chunk_size": 50,
62
- "n_action_steps": 50,
63
  "normalization_mapping": {
64
- "VISUAL": "IDENTITY",
65
  "STATE": "MEAN_STD",
66
  "ACTION": "MEAN_STD"
67
  },
68
- "max_state_dim": 32,
69
- "max_action_dim": 32,
70
- "resize_imgs_with_padding": [
71
- 512,
72
- 512
73
- ],
74
- "empty_cameras": 1,
75
- "adapt_to_pi_aloha": false,
76
- "use_delta_joint_actions_aloha": false,
77
- "tokenizer_max_length": 48,
78
- "num_steps": 10,
79
- "use_cache": true,
80
- "freeze_vision_encoder": true,
81
- "train_expert_only": true,
82
- "train_state_proj": true,
83
- "optimizer_lr": 0.0001,
84
- "optimizer_betas": [
85
- 0.9,
86
- 0.95
87
- ],
88
- "optimizer_eps": 1e-08,
89
- "optimizer_weight_decay": 1e-10,
90
- "optimizer_grad_clip_norm": 10.0,
91
- "scheduler_warmup_steps": 1000,
92
- "scheduler_decay_steps": 30000,
93
- "scheduler_decay_lr": 2.5e-06,
94
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
95
- "load_vlm_weights": true,
96
- "add_image_special_tokens": false,
97
- "attention_mode": "cross_attn",
98
- "prefix_length": 0,
99
- "pad_language_to": "max_length",
100
- "num_expert_layers": 0,
101
- "num_vlm_layers": 16,
102
- "self_attn_every_n_layers": 2,
103
- "expert_width_multiplier": 0.75,
104
- "min_period": 0.004,
105
- "max_period": 4.0,
106
- "rtc_config": null,
107
- "compile_model": false,
108
- "compile_mode": "max-autotune"
109
  }
 
1
  {
2
+ "type": "act",
3
  "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
 
8
  6
9
  ]
10
  },
11
+ "observation.images.cam_0": {
12
  "type": "VISUAL",
13
  "shape": [
14
  3,
15
+ 480,
16
+ 640
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ]
18
  },
19
+ "observation.images.cam_1": {
20
  "type": "VISUAL",
21
  "shape": [
22
  3,
 
41
  "private": null,
42
  "tags": null,
43
  "license": null,
44
+ "pretrained_path": null,
45
+ "chunk_size": 100,
46
+ "n_action_steps": 100,
47
  "normalization_mapping": {
48
+ "VISUAL": "MEAN_STD",
49
  "STATE": "MEAN_STD",
50
  "ACTION": "MEAN_STD"
51
  },
52
+ "vision_backbone": "resnet18",
53
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
54
+ "replace_final_stride_with_dilation": false,
55
+ "pre_norm": false,
56
+ "dim_model": 512,
57
+ "n_heads": 8,
58
+ "dim_feedforward": 3200,
59
+ "feedforward_activation": "relu",
60
+ "n_encoder_layers": 4,
61
+ "n_decoder_layers": 1,
62
+ "use_vae": true,
63
+ "latent_dim": 32,
64
+ "n_vae_encoder_layers": 4,
65
+ "temporal_ensemble_coeff": null,
66
+ "dropout": 0.1,
67
+ "kl_weight": 10.0,
68
+ "optimizer_lr": 1e-05,
69
+ "optimizer_weight_decay": 0.0001,
70
+ "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3f26002ff1b0f42cc9392fa6604471fdbefcc425694617556f51d2e75904bed
3
- size 906712520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82b9741edd0f5c3b5ed1eb6e2c68792a435fe3dc21b3a69293ce158ed9dff73b
3
+ size 206699736
train_config.json CHANGED
@@ -81,7 +81,7 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "smolvla",
85
  "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
@@ -90,31 +90,15 @@
90
  6
91
  ]
92
  },
93
- "observation.images.camera1": {
94
  "type": "VISUAL",
95
  "shape": [
96
  3,
97
- 256,
98
- 256
99
- ]
100
- },
101
- "observation.images.camera2": {
102
- "type": "VISUAL",
103
- "shape": [
104
- 3,
105
- 256,
106
- 256
107
- ]
108
- },
109
- "observation.images.camera3": {
110
- "type": "VISUAL",
111
- "shape": [
112
- 3,
113
- 256,
114
- 256
115
  ]
116
  },
117
- "observation.images.empty_camera_0": {
118
  "type": "VISUAL",
119
  "shape": [
120
  3,
@@ -139,55 +123,33 @@
139
  "private": null,
140
  "tags": null,
141
  "license": null,
142
- "pretrained_path": "lerobot/smolvla_base",
143
- "chunk_size": 50,
144
- "n_action_steps": 50,
145
  "normalization_mapping": {
146
- "VISUAL": "IDENTITY",
147
  "STATE": "MEAN_STD",
148
  "ACTION": "MEAN_STD"
149
  },
150
- "max_state_dim": 32,
151
- "max_action_dim": 32,
152
- "resize_imgs_with_padding": [
153
- 512,
154
- 512
155
- ],
156
- "empty_cameras": 1,
157
- "adapt_to_pi_aloha": false,
158
- "use_delta_joint_actions_aloha": false,
159
- "tokenizer_max_length": 48,
160
- "num_steps": 10,
161
- "use_cache": true,
162
- "freeze_vision_encoder": true,
163
- "train_expert_only": true,
164
- "train_state_proj": true,
165
- "optimizer_lr": 0.0001,
166
- "optimizer_betas": [
167
- 0.9,
168
- 0.95
169
- ],
170
- "optimizer_eps": 1e-08,
171
- "optimizer_weight_decay": 1e-10,
172
- "optimizer_grad_clip_norm": 10.0,
173
- "scheduler_warmup_steps": 1000,
174
- "scheduler_decay_steps": 30000,
175
- "scheduler_decay_lr": 2.5e-06,
176
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
177
- "load_vlm_weights": true,
178
- "add_image_special_tokens": false,
179
- "attention_mode": "cross_attn",
180
- "prefix_length": 0,
181
- "pad_language_to": "max_length",
182
- "num_expert_layers": 0,
183
- "num_vlm_layers": 16,
184
- "self_attn_every_n_layers": 2,
185
- "expert_width_multiplier": 0.75,
186
- "min_period": 0.004,
187
- "max_period": 4.0,
188
- "rtc_config": null,
189
- "compile_model": false,
190
- "compile_mode": "max-autotune"
191
  },
192
  "output_dir": "outputs/train/red_block_in_tape",
193
  "job_name": "smolvla_red_block_training",
@@ -205,22 +167,16 @@
205
  "use_policy_training_preset": true,
206
  "optimizer": {
207
  "type": "adamw",
208
- "lr": 0.0001,
209
- "weight_decay": 1e-10,
210
  "grad_clip_norm": 10.0,
211
  "betas": [
212
  0.9,
213
- 0.95
214
  ],
215
  "eps": 1e-08
216
  },
217
- "scheduler": {
218
- "type": "cosine_decay_with_warmup",
219
- "num_warmup_steps": 1000,
220
- "num_decay_steps": 30000,
221
- "peak_lr": 0.0001,
222
- "decay_lr": 2.5e-06
223
- },
224
  "eval": {
225
  "n_episodes": 50,
226
  "batch_size": 50,
@@ -232,7 +188,7 @@
232
  "project": "lerobot_red_block",
233
  "entity": null,
234
  "notes": null,
235
- "run_id": "tl6jkmrr",
236
  "mode": null,
237
  "add_tags": true
238
  },
@@ -242,9 +198,6 @@
242
  "rabc_kappa": 0.01,
243
  "rabc_epsilon": 1e-06,
244
  "rabc_head_mode": "sparse",
245
- "rename_map": {
246
- "observation.images.cam_0": "observation.images.camera1",
247
- "observation.images.cam_1": "observation.images.camera2"
248
- },
249
  "checkpoint_path": null
250
  }
 
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "act",
85
  "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
 
90
  6
91
  ]
92
  },
93
+ "observation.images.cam_0": {
94
  "type": "VISUAL",
95
  "shape": [
96
  3,
97
+ 480,
98
+ 640
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  ]
100
  },
101
+ "observation.images.cam_1": {
102
  "type": "VISUAL",
103
  "shape": [
104
  3,
 
123
  "private": null,
124
  "tags": null,
125
  "license": null,
126
+ "pretrained_path": null,
127
+ "chunk_size": 100,
128
+ "n_action_steps": 100,
129
  "normalization_mapping": {
130
+ "VISUAL": "MEAN_STD",
131
  "STATE": "MEAN_STD",
132
  "ACTION": "MEAN_STD"
133
  },
134
+ "vision_backbone": "resnet18",
135
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
136
+ "replace_final_stride_with_dilation": false,
137
+ "pre_norm": false,
138
+ "dim_model": 512,
139
+ "n_heads": 8,
140
+ "dim_feedforward": 3200,
141
+ "feedforward_activation": "relu",
142
+ "n_encoder_layers": 4,
143
+ "n_decoder_layers": 1,
144
+ "use_vae": true,
145
+ "latent_dim": 32,
146
+ "n_vae_encoder_layers": 4,
147
+ "temporal_ensemble_coeff": null,
148
+ "dropout": 0.1,
149
+ "kl_weight": 10.0,
150
+ "optimizer_lr": 1e-05,
151
+ "optimizer_weight_decay": 0.0001,
152
+ "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  },
154
  "output_dir": "outputs/train/red_block_in_tape",
155
  "job_name": "smolvla_red_block_training",
 
167
  "use_policy_training_preset": true,
168
  "optimizer": {
169
  "type": "adamw",
170
+ "lr": 1e-05,
171
+ "weight_decay": 0.0001,
172
  "grad_clip_norm": 10.0,
173
  "betas": [
174
  0.9,
175
+ 0.999
176
  ],
177
  "eps": 1e-08
178
  },
179
+ "scheduler": null,
 
 
 
 
 
 
180
  "eval": {
181
  "n_episodes": 50,
182
  "batch_size": 50,
 
188
  "project": "lerobot_red_block",
189
  "entity": null,
190
  "notes": null,
191
+ "run_id": "26obbwcp",
192
  "mode": null,
193
  "add_tags": true
194
  },
 
198
  "rabc_kappa": 0.01,
199
  "rabc_epsilon": 1e-06,
200
  "rabc_head_mode": "sparse",
201
+ "rename_map": {},
 
 
 
202
  "checkpoint_path": null
203
  }