Robotics · LeRobot · Safetensors · smolvla
ethanCSL committed (verified)
Commit 809d650 · 1 Parent(s): 335db49

Upload policy weights, train config and readme

Files changed (4)
  1. README.md +6 -5
  2. config.json +42 -23
  3. model.safetensors +2 -2
  4. train_config.json +58 -33
README.md CHANGED
@@ -1,21 +1,22 @@
 ---
+base_model: lerobot/smolvla_base
 datasets: ethanCSL/three_cube_stack
 library_name: lerobot
 license: apache-2.0
-model_name: act
+model_name: smolvla
 pipeline_tag: robotics
 tags:
-- lerobot
-- act
 - robotics
+- smolvla
+- lerobot
 ---
 
-# Model Card for act
+# Model Card for smolvla
 
 <!-- Provide a quick summary of what the model is/does. -->
 
 
-[Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
+[SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
 
 
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
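For context, the sketch below shows one way to load a SmolVLA checkpoint with LeRobot and query an action; it is illustrative only. The import path follows recent LeRobot releases and may differ by version, and the observation keys, dimensions, and instruction string are placeholders rather than values taken from this repository.

```python
# Hedged sketch: load a SmolVLA policy with LeRobot and query one action.
# The module path below matches recent LeRobot releases but may differ by version;
# the observation keys/dimensions are placeholders, not this dataset's actual features.
import torch
from lerobot.common.policies.smolvla.modeling_smolvla import SmolVLAPolicy

# "lerobot/smolvla_base" is the base model named in the README; substitute this
# fine-tuned repository's id to load the uploaded weights instead.
policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base")
policy.eval()
policy.reset()

batch = {
    "observation.images.front": torch.zeros(1, 3, 512, 512),  # placeholder camera key
    "observation.state": torch.zeros(1, 6),                    # placeholder state dim
    "task": ["stack the three cubes"],                         # placeholder instruction
}
with torch.no_grad():
    action = policy.select_action(batch)  # next action from the predicted 50-step chunk
print(action.shape)
```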
config.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "type": "act",
+    "type": "smolvla",
     "n_obs_steps": 1,
     "normalization_mapping": {
-        "VISUAL": "MEAN_STD",
+        "VISUAL": "IDENTITY",
         "STATE": "MEAN_STD",
         "ACTION": "MEAN_STD"
     },
@@ -53,25 +53,44 @@
     "private": null,
     "tags": null,
     "license": null,
-    "chunk_size": 100,
-    "n_action_steps": 100,
-    "vision_backbone": "resnet18",
-    "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-    "replace_final_stride_with_dilation": false,
-    "pre_norm": false,
-    "dim_model": 512,
-    "n_heads": 8,
-    "dim_feedforward": 3200,
-    "feedforward_activation": "relu",
-    "n_encoder_layers": 4,
-    "n_decoder_layers": 1,
-    "use_vae": true,
-    "latent_dim": 32,
-    "n_vae_encoder_layers": 4,
-    "temporal_ensemble_coeff": null,
-    "dropout": 0.1,
-    "kl_weight": 10.0,
-    "optimizer_lr": 1e-05,
-    "optimizer_weight_decay": 0.0001,
-    "optimizer_lr_backbone": 1e-05
+    "chunk_size": 50,
+    "n_action_steps": 50,
+    "max_state_dim": 32,
+    "max_action_dim": 32,
+    "resize_imgs_with_padding": [
+        512,
+        512
+    ],
+    "empty_cameras": 0,
+    "adapt_to_pi_aloha": false,
+    "use_delta_joint_actions_aloha": false,
+    "tokenizer_max_length": 48,
+    "num_steps": 10,
+    "use_cache": true,
+    "freeze_vision_encoder": true,
+    "train_expert_only": true,
+    "train_state_proj": true,
+    "optimizer_lr": 0.0001,
+    "optimizer_betas": [
+        0.9,
+        0.95
+    ],
+    "optimizer_eps": 1e-08,
+    "optimizer_weight_decay": 1e-10,
+    "optimizer_grad_clip_norm": 10.0,
+    "scheduler_warmup_steps": 1000,
+    "scheduler_decay_steps": 30000,
+    "scheduler_decay_lr": 2.5e-06,
+    "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+    "load_vlm_weights": true,
+    "add_image_special_tokens": false,
+    "attention_mode": "cross_attn",
+    "prefix_length": 0,
+    "pad_language_to": "max_length",
+    "num_expert_layers": 0,
+    "num_vlm_layers": 16,
+    "self_attn_every_n_layers": 2,
+    "expert_width_multiplier": 0.75,
+    "min_period": 0.004,
+    "max_period": 4.0
 }
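In short, the ACT hyperparameters are replaced by SmolVLA's: a 50-step action chunk executed in full before re-planning (chunk_size = n_action_steps = 50), 10 action-decoding steps per inference call (num_steps), images resized with padding to 512×512, and a SmolVLM2-500M backbone with the vision encoder frozen and only the action expert trained. A stdlib-only sketch for inspecting these settings from a local copy of config.json; the 30 FPS control rate is assumed purely for illustration:

```python
# Stdlib-only sketch: read the uploaded config.json and summarize the chunking setup.
# Assumes config.json has been downloaded next to this script; 30 FPS is an assumed
# control rate used only to turn steps into seconds.
import json

with open("config.json") as f:
    cfg = json.load(f)

chunk_size = cfg["chunk_size"]          # 50 actions predicted per forward pass
n_action_steps = cfg["n_action_steps"]  # 50 of them executed before re-planning
fps = 30                                # assumption, not part of this config

print(f"Backbone: {cfg['vlm_model_name']} ({cfg['num_vlm_layers']} VLM layers used)")
print(f"Re-plans every {n_action_steps} steps (~{n_action_steps / fps:.1f} s at an assumed {fps} FPS)")
print(f"Action-decoding steps per inference call: {cfg['num_steps']}")
```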
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c0fc3989278990550c7f382a61bdcd48f2bdeea1f2cfb2e0102f42f299575032
-size 206717840
+oid sha256:6aa2f2d52203ba3c97dffac2affdc13f33217dd6e4c04dc17c40ca69e14181f9
+size 906713344
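The checkpoint grows from roughly 207 MB (ACT) to roughly 907 MB (SmolVLA). Only the Git LFS pointer lives in the repository; as an illustrative stdlib-only sketch, a downloaded copy can be checked against the pointer's hash and size like this:

```python
# Illustrative sketch (not part of the repo): verify a downloaded model.safetensors
# against the sha256 and size recorded in the Git LFS pointer above.
import hashlib
import os

expected_sha256 = "6aa2f2d52203ba3c97dffac2affdc13f33217dd6e4c04dc17c40ca69e14181f9"
expected_size = 906713344
path = "model.safetensors"  # local path after `git lfs pull` or a hub download

digest = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        digest.update(block)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert digest.hexdigest() == expected_sha256, "sha256 mismatch"
print("model.safetensors matches the LFS pointer (~0.9 GB SmolVLA checkpoint).")
```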
train_config.json CHANGED
@@ -66,10 +66,10 @@
     },
     "env": null,
     "policy": {
-        "type": "act",
+        "type": "smolvla",
         "n_obs_steps": 1,
         "normalization_mapping": {
-            "VISUAL": "MEAN_STD",
+            "VISUAL": "IDENTITY",
             "STATE": "MEAN_STD",
             "ACTION": "MEAN_STD"
         },
@@ -120,35 +120,54 @@
         "private": null,
         "tags": null,
         "license": null,
-        "chunk_size": 100,
-        "n_action_steps": 100,
-        "vision_backbone": "resnet18",
-        "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-        "replace_final_stride_with_dilation": false,
-        "pre_norm": false,
-        "dim_model": 512,
-        "n_heads": 8,
-        "dim_feedforward": 3200,
-        "feedforward_activation": "relu",
-        "n_encoder_layers": 4,
-        "n_decoder_layers": 1,
-        "use_vae": true,
-        "latent_dim": 32,
-        "n_vae_encoder_layers": 4,
-        "temporal_ensemble_coeff": null,
-        "dropout": 0.1,
-        "kl_weight": 10.0,
-        "optimizer_lr": 1e-05,
-        "optimizer_weight_decay": 0.0001,
-        "optimizer_lr_backbone": 1e-05
+        "chunk_size": 50,
+        "n_action_steps": 50,
+        "max_state_dim": 32,
+        "max_action_dim": 32,
+        "resize_imgs_with_padding": [
+            512,
+            512
+        ],
+        "empty_cameras": 0,
+        "adapt_to_pi_aloha": false,
+        "use_delta_joint_actions_aloha": false,
+        "tokenizer_max_length": 48,
+        "num_steps": 10,
+        "use_cache": true,
+        "freeze_vision_encoder": true,
+        "train_expert_only": true,
+        "train_state_proj": true,
+        "optimizer_lr": 0.0001,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 1e-10,
+        "optimizer_grad_clip_norm": 10.0,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06,
+        "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+        "load_vlm_weights": true,
+        "add_image_special_tokens": false,
+        "attention_mode": "cross_attn",
+        "prefix_length": 0,
+        "pad_language_to": "max_length",
+        "num_expert_layers": 0,
+        "num_vlm_layers": 16,
+        "self_attn_every_n_layers": 2,
+        "expert_width_multiplier": 0.75,
+        "min_period": 0.004,
+        "max_period": 4.0
     },
-    "output_dir": "outputs/train/three_cube_stack",
-    "job_name": "three_cube_stack",
+    "output_dir": "outputs/train/three_cube_stack_smolvla",
+    "job_name": "three_cube_stack_smolvla",
     "resume": false,
     "seed": 1000,
     "num_workers": 4,
-    "batch_size": 8,
-    "steps": 100000,
+    "batch_size": 16,
+    "steps": 20000,
     "eval_freq": 20000,
     "log_freq": 200,
     "save_checkpoint": true,
@@ -156,28 +175,34 @@
     "use_policy_training_preset": true,
     "optimizer": {
         "type": "adamw",
-        "lr": 1e-05,
-        "weight_decay": 0.0001,
+        "lr": 0.0001,
+        "weight_decay": 1e-10,
         "grad_clip_norm": 10.0,
         "betas": [
             0.9,
-            0.999
+            0.95
         ],
         "eps": 1e-08
     },
-    "scheduler": null,
+    "scheduler": {
+        "type": "cosine_decay_with_warmup",
+        "num_warmup_steps": 1000,
+        "num_decay_steps": 30000,
+        "peak_lr": 0.0001,
+        "decay_lr": 2.5e-06
+    },
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,
         "use_async_envs": false
     },
     "wandb": {
-        "enable": true,
+        "enable": false,
         "disable_artifact": false,
         "project": "lerobot",
         "entity": null,
         "notes": null,
-        "run_id": "li2y3p65",
+        "run_id": null,
         "mode": null
     }
 }
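The training setup changes alongside the policy: batch size 8 → 16, 100,000 → 20,000 steps, AdamW at lr 1e-4 with betas (0.9, 0.95) and near-zero weight decay, W&B logging disabled, and a cosine-decay-with-warmup scheduler replacing the previous null. Below is a short sketch of the learning-rate curve this block implies, assuming linear warmup to peak_lr followed by cosine decay toward decay_lr; the exact LeRobot formula (e.g. how steps beyond num_decay_steps are clamped) may differ slightly.

```python
# Sketch of the assumed cosine-decay-with-warmup schedule from train_config.json:
# linear warmup to peak_lr over num_warmup_steps, then cosine decay toward decay_lr.
import math

peak_lr, decay_lr = 1e-4, 2.5e-6
num_warmup_steps, num_decay_steps = 1_000, 30_000

def lr_at(step: int) -> float:
    if step < num_warmup_steps:
        return peak_lr * step / num_warmup_steps
    progress = min((step - num_warmup_steps) / (num_decay_steps - num_warmup_steps), 1.0)
    return decay_lr + 0.5 * (peak_lr - decay_lr) * (1 + math.cos(math.pi * progress))

# Training stops at steps=20000, before the 30000-step decay horizon is reached.
for step in (0, 1_000, 10_000, 20_000):
    print(f"step {step:>6}: lr = {lr_at(step):.2e}")
```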