ChenYi99 committed on
Commit
2e054a8
1 Parent(s): c23bf71

Upload 12 files

Browse files
latent_motion_tokenizer_trained_on_calvin/config.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: latent_motion_tokenizer.src.models.latent_motion_tokenizer.LatentMotionTokenizer
2
+ codebook_dim: 32
3
+ commit_loss_w: 1.0
4
+ recon_loss_w: 1.0
5
+ perceptual_loss_w: 1.0
6
+
7
+ image_encoder:
8
+ _target_: transformers.ViTMAEModel.from_pretrained
9
+ pretrained_model_name_or_path: "facebook/vit-mae-large"
10
+
11
+ m_former:
12
+ _target_: latent_motion_tokenizer.src.models.m_former.MFormer
13
+ add_pooling_layer: false
14
+ config:
15
+ _target_: transformers.ViTConfig
16
+ query_num: 8
17
+ input_hidden_size: 1024
18
+ num_patches: 197 # 196 image patches plus the [CLS] token
19
+ attention_probs_dropout_prob: 0.0
20
+ hidden_act: "gelu"
21
+ hidden_dropout_prob: 0.0
22
+ hidden_size: 768 # for reference: the MAE decoder's hidden size is 512
23
+ initializer_range: 0.02
24
+ intermediate_size: 3072
25
+ layer_norm_eps: 1e-12
26
+ model_type: "vit"
27
+ num_attention_heads: 12
28
+ num_hidden_layers: 4
29
+ qkv_bias: true
30
+
31
+ vector_quantizer:
32
+ _target_: latent_motion_tokenizer.src.models.vector_quantizer.VectorQuantizer2
33
+ n_e: 128
34
+ e_dim: 32
35
+ beta: 0.25
36
+ remap: null
37
+ sane_index_shape: true
38
+
39
+ decoder:
40
+ _target_: latent_motion_tokenizer.src.models.latent_motion_decoder.LatentMotionDecoder
41
+ config:
42
+ _target_: transformers.ViTConfig
43
+ query_num: 8
44
+ attention_probs_dropout_prob: 0.0
45
+ hidden_act: "gelu"
46
+ hidden_dropout_prob: 0.0
47
+ hidden_size: 768
48
+ image_size: 224
49
+ initializer_range: 0.02
50
+ intermediate_size: 3072
51
+ layer_norm_eps: 1e-12
52
+ model_type: "vit"
53
+ num_attention_heads: 12
54
+ num_channels: 3
55
+ num_hidden_layers: 12
56
+ patch_size: 16
57
+ qkv_bias: true
58
+ encoder_stride: 16
59
+ num_patches: 196
latent_motion_tokenizer_trained_on_calvin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b7ed635296ad798bef76381b46467f17d784e4041df5c8dbe23f152aed16dbd
3
+ size 484939570
latent_motion_tokenizer_trained_on_oxe/config.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: latent_motion_tokenizer.src.models.latent_motion_tokenizer.LatentMotionTokenizer
2
+ codebook_dim: 32
3
+ commit_loss_w: 1.0
4
+ recon_loss_w: 1.0
5
+ perceptual_loss_w: 1.0
6
+
7
+ image_encoder:
8
+ _target_: transformers.ViTMAEModel.from_pretrained
9
+ pretrained_model_name_or_path: "facebook/vit-mae-large"
10
+
11
+ m_former:
12
+ _target_: latent_motion_tokenizer.src.models.m_former.MFormer
13
+ add_pooling_layer: false
14
+ config:
15
+ _target_: transformers.ViTConfig
16
+ query_num: 8
17
+ input_hidden_size: 1024
18
+ num_patches: 197 # 196 image patches plus the [CLS] token
19
+ attention_probs_dropout_prob: 0.0
20
+ hidden_act: "gelu"
21
+ hidden_dropout_prob: 0.0
22
+ hidden_size: 768 # for reference: the MAE decoder's hidden size is 512
23
+ initializer_range: 0.02
24
+ intermediate_size: 3072
25
+ layer_norm_eps: 1e-12
26
+ model_type: "vit"
27
+ num_attention_heads: 12
28
+ num_hidden_layers: 4
29
+ qkv_bias: true
30
+
31
+ vector_quantizer:
32
+ _target_: latent_motion_tokenizer.src.models.vector_quantizer.VectorQuantizer2
33
+ n_e: 128
34
+ e_dim: 32
35
+ beta: 0.25
36
+ remap: null
37
+ sane_index_shape: true
38
+
39
+ decoder:
40
+ _target_: latent_motion_tokenizer.src.models.latent_motion_decoder.LatentMotionDecoder
41
+ config:
42
+ _target_: transformers.ViTConfig
43
+ query_num: 8
44
+ attention_probs_dropout_prob: 0.0
45
+ hidden_act: "gelu"
46
+ hidden_dropout_prob: 0.0
47
+ hidden_size: 768
48
+ image_size: 224
49
+ initializer_range: 0.02
50
+ intermediate_size: 3072
51
+ layer_norm_eps: 1e-12
52
+ model_type: "vit"
53
+ num_attention_heads: 12
54
+ num_channels: 3
55
+ num_hidden_layers: 12
56
+ patch_size: 16
57
+ qkv_bias: true
58
+ encoder_stride: 16
59
+ num_patches: 196
latent_motion_tokenizer_trained_on_oxe/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1962641b1fbf77f1d3a563f362b2206a9f07ed24e8f6fa09d55d5500c0f0245b
3
+ size 484939570
moto_gpt_finetuned_on_calvin/config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: moto_gpt.src.models.moto_gpt.MotoGPT
2
+ model_lang:
3
+ _target_: transformers.T5EncoderModel.from_pretrained
4
+ pretrained_model_name_or_path: "t5-base"
5
+ model_vision:
6
+ _target_: moto_gpt.src.models.mae_model.MaeEncoder
7
+ use_obs_feature: true
8
+ pretrained_model_name_or_path: "facebook/vit-mae-large"
9
+ model_causal_transformer:
10
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Model
11
+ config:
12
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Config
13
+ vocab_size: 1
14
+ n_embd: 768
15
+ n_layer: 12
16
+ n_head: 12
17
+ activation_function: "relu"
18
+ dropout: 0.1
19
+ n_positions: 1024
20
+ act_dim: 7
21
+ hidden_size: 768
22
+ sequence_length: 2
23
+ chunk_size: 5
24
+ per_latent_motion_len: 8
25
+ latent_motion_codebook_size: 128
26
+ latent_motion_pred: true
27
+ act_pred: true
28
+ img_feat_dim: 1024
29
+ patch_feat_dim: 1024
30
+ lang_feat_dim: 768
31
+ mask_latent_motion_probability: 0.5
32
+ freeze_lang: true
33
+ freeze_vision: true
moto_gpt_finetuned_on_calvin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b34cb739c2db5163942afb5251c30210b9ff3cca1e47154cb763df69f37bd16
3
+ size 364148174
moto_gpt_finetuned_on_rt1/config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: moto_gpt.src.models.moto_gpt.MotoGPT
2
+ model_lang:
3
+ _target_: transformers.T5EncoderModel.from_pretrained
4
+ pretrained_model_name_or_path: "t5-base"
5
+ model_vision:
6
+ _target_: moto_gpt.src.models.mae_model.MaeEncoder
7
+ use_obs_feature: true
8
+ pretrained_model_name_or_path: "facebook/vit-mae-large"
9
+ model_causal_transformer:
10
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Model
11
+ config:
12
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Config
13
+ vocab_size: 1
14
+ n_embd: 768
15
+ n_layer: 12
16
+ n_head: 12
17
+ activation_function: "relu"
18
+ dropout: 0.1
19
+ n_positions: 1024
20
+ act_dim: 7
21
+ hidden_size: 768
22
+ sequence_length: 2
23
+ chunk_size: 3
24
+ per_latent_motion_len: 8
25
+ latent_motion_codebook_size: 128
26
+ latent_motion_pred: true
27
+ act_pred: true
28
+ img_feat_dim: 1024
29
+ patch_feat_dim: 1024
30
+ lang_feat_dim: 768
31
+ mask_latent_motion_probability: 0.5
32
+ freeze_lang: true
33
+ freeze_vision: true
moto_gpt_finetuned_on_rt1/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f969860566c2748b470febe89c90a2354a57aa1b0b54315bf75c07b90e1defb7
3
+ size 364135886
moto_gpt_pretrained_on_calvin/config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: moto_gpt.src.models.moto_gpt.MotoGPT
2
+ model_lang:
3
+ _target_: transformers.T5EncoderModel.from_pretrained
4
+ pretrained_model_name_or_path: "t5-base"
5
+ model_vision:
6
+ _target_: moto_gpt.src.models.mae_model.MaeEncoder
7
+ use_obs_feature: true
8
+ pretrained_model_name_or_path: "facebook/vit-mae-large"
9
+ model_causal_transformer:
10
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Model
11
+ config:
12
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Config
13
+ vocab_size: 1
14
+ n_embd: 768
15
+ n_layer: 12
16
+ n_head: 12
17
+ activation_function: "relu"
18
+ dropout: 0.1
19
+ n_positions: 1024
20
+ act_dim: 7
21
+ hidden_size: 768
22
+ sequence_length: 2
23
+ chunk_size: 5
24
+ per_latent_motion_len: 8
25
+ latent_motion_codebook_size: 128
26
+ latent_motion_pred: true
27
+ act_pred: false
28
+ img_feat_dim: 1024
29
+ patch_feat_dim: 1024
30
+ lang_feat_dim: 768
31
+ mask_latent_motion_probability: 0.5
32
+ freeze_lang: true
33
+ freeze_vision: true
moto_gpt_pretrained_on_calvin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b74fae7c2791171560a005b6f9fc292ba8bccacebfd37ad28e000b990df3f351
3
+ size 364113734
moto_gpt_pretrained_on_oxe/config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: moto_gpt.src.models.moto_gpt.MotoGPT
2
+ model_lang:
3
+ _target_: transformers.T5EncoderModel.from_pretrained
4
+ pretrained_model_name_or_path: "t5-base"
5
+ model_vision:
6
+ _target_: moto_gpt.src.models.mae_model.MaeEncoder
7
+ use_obs_feature: true
8
+ pretrained_model_name_or_path: "facebook/vit-mae-large"
9
+ model_causal_transformer:
10
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Model
11
+ config:
12
+ _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Config
13
+ vocab_size: 1
14
+ n_embd: 768
15
+ n_layer: 12
16
+ n_head: 12
17
+ activation_function: "relu"
18
+ dropout: 0.1
19
+ n_positions: 1024
20
+ act_dim: 7
21
+ hidden_size: 768
22
+ sequence_length: 2
23
+ chunk_size: 3
24
+ per_latent_motion_len: 8
25
+ latent_motion_codebook_size: 128
26
+ latent_motion_pred: true
27
+ act_pred: false
28
+ img_feat_dim: 1024
29
+ patch_feat_dim: 1024
30
+ lang_feat_dim: 768
31
+ mask_latent_motion_probability: 0.5
32
+ freeze_lang: true
33
+ freeze_vision: true
moto_gpt_pretrained_on_oxe/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13076acf0490772a8c1bc681cb19f6c58b3434a88a2b849da16c3f196462c02c
3
+ size 364113734