Robotics
Safetensors
Gr00tN1d6
HuFY-NV youliangt committed on
Commit
d0814e7
·
verified ·
0 Parent(s):

Super-squash branch 'main' using huggingface_hub

Browse files

Co-authored-by: youliangt <youliangt@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
EXPLAINABILITY.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **Explainability**
2
+
3
+ |Field:|Response:|
4
+ |:---:|:---:|
5
+ |Intended Domain:| Open foundation model for generalized humanoid robot reasoning and skills.|
6
+ |Model Type: |Robot VLA model|
7
+ |Intended Users:|This model is intended for developers and community members who build and fine-tune robot foundation models.|
8
+ |Output:|The model outputs are actions, and the units are floating points. This is referred to as "robot action policy." Actions consist of continuous-value vectors that correspond to different motor controls on a robot.|
9
+ |Describe how the model works:|Accepts vision, language and proprioception, outputs robot action policy.|
10
+ |Technical Limitations & Mitigation:| This model is not tested or intended for use in mission critical applications that require functional safety. The use of the model in those applications is at the user's own risk and sole responsibility, including taking the necessary steps to add needed guardrails or safety mechanisms prior to deployment.<br><br>Risk: Model underperformance in highly dynamic environments with varying robot surroundings (e.g. furniture, objects, etc) and lighting conditions.<br>Mitigation: Enhance dataset with dynamic obstacle scenarios and fine-tune models accordingly.<br><br>Risk: Integration challenges in specific customer environments with varying robot surroundings (e.g. furniture, objects, etc) and lighting conditions.<br>Mitigation: Provide detailed integration guides and support, leveraging NVIDIA's ecosystem.<br><br>Risk: Limited initial support for certain robot embodiments.<br>Mitigation: Expand testing and validation across a wider range of robot platforms.|
11
+ |Verified to have met prescribed quality standards?|Yes|
12
+ |Performance Metrics:|Success rate, as well as the following:<br>1) if the trajectory is smooth and does not jitter<br>2) if the robot does not hit any other objects<br>3) if the trajectory is natural|
13
+ |Potential Known Risks:|This model is not tested or intended for use in mission critical applications that require functional safety. The use of the model in those applications is at the user's own risk and sole responsibility, including taking the necessary steps to add needed guardrails or safety mechanisms prior to deployment.|
14
+ |End User License Agreement:| Your use of this model is governed by the [NSCL V1 License](https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf?t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsIm5jaWQiOiJzby15b3V0LTg3MTcwMS12dDQ4In0=).|
LICENSE ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NVIDIA License
2
+ 1. Definitions
3
+ “Licensor” means any person or entity that distributes its Work.
4
+ “Work” means (a) the original work of authorship made available under this license,
5
+ which may include software, documentation, or other files, and (b) any additions to or
6
+ derivative works thereof that are made available under this license.
7
+ The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the
8
+ meaning as provided under U.S. copyright law; provided, however, that for the purposes
9
+ of this license, derivative works shall not include works that remain separable from, or
10
+ merely link (or bind by name) to the interfaces of, the Work.
11
+ Works are “made available” under this license by including in or with the Work either (a)
12
+ a copyright notice referencing the applicability of this license to the Work, or (b) a copy
13
+ of this license.
14
+ 2. License Grant
15
+ 2.1 Copyright Grant. Subject to the terms and conditions of this license, each
16
+ Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free,
17
+ copyright license to use, reproduce, prepare derivative works of, publicly display,
18
+ publicly perform, sublicense and distribute its Work and any resulting derivative
19
+ works in any form.
20
+ 3. Limitations
21
+ 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so
22
+ under this license, (b) you include a complete copy of this license with your
23
+ distribution, and (c) you retain without modification any copyright, patent,
24
+ trademark, or attribution notices that are present in the Work.
25
+ 3.2 Derivative Works. You may specify that additional or different terms apply to
26
+ the use, reproduction, and distribution of your derivative works of the Work (“Your
27
+ Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3
28
+ applies to your derivative works, and (b) you identify the specific derivative works
29
+ that are subject to Your Terms. Notwithstanding Your Terms, this license (including
30
+ the redistribution requirements in Section 3.1) will continue to apply to the Work
31
+ itself.
32
+ 3.3 Use Limitation. The Work and any derivative works thereof only may be used
33
+ or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA
34
+ Corporation and its affiliates may use the Work and any derivative works
35
+ commercially. As used herein, “non-commercially” means for research or
36
+ evaluation purposes only.
37
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any
38
+ Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce
39
+ any patents that you allege are infringed by any Work, then your rights under this
40
+ license from such Licensor (including the grant in Section 2.1) will terminate
41
+ immediately.
42
+ 3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its
43
+ affiliates’ names, logos, or trademarks, except as necessary to reproduce the
44
+ notices described in this license.
45
+ 3.6 Termination. If you violate any term of this license, then your rights under this
46
+ license (including the grant in Section 2.1) will terminate immediately.
47
+ 4. Disclaimer of Warranty.
48
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
49
+ EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
50
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-
51
+ INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS
52
+ LICENSE.
53
+ 5. Limitation of Liability.
54
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
55
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
56
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
57
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR
58
+ RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT
59
+ NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR
60
+ DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES),
61
+ EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
PRIVACY.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **Privacy**
2
+
3
+ |Field:|Response:|
4
+ |:---:|:---:|
5
+ |Generatable or reverse engineerable personal data?|None|
6
+ |Personal data used to create this model?|No|
7
+ |How often is dataset reviewed?|Before Release|
8
+ |Is there provenance for all datasets used in training?|Yes|
9
+ |Does data labeling (annotation, metadata) comply with privacy laws?|Yes|
10
+ |Is data compliant with data subject requests for data correction or removal, if such a request was made?|Yes|
11
+ |Applicable NVIDIA Privacy Policy|https://www.nvidia.com/en-us/about-nvidia/privacy-policy/|
README.md ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets:
3
+ - nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim
4
+ tags:
5
+ - robotics
6
+ ---
7
+
8
+ <div align="center">
9
+ <a href="https://github.com/NVIDIA/Isaac-GR00T">
10
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/67b8da81d01134f89899b4a7/8bFQa2ZIGCsOQQ2ho2N_U.png">
11
+ </a>
12
+ <div align="center">
13
+ <a href="https://github.com/NVIDIA/Isaac-GR00T">
14
+ <img src="https://img.shields.io/badge/GitHub-grey?logo=GitHub" alt="GitHub Badge">
15
+ </a>
16
+ <a href="https://developer.nvidia.com/isaac/gr00t">
17
+ <img src="https://img.shields.io/badge/Website-green" alt="Website Badge">
18
+ </a>
19
+ <!-- <a href="">
20
+ <img src="https://img.shields.io/badge/Project%20Page-blue?style=plastic" alt="Project Page Badge">
21
+ </a>
22
+ <a href="">
23
+ <img src="https://img.shields.io/badge/Research_Blog-black?style=flat" alt="Research Blog Badge">
24
+ </a>
25
+ <a href="">
26
+ <img src="https://img.shields.io/badge/Dataset-Overview-brightgreen?logo=googleforms" alt="Research Blog Badge">
27
+ </a>
28
+ -->
29
+ </div>
30
+ </div>
31
+
32
+ # GR00T-N1.6-3B
33
+
34
+ <p align="center">
35
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/67b8da81d01134f89899b4a7/ZCLLXZk2LQBG0YH_BmiIN.gif"
36
+ style="width:100%; max-width:1000px; height:auto;">
37
+ </p>
38
+
39
+ ## Description:
40
+
41
+ NVIDIA Isaac GR00T N1.6 is an open vision-language-action (VLA) model for generalized humanoid robot skills. This cross-embodiment model takes multimodal input, including language and images, to perform manipulation tasks in diverse environments.
42
+ GR00T N1.6 is trained on a diverse mixture of robot data including bimanual, semi-humanoid and an expansive humanoid dataset, consisting of real captured data and synthetic data generated using the components of NVIDIA Isaac GR00T Blueprint. It is adaptable through post-training for specific embodiments, tasks and environments.
43
+
44
+ The neural network architecture of GR00T N1.6 is a combination of a vision-language foundation model and a diffusion transformer head that denoises continuous actions.
45
+
46
+ ## License/Terms of Use
47
+ [Nvidia License](https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf?t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsIm5jaWQiOiJzby15b3V0LTg3MTcwMS12dDQ4In0=)<br>
48
+ You are responsible for ensuring that your use of NVIDIA AI Foundation Models complies with all applicable laws. <br>
49
+
50
+ ### Deployment Geography:
51
+ Global
52
+
53
+ ### Use Case:
54
+ Researchers, Academics, Open-Source Community: AI-driven robotics research and algorithm development.
55
+ Developers: Integrate and customize AI for various robotic applications.
56
+ Startups & Companies: Accelerate robotics development and reduce training costs.
57
+
58
+ ## Reference(s):
59
+ Eagle VLM: Chen, Guo, et al. "Eagle 2.5: Boosting Long-Context Post-Training for Frontier Vision-Language Models." arXiv:2504.15271 (2025).<br>
60
+ Liu, Xingchao, and Chengyue Gong. "Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow." The Eleventh International Conference on Learning Representations.<br>
61
+ Flow Matching Policy:
62
+ Black, Kevin, et al. "π0: A Vision-Language-Action Flow Model for General Robot Control." arXiv preprint arXiv:2410.24164 (2024).<br>
63
+
64
+ ## Model Architecture:
65
+ **Architecture Type:** Vision Transformer, Multilayer Perceptron, Flow matching Transformer
66
+
67
+ GR00T N1.6 uses vision and text transformers to encode the robot's image observations and text instructions. The architecture handles a varying number of views per embodiment by concatenating image token embeddings from all frames into a sequence, followed by language token embeddings.
68
+
69
+ To model proprioception and a sequence of actions conditioned on observations, GR00T N1.6 uses a flow matching transformer. The flow matching transformer interleaves self-attention over proprioception and actions with cross-attention to the vision and language embeddings. During training, the input actions are corrupted by randomly interpolating between the clean action vector and a Gaussian noise vector. At inference time, the policy first samples a Gaussian noise vector and iteratively reconstructs a continuous-value action using its velocity prediction.
70
+
71
+ In GR00T N1.6, the MLP connector between the vision-language features and the diffusion-transformer (DiT) has been modified for improved performance on our sim benchmarks. Also, it was trained jointly with flow matching and world-modeling objectives.
72
+
73
+ **Network Architecture:**
74
+ ![image/png](https://github.com/NVIDIA/Isaac-GR00T/blob/main/media/model-architecture.png?raw=true)
75
+ The schematic diagram is shown in the illustration above.
76
+ Red, Green, Blue (RGB) camera frames are processed through a pre-trained vision transformer (SigLip2).
77
+ Text is encoded by a pre-trained transformer (T5).
78
+ Robot proprioception is encoded using a multi-layer perceptron (MLP) indexed by the embodiment ID. To handle variable-dimension proprio, inputs are padded to a configurable max length before feeding into the MLP.
79
+ Actions are encoded and velocity predictions decoded by an MLP, one per unique embodiment.
80
+ The flow matching transformer is implemented as a diffusion transformer (DiT), in which the diffusion step conditioning is implemented using adaptive layernorm (AdaLN).
81
+
82
+ ## Input:
83
+ **Input Type:**
84
+ * Vision: Image Frames<br>
85
+ * State: Robot Proprioception<br>
86
+ * Language Instruction: Text<br>
87
+
88
+ **Input Format:**
89
+ * Vision: Variable number of image frames from robot cameras<br>
90
+ * State: Floating Point<br>
91
+ * Language Instruction: String<br>
92
+
93
+ **Input Parameters:**
94
+ * Vision: 2D - RGB image, any resolution<br>
95
+ * State: 1D - Floating number vector<br>
96
+ * Language Instruction: 1D - String<br>
97
+
98
+ ## Output:
99
+ **Output Type(s):** Actions<br>
100
+ **Output Format:** Continuous-value vectors<br>
101
+ **Output Parameters:** [Two-Dimensional (2D)] <br>
102
+ **Other Properties Related to Output:** Continuous-value vectors correspond to different motor controls on a robot, which depend on the degrees of freedom of the robot embodiment.
103
+
104
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions. <br>
105
+
106
+ ## Software Integration:
107
+ **Runtime Engine(s):** PyTorch
108
+
109
+ **Supported Hardware Microarchitecture Compatibility:**
110
+ All of the below:
111
+ * NVIDIA Ampere
112
+ * NVIDIA Blackwell
113
+ * NVIDIA Jetson
114
+ * NVIDIA Hopper
115
+ * NVIDIA Lovelace
116
+
117
+ **[Preferred/Supported] Operating System(s):**
118
+ * Linux
119
+
120
+ ## Model Version(s):
121
+ Version 1.6
122
+
123
+ ## Ethical Considerations:
124
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
125
+
126
+ For more detailed information on ethical considerations for this model, please see the Model Card++ [Explainability](https://huggingface.co/nvidia/GR00T-N1.6-3B/blob/main/EXPLAINABILITY.md), [Bias](https://huggingface.co/nvidia/GR00T-N1.6-3B/blob/main/BIAS.md), [Safety & Security](https://huggingface.co/nvidia/GR00T-N1.6-3B/blob/main/SAFETY_and_SECURITY.md), and [Privacy](https://huggingface.co/nvidia/GR00T-N1.6-3B/blob/main/PRIVACY.md) Subcards.
127
+
128
+ Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
129
+
130
+ ## Resources
131
+
132
+ * Previous Version: https://huggingface.co/nvidia/GR00T-N1.5-3B<br>
133
+ * Research Blog: https://research.nvidia.com/labs/gear/gr00t-n1_6/
134
+ * GR00T Website: https://developer.nvidia.com/isaac/gr00t
SAFETY_and_SECURITY.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # **Safety & Security**
2
+
3
+ |Field:|Response:|
4
+ |:---:|:---:|
5
+ |Model Application(s):|Machinery and Robotics<br>Robot VLA - single-arm manipulation, bimanual grippers, bi-manual dex hands manipulation and humanoid dexterous manipulation|
6
+ |Describe life critical application (if present):|This model is not tested or intended for use in mission critical applications that require functional safety. The use of the model in those applications is at the user's own risk and sole responsibility, including taking the necessary steps to add needed guardrails or safety mechanisms prior to deployment.|
7
+ |Use Case Restrictions:|Abide by the [NSCL V1 License](https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf?t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyIsIm5jaWQiOiJzby15b3V0LTg3MTcwMS12dDQ4In0=)|
8
+ |Model and Dataset Restrictions:|The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to.|
config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "bfloat16",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": true,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": false,
64
+ "use_albumentations_transforms": true,
65
+ "use_alternate_vl_dit": true,
66
+ "use_flash_attention": true,
67
+ "use_relative_action": true,
68
+ "use_vlln": true
69
+ }
embodiment_id.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2
9
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9710d31d331b79cfa229fd23605ba9ad47e207cb0f4c7722fe0bacdc666c8326
3
+ size 4991091456
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb79bbd7893068897cda18c86ed91064e92e20e356dea97bc398bf9c8bf2fa35
3
+ size 1582283096
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor_config.json ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ }
354
+ },
355
+ "image_crop_size": null,
356
+ "image_target_size": null,
357
+ "use_albumentations": true,
358
+ "random_rotation_angle": null,
359
+ "color_jitter_params": {
360
+ "brightness": 0.3,
361
+ "contrast": 0.4,
362
+ "saturation": 0.5,
363
+ "hue": 0.08
364
+ },
365
+ "shortest_image_edge": 256,
366
+ "crop_fraction": 0.95,
367
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
368
+ "model_type": "eagle",
369
+ "formalize_language": true,
370
+ "max_state_dim": 128,
371
+ "max_action_dim": 128,
372
+ "max_action_horizon": 50,
373
+ "use_percentiles": false,
374
+ "clip_outliers": true,
375
+ "apply_sincos_state_encoding": true,
376
+ "use_relative_action": true
377
+ }
378
+ }
statistics.json ADDED
The diff for this file is too large to render. See raw diff