lsnu committed (verified)
Commit 7eb3f10
Parent(s): 80c771a

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .gitattributes +9 -0
  2. environment/base_pip_freeze.txt +176 -0
  3. environment/base_python.txt +1 -0
  4. environment/env_list.txt +4 -0
  5. environment/hardware_snapshot.txt +1 -0
  6. environment/nvidia_smi.txt +22 -0
  7. environment/reconstruct_anybimanual_overlap_replay.sh +22 -0
  8. environment/rlbench_pip_freeze.txt +200 -0
  9. environment/rlbench_python.txt +1 -0
  10. environment/runtime_env_vars.sh +4 -0
  11. environment/setup_same_hardware.sh +25 -0
  12. environment/uname.txt +1 -0
  13. handoff/instructions4.md +591 -0
  14. history/VLAarchtests_previous_README.md +172 -0
  15. metadata/source_sizes.txt +4 -0
  16. metadata/staged_size.txt +1 -0
  17. metadata/staged_tree_top2.txt +64 -0
  18. third_party/AnyBimanual/agents/__init__.py +0 -0
  19. third_party/AnyBimanual/agents/agent_factory.py +101 -0
  20. third_party/AnyBimanual/agents/peract_bc/__init__.py +1 -0
  21. third_party/AnyBimanual/agents/peract_bc/launch_utils.py +128 -0
  22. third_party/AnyBimanual/agents/peract_bc/perceiver_lang_io.py +481 -0
  23. third_party/AnyBimanual/agents/peract_bc/qattention_peract_bc_agent.py +939 -0
  24. third_party/AnyBimanual/agents/peract_bc/qattention_stack_agent.py +132 -0
  25. third_party/AnyBimanual/agents/peract_bc/skill_manager.py +70 -0
  26. third_party/AnyBimanual/agents/peract_bc/trajectory_gpt2.py +775 -0
  27. third_party/AnyBimanual/agents/peract_bc/visual_aligner.py +39 -0
  28. third_party/AnyBimanual/agents/peract_bimanual/__init__.py +1 -0
  29. third_party/AnyBimanual/agents/peract_bimanual/launch_utils.py +117 -0
  30. third_party/AnyBimanual/agents/peract_bimanual/perceiver_lang_io.py +628 -0
  31. third_party/AnyBimanual/agents/peract_bimanual/qattention_peract_bc_agent.py +1317 -0
  32. third_party/AnyBimanual/agents/peract_bimanual/qattention_stack_agent.py +209 -0
  33. third_party/AnyBimanual/agents/peract_bimanual/skill_manager.py +70 -0
  34. third_party/AnyBimanual/agents/peract_bimanual/trajectory_gpt2.py +775 -0
  35. third_party/AnyBimanual/agents/peract_bimanual/visual_aligner.py +39 -0
  36. third_party/AnyBimanual/agents/replay_utils.py +667 -0
  37. third_party/AnyBimanual/agents/rvt/__init__.py +6 -0
  38. third_party/AnyBimanual/agents/rvt/launch_utils.py +221 -0
  39. third_party/AnyBimanual/agents/rvt/rvt/config.py +54 -0
  40. third_party/AnyBimanual/agents/rvt/rvt/configs/peract_official_config.yaml +127 -0
  41. third_party/AnyBimanual/agents/rvt/rvt/configs/rvt.yaml +15 -0
  42. third_party/AnyBimanual/agents/rvt/rvt/configs/rvt2.yaml +19 -0
  43. third_party/AnyBimanual/agents/rvt/rvt/eval.py +556 -0
  44. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/.gitattributes +1 -0
  45. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/.gitignore +4 -0
  46. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/LICENSE +97 -0
  47. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/README.md +55 -0
  48. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/demo.png +3 -0
  49. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/image_0_splat_2xaa.png +0 -0
  50. third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/point_renderer/cameras.py +119 -0
.gitattributes CHANGED
@@ -14068,3 +14068,12 @@ baselines/AnyBimanual_overlap_replay/multi/10000-14999/14031.replay filter=lfs d
  baselines/AnyBimanual_overlap_replay/multi/10000-14999/14030.replay filter=lfs diff=lfs merge=lfs -text
  baselines/AnyBimanual_overlap_replay/multi/10000-14999/14032.replay filter=lfs diff=lfs merge=lfs -text
  baselines/AnyBimanual_overlap_replay/multi/10000-14999/14029.replay filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/demo.png filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/RLBench/readme_files/task_grid.png filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/PyRep/tutorials/images/kinematics_group.png filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/PyRep/tutorials/images/collision_collections.png filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/PyRep/tests/assets/test_scene_robots.ttt filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/PyRep/tests/assets/test_scene_mobiles_with_arms.ttt filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/PyRep/tests/assets/test_scene_mobiles.ttt filter=lfs diff=lfs merge=lfs -text
+ third_party/AnyBimanual/third_party/PyRep/tests/assets/cracker_box/texture_map.png filter=lfs diff=lfs merge=lfs -text
environment/base_pip_freeze.txt ADDED
@@ -0,0 +1,176 @@
+ accelerate==1.13.0
+ annotated-doc==0.0.4
+ antlr4-python3-runtime==4.9.3
+ anyio==4.6.0
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==24.2.0
+ babel==2.16.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blinker==1.4
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.3.2
+ click==8.3.1
+ comm==0.2.2
+ cryptography==3.4.8
+ cuda-bindings==12.9.4
+ cuda-pathfinder==1.2.2
+ cuda-toolkit==12.8.1
+ dbus-python==1.2.18
+ debugpy==1.8.5
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distro==1.7.0
+ entrypoints==0.4
+ execnet==2.1.2
+ executing==2.1.0
+ fastjsonschema==2.20.0
+ filelock==3.13.1
+ fqdn==1.5.1
+ fsspec==2024.2.0
+ h11==0.14.0
+ hf-xet==1.4.2
+ httpcore==1.0.5
+ httplib2==0.20.2
+ httpx==0.27.2
+ huggingface_hub==1.8.0
+ idna==3.10
+ importlib-metadata==4.6.4
+ iniconfig==2.3.0
+ ipykernel==6.29.5
+ ipython==8.27.0
+ ipython-genutils==0.2.0
+ ipywidgets==8.1.5
+ isoduration==20.11.0
+ jedi==0.19.1
+ jeepney==0.7.1
+ Jinja2==3.1.3
+ json5==0.9.25
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ jupyter-archive==3.4.0
+ jupyter-events==0.10.0
+ jupyter-highlight-selected-word==0.2.0
+ jupyter-lsp==2.2.5
+ jupyter_client==7.4.9
+ jupyter_contrib_core==0.4.2
+ jupyter_contrib_nbextensions==0.7.0
+ jupyter_core==5.7.2
+ jupyter_nbextensions_configurator==0.6.4
+ jupyter_server==2.14.2
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.2.5
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.13
+ keyring==23.5.0
+ launchpadlib==1.10.16
+ lazr.restfulclient==0.14.4
+ lazr.uri==1.0.6
+ lxml==5.3.0
+ markdown-it-py==4.0.0
+ MarkupSafe==2.1.5
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.0.2
+ more-itertools==8.10.0
+ mpmath==1.3.0
+ nbclassic==1.1.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ notebook==6.5.5
+ notebook_shim==0.2.4
+ numpy==1.26.3
+ nvidia-cublas-cu12==12.8.4.1
+ nvidia-cuda-cupti-cu12==12.8.90
+ nvidia-cuda-nvrtc-cu12==12.8.93
+ nvidia-cuda-runtime-cu12==12.8.90
+ nvidia-cudnn-cu12==9.19.0.56
+ nvidia-cufft-cu12==11.3.3.83
+ nvidia-cufile-cu12==1.13.1.3
+ nvidia-curand-cu12==10.3.9.90
+ nvidia-cusolver-cu12==11.7.3.90
+ nvidia-cusparse-cu12==12.5.8.93
+ nvidia-cusparselt-cu12==0.7.1
+ nvidia-nccl-cu12==2.28.9
+ nvidia-nvjitlink-cu12==12.8.93
+ nvidia-nvshmem-cu12==3.4.5
+ nvidia-nvtx-cu12==12.8.90
+ oauthlib==3.2.0
+ omegaconf==2.3.0
+ overrides==7.7.0
+ packaging==24.1
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==10.2.0
+ platformdirs==4.3.6
+ pluggy==1.6.0
+ prometheus_client==0.21.0
+ prompt_toolkit==3.0.47
+ psutil==6.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ py-spy==0.4.1
+ pycparser==2.22
+ Pygments==2.18.0
+ PyGObject==3.42.1
+ PyJWT==2.3.0
+ pyparsing==2.4.7
+ pytest==9.0.2
+ pytest-xdist==3.8.0
+ python-apt==2.4.0+ubuntu4
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ PyYAML==6.0.2
+ pyzmq==24.0.1
+ referencing==0.35.1
+ regex==2026.3.32
+ requests==2.32.3
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==14.3.3
+ rpds-py==0.20.0
+ safetensors==0.7.0
+ SecretStorage==3.3.1
+ Send2Trash==1.8.3
+ sentencepiece==0.2.1
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soupsieve==2.6
+ stack-data==0.6.3
+ sympy==1.14.0
+ systemd-python==234
+ terminado==0.18.1
+ tinycss2==1.3.0
+ tokenizers==0.22.2
+ torch==2.11.0+cu128
+ torchaudio==2.11.0+cu128
+ torchvision==0.26.0+cu128
+ tornado==6.4.1
+ tqdm==4.67.3
+ traitlets==5.14.3
+ transformers==5.4.0
+ triton==3.6.0
+ typer==0.24.1
+ types-python-dateutil==2.9.0.20240906
+ typing_extensions==4.15.0
+ uri-template==1.3.0
+ urllib3==2.2.3
+ wadllib==1.3.6
+ wcwidth==0.2.13
+ webcolors==24.8.0
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ widgetsnbextension==4.0.13
+ zipp==1.0.0
environment/base_python.txt ADDED
@@ -0,0 +1 @@
+ Python 3.11.10
environment/env_list.txt ADDED
@@ -0,0 +1,4 @@
+ Name      Active  Path
+ ────────────────────────────────────────────
+ base      *       /workspace
+ rlbench           /workspace/envs/rlbench
environment/hardware_snapshot.txt ADDED
@@ -0,0 +1 @@
+ NVIDIA RTX PRO 6000 Blackwell Server Edition, 580.126.09, 97887 MiB
environment/nvidia_smi.txt ADDED
@@ -0,0 +1,22 @@
+ Mon Mar 30 14:48:02 2026
+ +-----------------------------------------------------------------------------------------+
+ | NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 |
+ +-----------------------------------------+------------------------+----------------------+
+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+ | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+ | | | MIG M. |
+ |=========================================+========================+======================|
+ | 0 NVIDIA RTX PRO 6000 Blac... On | 00000000:F4:00.0 Off | 0 |
+ | N/A 34C P0 83W / 600W | 6110MiB / 97887MiB | 0% Default |
+ | | | Disabled |
+ +-----------------------------------------+------------------------+----------------------+
+
+ +-----------------------------------------------------------------------------------------+
+ | Processes: |
+ | GPU GI CI PID Type Process name GPU Memory |
+ | ID ID Usage |
+ |=========================================================================================|
+ | 0 N/A N/A 181865 G /usr/lib/xorg/Xorg 97MiB |
+ | 0 N/A N/A 278028 C python 570MiB |
+ | 0 N/A N/A 278251 C+G ...space/envs/rlbench/bin/python 5401MiB |
+ +-----------------------------------------------------------------------------------------+
environment/reconstruct_anybimanual_overlap_replay.sh ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ if [ "$#" -ne 2 ]; then
+   echo "usage: $0 <sharded_multi_dir> <flat_output_dir>" >&2
+   echo "example: $0 baselines/AnyBimanual_overlap_replay/multi /tmp/multi_flat" >&2
+   exit 1
+ fi
+
+ src="$1"
+ dst="$2"
+
+ mkdir -p "$dst"
+
+ # -I '{}' makes xargs substitute each filename for '{}'; without it the inner
+ # script would receive the literal string '{}' as "$1" and link nothing.
+ find "$src" -mindepth 2 -maxdepth 2 -type f -name '*.replay' -print0 \
+   | xargs -0 -P 32 -I '{}' bash -c '
+       f="$1"
+       out="$2/$(basename "$f")"
+       ln "$f" "$out"
+     ' _ '{}' "$dst"
+
+ echo "reconstructed flat replay directory at: $dst"
environment/rlbench_pip_freeze.txt ADDED
@@ -0,0 +1,200 @@
+ absl-py==2.1.0
+ accelerate==0.31.0
+ addict==2.4.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.13.4
+ aiosignal==1.4.0
+ antlr4-python3-runtime==4.9.3
+ asttokens==3.0.1
+ async-timeout==5.0.1
+ attrs==26.1.0
+ backports.zstd @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_backports.zstd_1767044984/work
+ blinker==1.9.0
+ blosc==1.11.4
+ Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1764016952863/work
+ cached-property @ file:///home/conda/feedstock_root/build_artifacts/cached_property_1615209429212/work
+ certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1772001073725/work/certifi
+ cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1761202865726/work
+ charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1773659966602/work
+ click==8.3.1
+ click-prompt==0.5.1
+ clip @ git+https://github.com/openai/CLIP.git@d05afc436d78f1c48dc0dbf8e5980a9d471f35f6
+ cloudpickle==3.1.2
+ comm==0.2.3
+ ConfigArgParse==1.7.5
+ contourpy @ file:///home/conda/feedstock_root/build_artifacts/contourpy_1744743067588/work
+ cuda-bindings==12.9.4
+ cuda-pathfinder==1.2.2
+ cuda-toolkit==12.8.1
+ cycler @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_cycler_1764466758/work
+ dash==4.1.0
+ decorator==5.2.1
+ docker-pycreds==0.4.0
+ einops==0.8.0
+ exceptiongroup==1.3.1
+ executing==2.2.1
+ Farama-Notifications==0.0.4
+ fastjsonschema==2.21.2
+ filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1773313889543/work
+ Flask==3.1.3
+ fonttools @ file:///home/conda/feedstock_root/build_artifacts/fonttools_1773137064424/work
+ freetype-py==2.5.1
+ frozenlist==1.8.0
+ fsspec==2026.3.0
+ ftfy==6.2.0
+ gitdb==4.0.12
+ GitPython==3.1.46
+ gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1773244929835/work
+ grpcio==1.78.0
+ gym==0.26.2
+ gym-notices==0.1.0
+ gymnasium==1.0.0a2
+ h2 @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_h2_1756364871/work
+ h5py @ file:///home/conda/feedstock_root/build_artifacts/h5py_1774712049671/work
+ hf-xet==1.4.2
+ hpack @ file:///home/conda/feedstock_root/build_artifacts/hpack_1737618293087/work
+ huggingface_hub==0.36.2
+ hydra-core==1.3.2
+ hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1737618333194/work
+ idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1760286409563/work
+ imageio @ file:///home/conda/feedstock_root/build_artifacts/imageio_1738273805233/work
+ imgaug==0.4.0
+ importlib_metadata==9.0.0
+ ipython==8.39.0
+ ipywidgets==8.1.8
+ itsdangerous==2.2.0
+ jedi==0.19.2
+ Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_jinja2_1764517220/work
+ joblib==1.5.3
+ jsonschema==4.26.0
+ jsonschema-specifications==2025.9.1
+ jupyter_core==5.9.1
+ jupyterlab_widgets==3.0.16
+ kiwisolver @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_kiwisolver_1773067043/work
+ lazy-loader==0.5
+ Markdown==3.10.2
+ markdown-it-py==4.0.0
+ MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1772444934960/work
+ matplotlib @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-suite_1715976200404/work
+ matplotlib-inline==0.2.1
+ mdurl==0.1.2
+ meshcat==0.3.2
+ moviepy==2.2.1
+ mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1773661943568/work
+ multidict==6.7.1
+ munkres==1.1.4
+ narwhals==2.18.1
+ natsort==8.4.0
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_networkx_1731521053/work
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.8.4.1
+ nvidia-cuda-cupti-cu12==12.8.90
+ nvidia-cuda-nvrtc-cu12==12.8.93
+ nvidia-cuda-runtime-cu12==12.8.90
+ nvidia-cudnn-cu12==9.19.0.56
+ nvidia-cufft-cu12==11.3.3.83
+ nvidia-cufile-cu12==1.13.1.3
+ nvidia-curand-cu12==10.3.9.90
+ nvidia-cusolver-cu12==11.7.3.90
+ nvidia-cusparse-cu12==12.5.8.93
+ nvidia-cusparselt-cu12==0.7.1
+ nvidia-nccl-cu12==2.28.9
+ nvidia-nvjitlink-cu12==12.8.93
+ nvidia-nvshmem-cu12==3.4.5
+ nvidia-nvtx-cu12==12.8.90
+ omegaconf==2.3.0
+ open3d==0.19.0
+ openai==0.28.1
+ opencv-python==4.10.0.84
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_packaging_1769093650/work
+ pandas @ file:///home/conda/feedstock_root/build_artifacts/pandas_1744430447393/work
+ parso==0.8.6
+ -e git+https://github.com/markusgrotz/peract_bimanual.git@bb0232a6ba3fe116566e9568f0c7af980ed6703d#egg=peract_bimanual
+ perceiver-pytorch==0.8.8
+ pexpect==4.9.0
+ pillow==12.1.1
+ platformdirs==4.9.4
+ plotly==6.6.0
+ ply @ file:///home/conda/feedstock_root/build_artifacts/ply_1733239724146/work
+ poetry-core==2.3.2
+ prompt_toolkit==3.0.52
+ propcache==0.4.1
+ protobuf==5.29.6
+ psutil @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_psutil_1769678154/work
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycparser @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pycparser_1733195786/work
+ pyglet==2.1.13
+ Pygments==2.20.0
+ pyngrok==7.5.1
+ PyOpenGL==3.1.0
+ pyparsing @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pyparsing_1769003998/work
+ PyQt5==5.15.11
+ PyQt5_sip==12.17.0
+ pyquaternion==0.9.9
+ pyrender==0.1.45
+ -e git+https://github.com/markusgrotz/PyRep.git@b8bd1d7a3182adcd570d001649c0849047ebf197#egg=PyRep
+ PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1733217236728/work
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_python-dateutil_1751104122/work
+ pytorch-lamb==1.0.0
+ pytz @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pytz_1773679724/work
+ PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1770223234623/work
+ pyzmq==27.1.0
+ referencing==0.37.0
+ regex==2024.5.15
+ requests @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_requests_1774462091/work
+ retrying==1.4.2
+ # Editable install with no version control (reveal-vla-bimanual==0.1.0)
+ -e /workspace/reveal_vla_bimanual
+ rich==13.9.4
+ rich-click==1.8.9
+ -e git+https://github.com/markusgrotz/RLBench.git@8af748c51287989294e00c9c670e3330a0e35ed5#egg=rlbench
+ rpds-py==0.30.0
+ safetensors==0.4.3
+ scikit-image==0.25.2
+ scikit-learn==1.7.2
+ scipy @ file:///home/conda/feedstock_root/build_artifacts/scipy-split_1716470219380/work/dist/scipy-1.13.1-cp310-cp310-linux_x86_64.whl#sha256=a4ff22b6dc27b61196be51695f53f9b0676e7c1bc564872b51fc3c41b79ae80b
+ segment-anything==1.0
+ sentry-sdk==2.56.0
+ setproctitle==1.3.7
+ shapely==2.1.2
+ sip @ file:///home/conda/feedstock_root/build_artifacts/sip_1759437834046/work
+ six @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_six_1753199211/work
+ smmap==5.0.3
+ stack-data==0.6.3
+ sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1771952240620/work
+ tensorboard==2.16.2
+ tensorboard-data-server==0.7.2
+ tensorboardX==2.6.4
+ termcolor==3.3.0
+ threadpoolctl==3.6.0
+ tifffile==2025.5.10
+ timeout-decorator==0.5.0
+ timm==1.0.26
+ tokenizers==0.19.1
+ toml @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_toml_1764486833/work
+ tomli @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_tomli_1774492402/work
+ torch==2.11.0+cu128
+ torchaudio==2.11.0+cu128
+ torchvision==0.26.0+cu128
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1774357896577/work
+ tqdm @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_tqdm_1770153424/work
+ traitlets==5.14.3
+ transformers==4.41.2
+ transforms3d==0.4.1
+ trimesh @ file:///home/conda/feedstock_root/build_artifacts/trimesh_1774412449209/work
+ triton==3.6.0
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_typing_extensions_1756220668/work
+ tzdata @ file:///home/conda/feedstock_root/build_artifacts/python-tzdata_1765719872007/work
+ u-msgpack-python==2.8.0
+ unicodedata2 @ file:///home/conda/feedstock_root/build_artifacts/unicodedata2_1770908960326/work
+ urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1767817748113/work
+ wandb==0.18.0
+ wcwidth==0.2.14
+ Werkzeug==3.1.7
+ widgetsnbextension==4.0.15
+ yarl==1.23.0
+ -e git+https://github.com/markusgrotz/YARR.git@6822ff78602c77878b27d4cfe759ce029c67bffb#egg=yarr
+ zipp==3.23.0
environment/rlbench_python.txt ADDED
@@ -0,0 +1 @@
+ Python 3.10.20
environment/runtime_env_vars.sh ADDED
@@ -0,0 +1,4 @@
+ export HF_HOME=/workspace/.cache/huggingface
+ export MAMBA_ROOT_PREFIX=/workspace/.micromamba
+ export DISPLAY=:99
+ export PYTHONPATH=/workspace/VLAarchtests/code/reveal_vla_bimanual
environment/setup_same_hardware.sh ADDED
@@ -0,0 +1,25 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ ROOT_DIR="${ROOT_DIR:-/workspace}"
+
+ echo "[setup] expected hardware: NVIDIA RTX PRO 6000 Blackwell Server Edition"
+ echo "[setup] expected OS family: Ubuntu 24.04 / Linux 6.8"
+
+ if [ ! -d "$ROOT_DIR/VLAarchtests" ]; then
+   echo "[setup] expected staged tree at $ROOT_DIR/VLAarchtests" >&2
+ fi
+
+ echo "[setup] base python:"
+ python --version || true
+
+ if [ -x "$ROOT_DIR/.tools/micromamba/bin/micromamba" ]; then
+   echo "[setup] micromamba envs:"
+   "$ROOT_DIR/.tools/micromamba/bin/micromamba" env list || true
+ fi
+
+ echo "[setup] current GPU snapshot:"
+ nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader || true
+
+ echo "[setup] this repo includes package snapshots in environment/"
+ echo "[setup] recreate the rlbench env at $ROOT_DIR/envs/rlbench before running the overlap baseline path"
environment/uname.txt ADDED
@@ -0,0 +1 @@
+ Linux 129a2ec0f4a9 6.8.0-106-generic #106-Ubuntu SMP PREEMPT_DYNAMIC Fri Mar 6 07:58:08 UTC 2026 x86_64 x86_64 x86_64 GNU/Linux
handoff/instructions4.md ADDED
@@ -0,0 +1,591 @@
+ # Developer handoff: 10-hour sim sprint on 1×RTX PRO 6000 for elastic-occlusion bimanual reveal/retrieve
+
+ ## Scope
+
+ This handoff is for the current simulation phase only. The purpose is not to produce publication-grade evidence. The purpose is to use one short sprint to extract the most decision-relevant signal possible before the custom three-task teleoperation benchmark exists.
+
+ This document does **not** include explicit instructions for future teleoperation data collection.
+
+ The target problem remains the same: bimanual reveal and retrieve under elastic occlusion, with task families that map to (1) foliage reveal with safe actor insertion and retrieval, (2) bag opening plus retrieval, and (3) folded-cloth / suitcase reveal with minimal fold disruption.
+
+ ## Hard constraints for this sprint
+
+ The sprint must assume the following:
+
+ - Hardware: **1× RTX PRO 6000** workstation GPU.
+ - Wall-clock budget: **~10 hours total**.
+ - Deliverable standard: **decision-quality results**, not paper-quality results. They must nonetheless be rigorous: absolutely no data leaks.
+ - The output of the sprint must be enough to shape the next development cycle.
+
+ This means the sprint is not allowed to expand into a broad refactor, a large hyperparameter search, an external benchmark integration project, or a foundation-model migration. The objective is to get the strongest signal per hour.
+
+ ## What this sprint needs to answer
+
+ At the end of the 10-hour window, the repo should let us answer these questions with reasonable confidence:
+
+ 1. Does the current full architecture look directionally better than trivial and structured baselines on rough proxy versions of the three real task families?
+ 2. Which parts of the architecture appear to matter most right now: explicit task conditioning, geometry, memory, planner, or candidate family structure?
+ 3. For each of the three target tasks, is the current architecture best described as **promising**, **uncertain**, or **weak** under the proxy tests?
+ 4. If the system is weak, is the weakness coming mostly from perception/state estimation, memory, retrieve gating/planning, or proxy mismatch?
+ 5. Which next engineering changes are most likely to widen the eventual real-task performance gap, and which ones are unlikely to matter?
+
+ That is the bar for success in this sprint. It is acceptable if the results are approximate. It is not acceptable if the results are too noisy or too poorly instrumented to support those decisions.
+
+ ## What this sprint can and cannot tell us
+
+ This sprint **can** tell us whether the current structured architecture is showing the right dependencies under stress. For example, it can tell us whether memory actually helps under reocclusion, whether geometry actually helps under camera perturbation, whether retrieve gating blocks premature retrieve, and whether the planner adds value beyond trivial candidate selection.
+
+ This sprint **cannot** tell us true real-world performance on live foliage, real bag interiors, or folded clothes in a suitcase. It also cannot tell us whether the final production backbone should be CLIP, OpenVLA-style, LingBot-style, or something else. Those are later decisions.
+
+ So the correct target is not “exact future performance.” The correct target is “a useful approximation of whether the current structure is pointed in the right direction, and where the biggest current bottlenecks are.”
+
+ ## High-level decisions to lock now
+
+ 1. Treat the current **compact-phase CLIP/RGB-D handoff branch** as the only reference branch for this sprint.
+ 2. Keep the explicit reveal-state stack. Do **not** rewrite the repo into a monolithic end-to-end VLA policy in this sprint.
+ 3. Keep RLBench only as a smoke test for three-camera and bimanual integration. Do **not** use RLBench mean success as the main selector for reveal/retrieve architecture changes.
+ 4. Do **not** switch to a new backbone in this sprint. The goal here is to evaluate the structure, not to spend the 10-hour budget on trunk migration.
+ 5. Prefer **eval-time knockouts and toggles** over fully retrained matched ablations in this sprint. Retrained matched ablations remain important later, but they are too expensive for the current time budget.
+
+ ## Immediate read of the current repo
+
+ The current repo is still a good scaffold, but the signal quality is weaker than the structure quality. The strongest part of the system remains the decomposition itself: explicit reveal-state fields, dual scene/belief memory, task-conditioned proposal families, and a planner that reasons about persistence, reocclusion, support, and actor feasibility. The weakest part is that the current evidence is still too easy to misread. The proxy results already suggest that the large spatial rollout path is not the right place to spend another iteration right now, while the compact-phase line remains the most credible base. That should be treated as settled for this sprint.
+
+ ## Sprint strategy
+
+ The fastest path to useful conclusions is:
+
+ 1. Remove the most obvious confounds.
+ 2. Add just enough stress slicing and logging to make the proxy benchmark informative.
+ 3. Run a **small but fixed stratified benchmark** that is reused for every comparison.
+ 4. Compare the base model against trivial baselines and a few **eval-time architecture knockouts**.
+ 5. Use the results to build a task-by-task bottleneck map.
+
+ This is deliberately narrower than the earlier broad handoff. The point is to obtain meaningful conclusions within one GPU and one workday.
+
+ ## Do now vs defer
+
+ ### Must do in this sprint
+
+ These are the changes that are worth the time even under a 10-hour cap.
+
+ 1. Explicit task metadata must override text routing.
+ 2. History camera geometry must propagate correctly.
+ 3. The compact-phase branch must become the default base config.
+ 4. The proxy benchmark must become stratified by task and stress slice.
+ 5. The benchmark runner must support simple baselines and eval-time architecture knockouts.
+ 6. Reporting must include the small set of task-specific metrics that actually matter for the three target tasks.
+
+ ### Explicitly defer until after this sprint
+
+ These are good ideas, but they should not consume the current 10-hour budget.
+
+ 1. Foundation trunk migration (OpenVLA, LingBot, π0.5, etc.).
+ 2. External deformable benchmark integration.
+ 3. Full matched retraining ablation suite.
+ 4. Major loss redesign or long retraining campaigns.
+ 5. Large nuisance sweeps beyond the narrow stress slices listed below.
+ 6. Spatial rollout branch rescue work.
+
+ ## Mandatory repo changes for this sprint
+
+ ### 1. Replace heuristic task routing as the primary path
+
+ **Why now:** this is a real confound and cheap to fix.
+
+ Files to edit:
+
+ - `code/reveal_vla_bimanual/models/policy.py`
+ - `code/reveal_vla_bimanual/models/action_decoder.py`
+ - `code/reveal_vla_bimanual/sim_reveal/dataset.py`
+ - `code/reveal_vla_bimanual/sim_reveal/generate_dataset.py`
+ - `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
+
+ Required changes:
+
+ - Add `task_name` and `task_id` to every proxy training and evaluation example.
+ - Make explicit task metadata override any text-based inference everywhere.
+ - Keep keyword routing only as a fallback for legacy examples that do not carry task metadata.
+ - Surface the resolved task family in benchmark logs so mistakes are easy to see.
+
+ Acceptance criterion:
+
+ - A misleading prompt string must not change the task family when `task_name` is present.
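+
+ A minimal sketch of the intended precedence, assuming hypothetical names (`resolve_task_family`, `KEYWORD_RULES`) rather than the repo's actual ones:
+
+ ```python
+ # Hypothetical sketch: explicit metadata wins, keyword routing survives
+ # only as a legacy fallback for examples without task metadata.
+ KEYWORD_RULES = {"foliage": "foliage", "leaf": "foliage",
+                  "bag": "bag", "cloth": "cloth", "fold": "cloth"}
+
+ def resolve_task_family(example: dict) -> tuple[str, str]:
+     """Return (task_family, routing_source); metadata always overrides text."""
+     if example.get("task_name"):
+         return example["task_name"], "metadata"
+     text = example.get("instruction", "").lower()
+     for keyword, family in KEYWORD_RULES.items():
+         if keyword in text:
+             return family, "keyword_fallback"
+     return "unknown", "unresolved"
+ ```
+
+ Logging the returned routing source next to the resolved family makes routing mistakes easy to spot in the benchmark logs.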
+
+ ### 2. Fix history geometry propagation
+
+ **Why now:** if geometry is broken in history, the current geometry ablations are not trustworthy.
+
+ Files to edit:
+
+ - `code/reveal_vla_bimanual/models/policy.py`
+ - any history batching utility that currently drops camera matrices
+ - proxy dataset serialization if history camera metadata is missing
+
+ Required changes:
+
+ - Save and batch history camera intrinsics and extrinsics.
+ - Pass them through the history encoder when geometry and camera-pose tokens are enabled.
+ - Add a validity mask if some history frames do not have full camera metadata.
+ - Add a debug log or assertion path that makes it obvious whether history geometry is really being used.
+
+ Acceptance criterion:
+
+ - A geometry-enabled run must receive non-null history camera tensors in the forward path.
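+
+ A minimal sketch of the intended batching behavior, with a hypothetical function name and frame schema (the repo's actual utility will differ):
+
+ ```python
+ # Hypothetical sketch: batch history camera matrices with a validity mask
+ # instead of silently dropping frames that lack metadata.
+ import torch
+
+ def batch_history_cameras(frames: list[dict], device: str = "cpu"):
+     """frames[i] may or may not carry 'intrinsics' (3x3) and 'extrinsics' (4x4)."""
+     T = len(frames)
+     intrinsics = torch.zeros(T, 3, 3, device=device)
+     extrinsics = torch.eye(4, device=device).repeat(T, 1, 1)
+     valid = torch.zeros(T, dtype=torch.bool, device=device)
+     for i, f in enumerate(frames):
+         if f.get("intrinsics") is not None and f.get("extrinsics") is not None:
+             intrinsics[i] = torch.as_tensor(f["intrinsics"])
+             extrinsics[i] = torch.as_tensor(f["extrinsics"])
+             valid[i] = True
+     # Makes it obvious when geometry is enabled but never actually fed.
+     assert valid.any(), "geometry enabled but no history frame carries camera metadata"
+     return intrinsics, extrinsics, valid
+ ```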
+
+ ### 3. Freeze the compact-phase branch as the main base
+
+ **Why now:** the time budget does not allow another architecture round on the weaker spatial branch.
+
+ Files to edit:
+
+ - `code/reveal_vla_bimanual/train/configs/*.yaml`
+ - `code/reveal_vla_bimanual/eval/run_ablations.py`
+ - any training or eval launcher that still points to older dummy ablation configs
+
+ Required changes:
+
+ - Create one new base config derived from the compact-phase recipe. Suggested filename:
+   - `proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_v7_base.yaml`
+ - Mark the current spatial configs as experimental.
+ - Make the new v7 base the default for this sprint.
+ - Do not create a broad new family of retrained ablation configs now. That is for later.
+
+ Acceptance criterion:
+
+ - The benchmark runner should use the v7 compact-phase config by default unless explicitly told otherwise.
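+
+ One cheap way to make that default unambiguous is on the runner side; a sketch assuming an argparse-style runner and a `--config` flag (flag name not confirmed against the repo):
+
+ ```python
+ # Hypothetical sketch: the v7 compact-phase base is the default unless
+ # a config is passed explicitly.
+ import argparse
+
+ DEFAULT_CONFIG = (
+     "code/reveal_vla_bimanual/train/configs/"
+     "proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase_v7_base.yaml"
+ )
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--config", default=DEFAULT_CONFIG,
+                     help="v7 compact-phase base unless explicitly overridden")
+ ```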
+
+ ### 4. Add a narrow but informative proxy stress suite
+
+ **Why now:** the current proxies are too undifferentiated. The sprint only needs enough stress structure to make results interpretable.
+
+ Files to edit:
+
+ - `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
+ - `code/reveal_vla_bimanual/sim_reveal/proxy_specs.py`
+ - `code/reveal_vla_bimanual/sim_reveal/dataset.py`
+ - `code/reveal_vla_bimanual/sim_reveal/generate_dataset.py`
+
+ Required changes:
+
+ Add only these stress slices for this sprint:
+
+ - `nominal`
+ - `high_reocclusion`
+ - `camera_perturbation`
+ - one task-specific critical slice per task:
+   - foliage: `tight_corridor_high_collateral`
+   - bag: `one_sided_slip`
+   - cloth: `fold_sensitive_long_persistence`
+
+ Also add:
+
+ - `difficulty_bin` with only `medium` and `hard` for this sprint (skip easy and extreme to save time and focus on decision-relevant cases)
+ - episode metadata for the sampled nuisance parameters used in each stress slice
+ - per-step traces for visibility, support, access/corridor, reocclusion risk, disturbance, and chosen candidate family
+
+ Acceptance criterion:
+
+ - Every benchmark report must be sliceable by task family, stress slice, and difficulty bin.
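+
+ A minimal sketch of the per-episode metadata this implies; the field names are illustrative, not the repo's actual schema:
+
+ ```python
+ # Hypothetical sketch of the episode metadata carried through dataset
+ # serialization and into every benchmark report.
+ from dataclasses import dataclass, field
+
+ STRESS_SLICES = {
+     "nominal", "high_reocclusion", "camera_perturbation",
+     "tight_corridor_high_collateral",   # foliage critical slice
+     "one_sided_slip",                   # bag critical slice
+     "fold_sensitive_long_persistence",  # cloth critical slice
+ }
+
+ @dataclass
+ class EpisodeMeta:
+     task_name: str               # "foliage" | "bag" | "cloth"
+     stress_slice: str            # one of STRESS_SLICES
+     difficulty_bin: str          # "medium" | "hard" for this sprint
+     seed: int
+     nuisance_params: dict = field(default_factory=dict)  # sampled per-slice nuisances
+ ```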
+
+ ### 5. Add simple baselines and oracle-style planner evaluation
+
+ **Why now:** without these, base-model numbers are hard to interpret.
+
+ Files to edit/add:
+
+ - `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
+ - `code/reveal_vla_bimanual/eval/metrics.py`
+ - add `code/reveal_vla_bimanual/eval/run_proxy_random_eval.py`
+ - add `code/reveal_vla_bimanual/eval/run_proxy_candidate0_eval.py`
+ - add `code/reveal_vla_bimanual/eval/run_planner_oracle_eval.py`
+ - add `code/reveal_vla_bimanual/eval/run_proxy_scripted_eval.py` if the existing scripted path is not already callable directly
+
+ Required changes:
+
+ - Add random candidate selection.
+ - Add candidate-0 selection.
+ - Add scripted teacher execution.
+ - Add oracle-planner evaluation that uses proxy candidate summaries directly.
+ - Add support for eval-time architecture toggles:
+   - `--disable_planner`
+   - `--disable_memory`
+   - `--disable_task_conditioning`
+   - `--disable_geometry`
+   - `--disable_camera_pose` (optional if it is cheap)
+
+ Acceptance criterion:
+
+ - The benchmark must be able to compare the same checkpoint against trivial baselines and against architecture knockouts without retraining all variants.
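+
+ A sketch of the toggle plumbing; the flag names follow this document, while the policy attributes they flip (`use_planner`, `memory_horizon`, and the rest) are assumptions about the wiring, not the repo's actual API:
+
+ ```python
+ # Hypothetical sketch: eval-time knockouts swap learned components for
+ # trivial pass-throughs, so one checkpoint serves every knockout run.
+ import argparse
+
+ def add_knockout_flags(parser: argparse.ArgumentParser) -> None:
+     for name in ("planner", "memory", "task_conditioning", "geometry", "camera_pose"):
+         parser.add_argument(f"--disable_{name}", action="store_true")
+
+ def apply_knockouts(policy, args) -> None:
+     if args.disable_planner:
+         policy.use_planner = False          # fall back to top-scored candidate
+     if args.disable_memory:
+         policy.memory_horizon = 0           # current frame only
+     if args.disable_task_conditioning:
+         policy.task_embedding_scale = 0.0
+     if args.disable_geometry:
+         policy.use_geometry_tokens = False
+     if args.disable_camera_pose:
+         policy.use_camera_pose_tokens = False
+ ```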
+
+ ### 6. Strengthen reporting, but only where it matters
+
+ **Why now:** the current sprint only succeeds if it produces conclusions, not just numbers.
+
+ Files to edit:
+
+ - `code/reveal_vla_bimanual/eval/metrics.py`
+ - `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
+
+ Required metric outputs:
+
+ Global:
+
+ - overall proxy success
+ - per-task success
+ - success by stress slice
+ - success by difficulty bin
+ - premature retrieve rate
+ - reocclusion-after-reveal rate
+ - planner regret (where oracle summaries are available)
+
+ Task-specific headline metrics:
+
+ Foliage:
+
+ - visibility integral
+ - corridor availability
+ - collateral motion / damage proxy
+ - actor-feasibility floor before retrieve
+
+ Bag:
+
+ - mouth aperture
+ - hold persistence
+ - rim slip rate
+ - insertable corridor
+
+ Cloth:
+
+ - fold preservation
+ - layer separation quality
+ - top-layer stability
+ - lift-too-high rate
+
+ Required report shape:
+
+ - one overall table
+ - one task × stress slice table
+ - per-episode JSON traces for failure clustering later
+
+ Acceptance criterion:
+
+ - A single report should make it obvious whether the model is failing because of reocclusion, geometry sensitivity, premature retrieve, or task-specific degradation.
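+
+ A sketch of how the task × stress-slice table can fall out of the per-episode traces; it assumes JSON-lines traces and the record keys used in the metadata sketch above, neither of which is confirmed against the repo:
+
+ ```python
+ # Hypothetical sketch: collapse per-episode traces into the required
+ # task × stress-slice success table.
+ import json
+ from collections import defaultdict
+
+ def task_by_slice_table(trace_path: str) -> dict:
+     records = [json.loads(line) for line in open(trace_path)]
+     acc = defaultdict(lambda: [0, 0])  # (task, slice) -> [successes, episodes]
+     for r in records:
+         key = (r["task_name"], r["stress_slice"])
+         acc[key][0] += int(r["success"])
+         acc[key][1] += 1
+     return {k: s / n for k, (s, n) in acc.items()}
+ ```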
+
+ ## Changes that are useful only if they are cheap
+
+ These are allowed only if the mandatory work finishes early.
+
+ ### A. Light training rebalance
+
+ Only do this if it can be implemented in under about one hour.
+
+ Allowed small changes:
+
+ - oversample obvious hard negatives already present in the proxy dataset
+ - slightly increase loss weight on unsafe-retrieve ranking errors
+ - log candidate ranking diagnostics during training
+
+ Do **not** do a broad loss redesign in this sprint.
+
+ ### B. Trunk modularity prep
+
+ Only do this if the mandatory work is already complete.
+
+ Allowed small changes:
+
+ - define a simple trunk adapter interface around the current CLIP path
+ - avoid touching planner, memory, or reveal-head code
+
+ Do **not** attempt a real new-backbone integration in this sprint.
+
+ ## 10-hour execution plan
+
+ This schedule is the intended wall-clock plan. It is aggressive but realistic if the scope stays narrow.
+
+ ### Hour 0 to 1.5: remove confounds
+
+ Complete:
+
+ - explicit task metadata path
+ - history geometry propagation
+ - v7 compact-phase base config
+ - eval-time toggle plumbing in the benchmark runner
+
+ Output expected by the end of this block:
+
+ - code compiles
+ - a tiny smoke run confirms task routing and history geometry are active
+
+ ### Hour 1.5 to 3: build the fixed eval set and reporting
+
+ Complete:
+
+ - stratified proxy eval set generation
+ - task/stress/difficulty metadata roundtrip
+ - benchmark tables and per-episode JSON traces
+ - random, candidate-0, scripted, and oracle evaluation entry points
+
+ Output expected by the end of this block:
+
+ - one fixed benchmark episode set reused by every later run
+
+ ### Hour 3 to 5.5: produce one base-model result
+
+ Preferred path:
+
+ - reuse the best existing compact-phase checkpoint if it still loads cleanly after the metadata and geometry fixes
+ - if needed, run a short warm-start fine-tune from that checkpoint rather than starting from scratch
+
+ Do not spend this block on multi-seed training. One strong base run is more valuable than several weak incomplete runs.
+
+ Output expected by the end of this block:
+
+ - one evaluated base model on the full fixed proxy suite
+
+ ### Hour 5.5 to 8.5: run baselines and eval-time knockouts
+
+ Required comparisons:
+
+ - random
+ - candidate-0
+ - scripted teacher
+ - oracle planner
+ - base model
+ - base model with planner disabled
+ - base model with memory disabled
+ - base model with task conditioning disabled
+ - base model with geometry disabled
+ - base model with camera pose disabled if cheap enough
+
+ Output expected by the end of this block:
+
+ - a complete comparison table on the same episodes
+
+ ### Hour 8.5 to 10: summarize and extract conclusions
+
+ Complete:
+
+ - task-by-task bottleneck summary
+ - approximate transfer-readiness labels for foliage, bag, and cloth
+ - ranked next-step engineering priorities
+
+ Output expected by the end of this block:
+
+ - one benchmark summary table
+ - one short conclusion memo in the repo or artifact directory
+
+ ## Fixed benchmark design for the 10-hour sprint
+
+ Use one small, fixed, stratified benchmark. Reuse the same episode seeds for all variants.
+
+ Recommended size:
+
+ - **300 total episodes**
+ - **100 per task family**
+ - per task family:
+   - 20 `nominal` / `medium`
+   - 20 `nominal` / `hard`
+   - 20 `high_reocclusion`
+   - 20 `camera_perturbation`
+   - 20 task-specific critical slice
+
+ This is small enough to run repeatedly and large enough to show directional differences if the logging is good.
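+
+ A sketch of one way to pin that down deterministically; the `task_critical` stratum name, the `mixed` difficulty label, and the seed formula are illustrative assumptions:
+
+ ```python
+ # Hypothetical sketch: a fixed, stratified 300-episode benchmark whose
+ # deterministic seeds make every variant replay identical episodes.
+ STRATA = [("nominal", "medium"), ("nominal", "hard"),
+           ("high_reocclusion", "mixed"), ("camera_perturbation", "mixed"),
+           ("task_critical", "mixed")]
+
+ def benchmark_episodes():
+     episodes = []
+     for t, task in enumerate(("foliage", "bag", "cloth")):
+         for s, (slice_name, diff) in enumerate(STRATA):
+             for i in range(20):
+                 seed = 100_000 * t + 1_000 * s + i   # stable across runs
+                 episodes.append({"task_name": task, "stress_slice": slice_name,
+                                  "difficulty_bin": diff, "seed": seed})
+     return episodes  # 3 tasks × 5 strata × 20 = 300
+ ```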
+
+ ## Required tests to add now
+
+ The test suite in this sprint is not meant to be exhaustive. It is meant to prevent false conclusions.
+
+ ### Unit tests
+
+ `tests/test_explicit_task_metadata_overrides_text.py`
+
+ - Batch has `task_name="bag"` and misleading foliage text.
+ - Assert that bag proposal families and bag task heads are used.
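+
+ A pytest-style sketch of that first test, reusing the hypothetical `resolve_task_family` routing sketch from earlier rather than the repo's real entry point:
+
+ ```python
+ # Hypothetical sketch: explicit metadata must win over misleading text.
+ def test_explicit_task_metadata_overrides_text():
+     example = {
+         "task_name": "bag",
+         "instruction": "push the foliage aside and reveal the target",  # misleading
+     }
+     family, source = resolve_task_family(example)
+     assert family == "bag"
+     assert source == "metadata"
+ ```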
+
+ `tests/test_text_routing_only_used_as_fallback.py`
+
+ - Assert that keyword routing is skipped when task metadata exists.
+
+ `tests/test_history_camera_geometry_propagates.py`
+
+ - Assert that history frames receive non-null camera tensors when geometry is enabled.
+
+ `tests/test_history_geometry_validity_mask.py`
+
+ - Assert that mixed valid/invalid history geometry uses a validity mask rather than silent nulling.
+
+ `tests/test_eval_toggle_paths_work.py`
+
+ - Assert that planner, memory, task-conditioning, and geometry toggles actually change the execution path.
+
+ `tests/test_benchmark_report_contains_task_and_stress_slices.py`
+
+ - Assert that the output report includes task family, stress slice, and difficulty bin tables.
+
+ ### Integration tests
+
+ `tests/test_proxy_stress_profile_metadata_roundtrip.py`
+
+ - Generate all sprint stress slices and assert the metadata survives dataset serialization and evaluation.
+
+ `tests/test_planner_beats_random_on_oracle_candidates.py`
+
+ - Use oracle candidate summaries and assert the planner beats random and candidate-0 on regret/top-1.
+
+ `tests/test_memory_matters_under_high_reocclusion.py`
+
+ - Compare full memory vs disabled-memory on a small high-reocclusion slice and assert a directional drop.
+
+ `tests/test_geometry_matters_under_camera_perturbation.py`
+
+ - Compare geometry-on vs geometry-off on a small camera-perturbation slice and assert a directional drop.
+
+ `tests/test_retrieve_gating_blocks_premature_retrieve.py`
+
+ - Feed candidates where raw retrieve looks tempting but support/persistence/corridor are unsafe and assert that retrieve is rejected.
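+
+ A sketch of the gating predicate that test exercises; thresholds and candidate fields are illustrative placeholders, not the repo's actual gating logic:
+
+ ```python
+ # Hypothetical sketch: a tempting raw retrieve score is not enough; support,
+ # persistence, and corridor must all clear their floors before retrieve.
+ def retrieve_is_safe(candidate: dict,
+                      min_support: float = 0.8,
+                      min_persistence: float = 0.7,
+                      min_corridor: float = 0.5) -> bool:
+     return (candidate["support"] >= min_support
+             and candidate["persistence"] >= min_persistence
+             and candidate["corridor"] >= min_corridor)
+ ```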
+
+ These tests are enough for this sprint. Do not expand the suite unless the mandatory implementation is already done.
+
+ ## Benchmark runs to perform in this sprint
+
+ ### 1. Baselines
+
+ Run on the fixed 300-episode benchmark:
+
+ - random candidate selection
+ - candidate-0 selection
+ - scripted teacher
+ - oracle planner
+
+ These establish the floor, the structured upper bound for the current proposal set, and whether the learned planner adds anything.
+
+ ### 2. Base model run
+
+ Run the v7 compact-phase base on the same 300 episodes.
+
+ ### 3. Eval-time architecture knockouts
+
+ Run the same checkpoint with these toggles:
+
+ - no planner
+ - no memory
+ - no task conditioning
+ - no geometry
+ - no camera pose (only if cheap)
+
+ These are not final scientific ablations. They are fast directional probes that tell us what appears to matter right now.
+
+ ### 4. Optional cheap extra run
+
+ Only if time remains:
+
+ - a short warm-start fine-tune with a small hard-negative rebalance
+
+ Treat this only as bonus signal. It is not part of the minimum sprint success condition.
+
+ ## Decision rubric for interpreting the results
+
+ At the end of the sprint, do **not** only report the raw numbers. Convert the results into a bottleneck map.
+
+ ### Signals that the architecture is directionally healthy
+
+ - Base clearly beats random and candidate-0 on all three task families.
+ - Oracle planner beats random and candidate-0 by a wide margin, showing the proposal/planner structure is at least usable.
+ - Disabling planner hurts most on premature-retrieve and task-specific stress slices.
+ - Disabling memory hurts most on `high_reocclusion` and long-persistence cloth cases.
+ - Disabling geometry hurts most on `camera_perturbation`.
+ - Task conditioning matters on the mixed-task benchmark and under misleading text.
+
+ ### Signals that a component is currently unproven
+
+ - Base is only slightly above candidate-0 or random.
+ - Oracle planner is weak, which means the proposal set or planner utility is not yet reliable.
+ - Memory removal is almost flat on `high_reocclusion`.
+ - Geometry removal is almost flat on `camera_perturbation`.
+ - Planner removal is flat, which suggests that the learned scores or the candidate shortlist are not carrying useful structure.
+
+ ### Task-specific interpretation
+
+ For foliage, judge transfer-readiness mainly from:
+
+ - corridor availability
+ - actor-feasibility floor before retrieve
+ - collateral motion / damage proxy
+ - robustness under `high_reocclusion` and `tight_corridor_high_collateral`
+
+ For bag, judge transfer-readiness mainly from:
+
+ - mouth aperture
+ - hold persistence
+ - rim slip rate
+ - robustness under `one_sided_slip`
+
+ For cloth, judge transfer-readiness mainly from:
+
+ - fold preservation
+ - layer separation quality
+ - top-layer stability
+ - lift-too-high rate
+ - robustness under `fold_sensitive_long_persistence`
+
+ At the end of the sprint, label each task family as:
+
+ - **Promising**: base beats weak baselines clearly and the expected architectural components matter on the right stress slices.
+ - **Uncertain**: base is somewhat above baselines but at least one critical stress slice or component dependency is weak.
+ - **Weak**: base is near trivial baselines or the critical stress slices fail badly enough that the current structure is not yet convincing.
+
+ ## Practical GPU/runtime guidance
+
+ This sprint assumes a single 96 GB workstation GPU. That is enough for the current CLIP-based compact-phase line and repeated proxy evaluations, but the time budget does not justify broad parallel experiments.
+
+ Use the following operating rules:
+
+ - run experiments sequentially
+ - prefer one strong base run over many partial runs
+ - use mixed precision if already supported
+ - keep evaluation batch sizes modest and stable
+ - avoid large retraining loops or many seeds
+ - reuse the same fixed benchmark episodes for every comparison
+
+ The main bottleneck in this sprint should be engineering and interpretation, not raw VRAM.
+
+ ## Things not to do in this sprint
+
+ Do not switch backbones. Do not integrate an external simulator. Do not rescue the spatial rollout branch. Do not run broad hyperparameter sweeps. Do not attempt a five-seed retraining ablation matrix. Do not use RLBench averages as the main argument for or against architecture changes meant for foliage, bag, or folded cloth.
+
+ ## Minimal deliverables at the 10-hour mark
+
+ At the end of the sprint, the repo or artifact directory should contain:
+
+ 1. the new `v7` compact-phase base config
+ 2. the fixed 300-episode stratified benchmark definition or metadata file
+ 3. updated benchmark runner with stress-slice reporting
+ 4. random, candidate-0, scripted, and oracle evaluation runners
+ 5. the required unit and integration tests listed above
+ 6. one complete result table comparing:
+    - random
+    - candidate-0
+    - scripted teacher
+    - oracle planner
+    - base model
+    - no planner
+    - no memory
+    - no task conditioning
+    - no geometry
+    - optional no camera pose
+ 7. one short decision memo that states:
+    - approximate transfer-readiness for foliage, bag, and cloth
+    - which architectural components look most important
+    - which current weakness is most likely to block real-task success
+    - what should be strengthened next
+
+ ## Success condition for this sprint
+
+ This sprint is successful if, by the end of 10 hours, we can say something like the following and defend it with benchmark evidence:
+
+ - “The structured architecture appears meaningfully better than trivial baselines on foliage and bag proxies, but cloth remains fragile because fold preservation degrades under long persistence.”
+ - or “The planner structure looks sound under oracle candidates, but the learned state estimate is still too weak, so the next work should target perception/memory rather than planner redesign.”
+ - or “Geometry and task conditioning matter, but memory does not yet move the reocclusion slice, so the current memory story is still unproven.”
+
+ If we can make claims of that form with actual run outputs, the sprint has done its job.
history/VLAarchtests_previous_README.md ADDED
@@ -0,0 +1,172 @@
+ ---
+ tags:
+ - robotics
+ - vision-language-action
+ - bimanual-manipulation
+ - rlbench
+ - rgbd
+ ---
+
+ # VLAarchtests
+
+ Bundle uploaded from `/workspace` runpod sessions dated `2026-03-25 UTC` and `2026-03-26 UTC`.
+
+ ## Top-Level Contents
+
+ - `code/reveal_vla_bimanual/`
+   - project code used for the proxy and RLBench runs in this bundle
+ - `artifacts/data/reveal_proxy/`
+   - proxy dataset bundles used by the handoff runs
+ - `artifacts/outputs/r3d/`
+   - previously uploaded R3D proxy outputs already present in the bundle
+ - `artifacts/outputs/r3d_handoff/`
+   - handoff proxy checkpoints
+ - `artifacts/outputs/r3d_handoff_phase/`
+   - phase-supervised handoff proxy checkpoints
+ - `artifacts/outputs/rlbench_current/`
+   - RLBench checkpoints from the current session
+ - `artifacts/reports/`
+   - proxy and RLBench result files copied from `/workspace/reports`
+ - `environment/`
+   - same-machine setup files and validation helpers
+ - `tests/`
+   - local test suite
+ - `handoff/instructions.md`
+   - instruction file used for the handoff work
+ - `MODEL_INDEX.md`
+   - checkpoint and result index
+ - `results/session_results_20260326.md`
+   - raw result tables for the `2026-03-25/26` work
+
+ ## Code Added Or Updated
+
+ ### Core model, memory, planner, and dataset paths
+
+ - `code/reveal_vla_bimanual/models/backbones.py`
+ - `code/reveal_vla_bimanual/models/multiview_fusion.py`
+ - `code/reveal_vla_bimanual/models/observation_memory.py`
+ - `code/reveal_vla_bimanual/models/reveal_head.py`
+ - `code/reveal_vla_bimanual/models/world_model.py`
+ - `code/reveal_vla_bimanual/models/action_decoder.py`
+ - `code/reveal_vla_bimanual/models/planner.py`
+ - `code/reveal_vla_bimanual/models/policy.py`
+ - `code/reveal_vla_bimanual/train/losses.py`
+ - `code/reveal_vla_bimanual/sim_reveal/dataset.py`
+ - `code/reveal_vla_bimanual/sim_reveal/procedural_envs.py`
+ - `code/reveal_vla_bimanual/sim_rlbench/dataset.py`
+
+ ### Training and evaluation paths
+
+ - `code/reveal_vla_bimanual/train/run_rlbench_experiment.py`
+ - `code/reveal_vla_bimanual/eval/run_reveal_benchmark.py`
+ - `code/reveal_vla_bimanual/eval/run_ablations.py`
+ - `code/reveal_vla_bimanual/eval/run_teacher_audit.py`
+ - `code/reveal_vla_bimanual/eval/run_rlbench_rollout_eval.py`
+ - `code/reveal_vla_bimanual/eval/run_rlbench_knn_eval.py`
+
+ ### Added or updated training configs
+
+ - `code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact.yaml`
+ - `code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial.yaml`
+ - `code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_compact_phase.yaml`
+ - `code/reveal_vla_bimanual/train/configs/proxy_interaction_r3d_stage3_clip_rgbd_handoff_spatial_phase.yaml`
+ - `code/reveal_vla_bimanual/train/configs/rlbench_subset3_backbone_only_clip_current_valid9.yaml`
+ - `code/reveal_vla_bimanual/train/configs/rlbench_subset3_backbone_only_clip_current_common23.yaml`
+ - `code/reveal_vla_bimanual/train/configs/rlbench_lift_ball_backbone_only_clip_current_wide.yaml`
+ - `code/reveal_vla_bimanual/train/configs/rlbench_lift_ball_backbone_only_clip_step1.yaml`
+ - `code/reveal_vla_bimanual/train/configs/rlbench_push_box_backbone_only_clip_step1.yaml`
+
+ ### Test files
+
+ The staged `tests/` directory contains `32` test modules plus `conftest.py`, including:
+
+ - geometry and camera rotation coverage
+ - phase-label and candidate-ranking coverage
+ - planner gradient-flow and reocclusion gating coverage
+ - world-model null-rollout, field-consistency, and task-adapter coverage
+ - proxy scripted benchmark and teacher-audit coverage
+
+ ## Verification
+
+ - local test command:
+   - `PYTHONPATH=/workspace/VLAarchtests_work/code/reveal_vla_bimanual python -m pytest -q /workspace/VLAarchtests_work/tests`
+ - result:
+   - `33 passed`
+
+ ## Raw Result Files
+
+ ### Proxy and handoff results
+
+ - `artifacts/reports/reveal_smoke_mod/reveal_benchmark.json`
+ - `artifacts/reports/reveal_smoke_nogeom/reveal_benchmark.json`
+ - `artifacts/reports/reveal_smoke_noplanner/reveal_benchmark.json`
+ - `artifacts/reports/reveal_handoff_compare_serious/reveal_benchmark.json`
+ - `artifacts/reports/reveal_handoff_compare_serious_compact/reveal_benchmark.json`
+ - `artifacts/reports/reveal_phase_compare_serious_compact/reveal_benchmark.json`
+ - `artifacts/reports/reveal_phase_compare_serious_spatial_compactwm/reveal_benchmark.json`
+ - `artifacts/reports/reveal_phase_ablations_compact/ablations.json`
+ - `artifacts/reports/reveal_teacher_audit_serious/teacher_audit.json`
+
+ ### RLBench result files
+
+ - `artifacts/reports/rlbench_dual_buttons_baseline_len100_ep1_ik_rescale/rollout_eval.json`
+ - `artifacts/reports/rlbench_dual_buttons_common23_len100_ep1_ik_rescale/rollout_eval.json`
+ - `artifacts/reports/rlbench_push_box_common23_len100_ep1_ik_rescale/rollout_eval.json`
+ - `artifacts/reports/rlbench_lift_ball_wide_len160_ep1_ik_c1/rollout_eval.json`
+ - `artifacts/reports/rlbench_push_box_step1_ep1_ik_c1/rollout_eval.json`
+ - `artifacts/reports/rlbench_push_box_step1_ep1_ik_c1_s005/rollout_eval.json`
+ - `artifacts/reports/rlbench_push_box_knn_step1_ep1/rollout_eval.json`
+ - `artifacts/reports/rlbench_push_box_knn_step1_ep5/rollout_eval.json`
+ - `artifacts/reports/rlbench_push_box_knn_step1_ep5_top1_dense/rollout_eval.json`
+
+ ## Raw Result Tables
+
+ ### Proxy serious runs
+
+ | Artifact | File | Raw values |
+ | --- | --- | --- |
+ | spatial handoff vs released baseline | `artifacts/reports/reveal_handoff_compare_serious/reveal_benchmark.json` | baseline mean success `0.5833`, handoff mean success `0.2167` |
+ | spatial-trained checkpoint with compact world model vs released baseline | `artifacts/reports/reveal_handoff_compare_serious_compact/reveal_benchmark.json` | baseline mean success `0.5833`, handoff mean success `0.5200` |
+ | compact-phase vs released baseline | `artifacts/reports/reveal_phase_compare_serious_compact/reveal_benchmark.json` | baseline mean success `0.5833`, compact-phase mean success `0.5133` |
+ | spatial-phase with compact world model vs released baseline | `artifacts/reports/reveal_phase_compare_serious_spatial_compactwm/reveal_benchmark.json` | baseline mean success `0.5833`, spatial-phase compact-world-model mean success `0.4933` |
+
+ ### Proxy ablations
+
+ | Artifact | File | Raw values |
+ | --- | --- | --- |
+ | compact-phase ablations | `artifacts/reports/reveal_phase_ablations_compact/ablations.json` | full `0.5133`, `no_geometry` `0.5133`, `no_spatial_memory` `0.4967`, `compact_world_model` `0.5133`, `no_planner` `0.4333`, `gaussian_candidates_only` `0.4667`, `no_task_head` `0.5133`, `no_support_mode_conditioning` `0.5133` |
+
+ ### RLBench direct-policy runs
+
+ | Artifact | File | Raw values |
+ | --- | --- | --- |
+ | lift-ball wide checkpoint, one-step replanning | `artifacts/reports/rlbench_lift_ball_wide_len160_ep1_ik_c1/rollout_eval.json` | mean success `0.0`, mean return `0.0`, path recoveries `[148]`, noop fallbacks `[11]` |
+ | push-box step-1 checkpoint, one-step replanning | `artifacts/reports/rlbench_push_box_step1_ep1_ik_c1/rollout_eval.json` | mean success `0.0`, mean return `0.0`, path recoveries `[177]`, noop fallbacks `[0]` |
+ | push-box step-1 checkpoint, one-step replanning, `delta_scale=0.05` | `artifacts/reports/rlbench_push_box_step1_ep1_ik_c1_s005/rollout_eval.json` | mean success `0.0`, mean return `0.0`, path recoveries `[180]`, noop fallbacks `[0]` |
+ | push-box step-1 checkpoint, one-step replanning, `delta_scale=0.05` | `artifacts/reports/rlbench_push_box_step1_ep1_ik_c1_s005/rollout_eval.json` | mean success `0.0`, mean return `0.0`, path recoveries `[180]`, noop fallbacks `[0]` |
146
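+ All three direct-policy rows share one rollout shape: replan at every step, scale the predicted end-effector delta by `delta_scale`, and fall back to a noop action when the motion planner rejects the target. A minimal sketch of such a loop is below; `IKError`, `DummyEnv`, and the `policy` callable are illustrative stand-ins (not the repo's API), and the exact semantics of the `path recoveries` / `noop fallbacks` counters in the tables are an assumption.
+
+ ```python
+ import numpy as np
+
+ class IKError(Exception):
+     """Stand-in for a motion-planning/IK failure."""
+
+ def rollout(policy, env, horizon=180, delta_scale=0.05):
+     """One-step replanning with noop fallback; returns (success, recoveries, noops)."""
+     recoveries = noops = 0
+     obs = env.reset()
+     for _ in range(horizon):
+         delta = delta_scale * policy(obs)  # re-predict from the latest observation
+         try:
+             obs, success = env.step(delta)
+         except IKError:
+             recoveries += 1  # target rejected: count a recovery ...
+             try:
+                 obs, success = env.step(np.zeros_like(delta))  # ... and try a noop
+             except IKError:
+                 noops += 1
+                 continue
+         if success:
+             return 1.0, recoveries, noops
+     return 0.0, recoveries, noops
+
+ # Toy check: a dummy env that fails IK every 7th step and succeeds at t >= 20.
+ class DummyEnv:
+     def reset(self):
+         self.t = 0
+         return np.zeros(3)
+     def step(self, delta):
+         self.t += 1
+         if self.t % 7 == 0:
+             raise IKError
+         return np.zeros(3), self.t >= 20
+
+ print(rollout(lambda obs: np.ones(3), DummyEnv()))
+ ```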
+
+ ### RLBench retrieval runs
+
+ | Artifact | File | Raw values |
+ | --- | --- | --- |
+ | push-box kNN, `bank_stride=4`, `top_k=5`, `time_window=8`, `episodes=1` | `artifacts/reports/rlbench_push_box_knn_step1_ep1/rollout_eval.json` | mean success `1.0`, mean return `1.0`, bank size `2815` |
+ | push-box kNN, `bank_stride=4`, `top_k=5`, `time_window=8`, `episodes=5` | `artifacts/reports/rlbench_push_box_knn_step1_ep5/rollout_eval.json` | successes `[0.0, 1.0, 0.0, 0.0, 0.0]`, mean success `0.2`, bank size `2815` |
+ | push-box kNN, `bank_stride=1`, `top_k=1`, `time_window=4`, `episodes=5` | `artifacts/reports/rlbench_push_box_knn_step1_ep5_top1_dense/rollout_eval.json` | successes `[0.0, 0.0, 1.0, 1.0, 0.0]`, mean success `0.4`, bank size `11259` |
+
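+ The retrieval rows vary three knobs: `bank_stride` (how densely demo features are subsampled into the bank, hence bank sizes `2815` vs `11259`), `top_k` (how many neighbours vote), and `time_window` (how far a matched timestep may sit from the current one). A minimal sketch of that kind of nearest-neighbour action lookup follows; the function and variable names are illustrative, not the repo's API.
+
+ ```python
+ import numpy as np
+
+ def build_bank(demo_feats, demo_actions, bank_stride=4):
+     """Subsample (feature, action, timestep) triples from demos into a flat bank."""
+     idx = np.arange(0, len(demo_feats), bank_stride)
+     return demo_feats[idx], demo_actions[idx], idx
+
+ def knn_action(query, bank_feats, bank_actions, bank_times, t, top_k=5, time_window=8):
+     """Average the actions of the top-k cosine matches within +/- time_window of t."""
+     mask = np.abs(bank_times - t) <= time_window
+     feats, actions = bank_feats[mask], bank_actions[mask]
+     sims = feats @ query
+     sims /= np.linalg.norm(feats, axis=1) * np.linalg.norm(query) + 1e-8
+     best = np.argsort(-sims)[:top_k]
+     return actions[best].mean(axis=0)
+
+ # Toy usage: 100 demo steps, 16-d features, 3-d actions.
+ rng = np.random.default_rng(0)
+ feats, acts = rng.normal(size=(100, 16)), rng.normal(size=(100, 3))
+ bank_f, bank_a, bank_t = build_bank(feats, acts, bank_stride=4)
+ print(knn_action(feats[10], bank_f, bank_a, bank_t, t=10))  # a 3-d action
+ ```
+
+ With `top_k=1` and a stride-1 bank the lookup replays the single closest demo step, which is how the `top1_dense` row above is configured.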
+ ## Environment Recreation Files
+
+ - `environment/setup_same_machine.sh`
+ - `environment/validate_same_machine.sh`
+ - `environment/run_peract2_13_rollouts.sh`
+ - `environment/runtime_env_vars.sh`
+ - `environment/hardware_snapshot.txt`
+ - `environment/glxinfo_B.txt`
+ - `environment/upstream_revisions.txt`
+ - `environment/system_packages_same_machine.txt`
+ - `environment/rlbench_env_export.yaml`
+ - `environment/rlbench_env_explicit.txt`
+ - `environment/rlbench_pip_freeze.txt`
+ - `environment/reveal_env_export.yaml`
+ - `environment/reveal_env_explicit.txt`
+ - `environment/reveal_pip_freeze.txt`
+
+ Detailed raw tables for the `2026-03-25/26` work are in `results/session_results_20260326.md`.
metadata/source_sizes.txt ADDED
@@ -0,0 +1,4 @@
+ 13G /workspace/VLAarchtests
+ 2.2G /workspace/third_party/AnyBimanual
+ 54G /workspace/baselines
+ 219M /workspace/reports
metadata/staged_size.txt ADDED
@@ -0,0 +1 @@
+ 69G /workspace/hf_publish/VLAarchtests2
metadata/staged_tree_top2.txt ADDED
@@ -0,0 +1,64 @@
+ /workspace/hf_publish/VLAarchtests2/CHANGE_AND_TEST_LOG.md
+ /workspace/hf_publish/VLAarchtests2/MODEL_AND_ARTIFACT_INDEX.md
+ /workspace/hf_publish/VLAarchtests2/README.md
+ /workspace/hf_publish/VLAarchtests2/RESULTS_RAW.md
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/.cache
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/.gitattributes
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/.gitignore
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/.pytest_cache
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/FILE_MANIFEST.txt
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/MODEL_INDEX.md
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/README.md
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/artifacts
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/code
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/results
+ /workspace/hf_publish/VLAarchtests2/VLAarchtests/tests
+ /workspace/hf_publish/VLAarchtests2/baselines
+ /workspace/hf_publish/VLAarchtests2/baselines/AnyBimanual
+ /workspace/hf_publish/VLAarchtests2/baselines/AnyBimanual_dummy_demo_root
+ /workspace/hf_publish/VLAarchtests2/baselines/AnyBimanual_evalroot
+ /workspace/hf_publish/VLAarchtests2/baselines/AnyBimanual_overlap_replay
+ /workspace/hf_publish/VLAarchtests2/baselines/AnyBimanual_overlap_runs
+ /workspace/hf_publish/VLAarchtests2/baselines/AnyBimanual_subset3_demo_root
+ /workspace/hf_publish/VLAarchtests2/environment
+ /workspace/hf_publish/VLAarchtests2/environment/base_pip_freeze.txt
+ /workspace/hf_publish/VLAarchtests2/environment/base_python.txt
+ /workspace/hf_publish/VLAarchtests2/environment/env_list.txt
+ /workspace/hf_publish/VLAarchtests2/environment/hardware_snapshot.txt
+ /workspace/hf_publish/VLAarchtests2/environment/nvidia_smi.txt
+ /workspace/hf_publish/VLAarchtests2/environment/rlbench_pip_freeze.txt
+ /workspace/hf_publish/VLAarchtests2/environment/rlbench_python.txt
+ /workspace/hf_publish/VLAarchtests2/environment/runtime_env_vars.sh
+ /workspace/hf_publish/VLAarchtests2/environment/setup_same_hardware.sh
+ /workspace/hf_publish/VLAarchtests2/environment/uname.txt
+ /workspace/hf_publish/VLAarchtests2/handoff
+ /workspace/hf_publish/VLAarchtests2/handoff/instructions4.md
+ /workspace/hf_publish/VLAarchtests2/history
+ /workspace/hf_publish/VLAarchtests2/history/VLAarchtests_previous_README.md
+ /workspace/hf_publish/VLAarchtests2/metadata
+ /workspace/hf_publish/VLAarchtests2/metadata/source_sizes.txt
+ /workspace/hf_publish/VLAarchtests2/metadata/staged_size.txt
+ /workspace/hf_publish/VLAarchtests2/metadata/staged_tree_top2.txt
+ /workspace/hf_publish/VLAarchtests2/reports
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_resume1000_chain.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_resume1000_eval.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_resume1000_eval_watcher.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_resume1000_train.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_fixpretrain_nowandb2_train.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_fixpretrain_nowandb3_eval.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_fixpretrain_nowandb3_train.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_fixpretrain_nowandb3_train_presavefix.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_fixpretrain_nowandb_train.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_fixpretrain_train.log
+ /workspace/hf_publish/VLAarchtests2/reports/anybimanual_subset3_overlap_smoke200_train.log
+ /workspace/hf_publish/VLAarchtests2/reports/peract2_13_launch_smoke_live
+ /workspace/hf_publish/VLAarchtests2/reports/rlbench_common23_exec_calib
+ /workspace/hf_publish/VLAarchtests2/reports/rlbench_common23_exec_calib_iso
+ /workspace/hf_publish/VLAarchtests2/reports/rlbench_debug_common23_pushbox_ep1
+ /workspace/hf_publish/VLAarchtests2/reports/rlbench_general_debug
+ /workspace/hf_publish/VLAarchtests2/reports/rlbench_subset3_common23_live_ep1
+ /workspace/hf_publish/VLAarchtests2/reports/run_bag_selector_iter9_prebuild.log
+ /workspace/hf_publish/VLAarchtests2/reports/true_baseline_compare_subset3_v1
+ /workspace/hf_publish/VLAarchtests2/third_party
+ /workspace/hf_publish/VLAarchtests2/third_party/AnyBimanual
third_party/AnyBimanual/agents/__init__.py ADDED
File without changes
third_party/AnyBimanual/agents/agent_factory.py ADDED
@@ -0,0 +1,101 @@
+ import os
+ import logging
+
+ from omegaconf import DictConfig
+
+ from yarr.agents.agent import BimanualAgent
+ from yarr.agents.agent import LeaderFollowerAgent
+ from yarr.agents.agent import Agent
+
+
+ supported_agents = {"leader_follower": ("PERACT_BC", "RVT"),
+                     "independent": ("PERACT_BC", "RVT"),
+                     "bimanual": ("BIMANUAL_PERACT", "ACT_BC_LANG"),
+                     "unimanual": ()}
+
+
+ def create_agent(cfg: DictConfig) -> Agent:
+     method_name = cfg.method.name
+     agent_type = cfg.method.agent_type
+
+     logging.info("Using method %s with type %s", method_name, agent_type)
+
+     assert method_name in supported_agents[agent_type]
+
+     agent_fn = agent_fn_by_name(method_name)
+
+     if agent_type == "leader_follower":
+         checkpoint_name_prefix = cfg.framework.checkpoint_name_prefix
+         cfg.method.robot_name = "right"
+         cfg.framework.checkpoint_name_prefix = f"{checkpoint_name_prefix}_{method_name.lower()}_leader"
+         leader_agent = agent_fn(cfg)
+
+         cfg.method.robot_name = "left"
+         cfg.framework.checkpoint_name_prefix = f"{checkpoint_name_prefix}_{method_name.lower()}_follower"
+         cfg.method.low_dim_size = cfg.method.low_dim_size + 8  # also add the action size
+         follower_agent = agent_fn(cfg)
+
+         cfg.method.robot_name = "bimanual"
+
+         return LeaderFollowerAgent(leader_agent, follower_agent)
+
+     elif agent_type == "independent":
+         checkpoint_name_prefix = cfg.framework.checkpoint_name_prefix
+         cfg.method.robot_name = "right"
+         cfg.framework.checkpoint_name_prefix = f"{checkpoint_name_prefix}_{method_name.lower()}_right"
+         right_agent = agent_fn(cfg)
+
+         cfg.method.robot_name = "left"
+         cfg.framework.checkpoint_name_prefix = f"{checkpoint_name_prefix}_{method_name.lower()}_left"
+         left_agent = agent_fn(cfg)
+
+         cfg.method.robot_name = "bimanual"
+
+         return BimanualAgent(right_agent, left_agent)
+
+     elif agent_type == "bimanual" or agent_type == "unimanual":
+         return agent_fn(cfg)
+     else:
+         raise Exception("invalid agent type")
+
+
+ def agent_fn_by_name(method_name: str) -> Agent:
+     if method_name == "ARM":
+         from agents import arm
+
+         raise NotImplementedError("ARM not yet supported for eval.py")
+     elif method_name == "BC_LANG":
+         from agents.baselines import bc_lang
+
+         return bc_lang.launch_utils.create_agent
+     elif method_name == "VIT_BC_LANG":
+         from agents.baselines import vit_bc_lang
+
+         return vit_bc_lang.launch_utils.create_agent
+     elif method_name == "C2FARM_LINGUNET_BC":
+         from agents import c2farm_lingunet_bc
+
+         return c2farm_lingunet_bc.launch_utils.create_agent
+     elif method_name.startswith("PERACT_BC"):
+         from agents import peract_bc
+
+         return peract_bc.launch_utils.create_agent
+     elif method_name.startswith("BIMANUAL_PERACT"):
+         from agents import peract_bimanual
+
+         return peract_bimanual.launch_utils.create_agent
+     elif method_name.startswith("RVT"):
+         from agents import rvt
+
+         return rvt.launch_utils.create_agent
+     elif method_name.startswith("ACT_BC_LANG"):
+         from agents import act_bc_lang
+
+         return act_bc_lang.launch_utils.create_agent
+     elif method_name == "PERACT_RL":
+         raise NotImplementedError("PERACT_RL not yet supported for eval.py")
+     else:
+         raise ValueError("Method %s does not exist." % method_name)
third_party/AnyBimanual/agents/peract_bc/__init__.py ADDED
@@ -0,0 +1 @@
+ import agents.peract_bc.launch_utils
third_party/AnyBimanual/agents/peract_bc/launch_utils.py ADDED
@@ -0,0 +1,128 @@
+ # Adapted from ARM
+ # Source: https://github.com/stepjam/ARM
+ # License: https://github.com/stepjam/ARM/LICENSE
+
+ from helpers.preprocess_agent import PreprocessAgent
+ from agents.peract_bc.perceiver_lang_io import PerceiverVoxelLangEncoder
+ from agents.peract_bc.qattention_peract_bc_agent import QAttentionPerActBCAgent
+ from agents.peract_bc.qattention_stack_agent import QAttentionStackAgent
+ import pickle
+ import torch
+ from agents.peract_bc.skill_manager import SkillManager
+ from agents.peract_bc.visual_aligner import VisualAligner
+ from omegaconf import DictConfig
+ import os
+
+
+ def create_agent(cfg: DictConfig):
+     LATENT_SIZE = 64
+     depth_0bounds = cfg.rlbench.scene_bounds
+     cam_resolution = cfg.rlbench.camera_resolution
+
+     num_rotation_classes = int(360.0 // cfg.method.rotation_resolution)
+     qattention_agents = []
+
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+     pkl_path = os.path.join(current_dir, "../../lang_token.pkl")
+     pkl_path = os.path.abspath(pkl_path)
+     with open(pkl_path, "rb") as f:
+         embeddings_dict = pickle.load(f)
+     flattened_embeddings = []
+     for key in embeddings_dict.keys():
+         embedding = torch.tensor(embeddings_dict[key])
+         flattened_embedding = embedding.view(-1)
+         flattened_embeddings.append(flattened_embedding)
+     embeddings_matrix = torch.stack(flattened_embeddings)
+     # The released AnyBimanual checkpoints were trained with a wider skill-manager
+     # hidden size than the public repo currently hardcodes. Keep the original
+     # default for non-AnyBimanual runs, but use the released width when loading
+     # the AnyBimanual checkpoint family.
+     skill_manager_hidden_size = int(
+         getattr(cfg.framework, "skill_manager_hidden_size", 256 if cfg.framework.anybimanual else 128)
+     )
+     skill_manager = SkillManager(
+         num_classes=18,
+         embedding_matrix=embeddings_matrix,
+         hidden_size=skill_manager_hidden_size,
+     )
+     visual_aligner = VisualAligner()
+
+     for depth, vox_size in enumerate(cfg.method.voxel_sizes):
+         last = depth == len(cfg.method.voxel_sizes) - 1
+         perceiver_encoder = PerceiverVoxelLangEncoder(
+             depth=cfg.method.transformer_depth,
+             iterations=cfg.method.transformer_iterations,
+             voxel_size=vox_size,
+             initial_dim=3 + 3 + 1 + 3,
+             low_dim_size=cfg.method.low_dim_size,
+             layer=depth,
+             num_rotation_classes=num_rotation_classes if last else 0,
+             num_grip_classes=2 if last else 0,
+             num_collision_classes=2 if last else 0,
+             input_axis=3,
+             num_latents=cfg.method.num_latents,
+             latent_dim=cfg.method.latent_dim,
+             cross_heads=cfg.method.cross_heads,
+             latent_heads=cfg.method.latent_heads,
+             cross_dim_head=cfg.method.cross_dim_head,
+             latent_dim_head=cfg.method.latent_dim_head,
+             weight_tie_layers=False,
+             activation=cfg.method.activation,
+             pos_encoding_with_lang=cfg.method.pos_encoding_with_lang,
+             input_dropout=cfg.method.input_dropout,
+             attn_dropout=cfg.method.attn_dropout,
+             decoder_dropout=cfg.method.decoder_dropout,
+             lang_fusion_type=cfg.method.lang_fusion_type,
+             voxel_patch_size=cfg.method.voxel_patch_size,
+             voxel_patch_stride=cfg.method.voxel_patch_stride,
+             no_skip_connection=cfg.method.no_skip_connection,
+             no_perceiver=cfg.method.no_perceiver,
+             no_language=cfg.method.no_language,
+             final_dim=cfg.method.final_dim,
+             anybimanual=cfg.framework.anybimanual,
+             skill_manager=skill_manager,
+             visual_aligner=visual_aligner,
+         )
+
+         qattention_agent = QAttentionPerActBCAgent(
+             layer=depth,
+             coordinate_bounds=depth_0bounds,
+             perceiver_encoder=perceiver_encoder,
+             camera_names=cfg.rlbench.cameras,
+             voxel_size=vox_size,
+             bounds_offset=cfg.method.bounds_offset[depth - 1] if depth > 0 else None,
+             image_crop_size=cfg.method.image_crop_size,
+             lr=cfg.method.lr,
+             training_iterations=cfg.framework.training_iterations,
+             lr_scheduler=cfg.method.lr_scheduler,
+             num_warmup_steps=cfg.method.num_warmup_steps,
+             trans_loss_weight=cfg.method.trans_loss_weight,
+             rot_loss_weight=cfg.method.rot_loss_weight,
+             grip_loss_weight=cfg.method.grip_loss_weight,
+             collision_loss_weight=cfg.method.collision_loss_weight,
+             include_low_dim_state=True,
+             image_resolution=cam_resolution,
+             batch_size=cfg.replay.batch_size,
+             voxel_feature_size=3,
+             lambda_weight_l2=cfg.method.lambda_weight_l2,
+             num_rotation_classes=num_rotation_classes,
+             rotation_resolution=cfg.method.rotation_resolution,
+             transform_augmentation=cfg.method.transform_augmentation.apply_se3,
+             transform_augmentation_xyz=cfg.method.transform_augmentation.aug_xyz,
+             transform_augmentation_rpy=cfg.method.transform_augmentation.aug_rpy,
+             transform_augmentation_rot_resolution=cfg.method.transform_augmentation.aug_rot_resolution,
+             optimizer_type=cfg.method.optimizer,
+             num_devices=cfg.ddp.num_devices,
+             checkpoint_name_prefix=cfg.framework.checkpoint_name_prefix,
+             anybimanual=cfg.framework.anybimanual,
+         )
+         qattention_agents.append(qattention_agent)
+
+     rotation_agent = QAttentionStackAgent(
+         qattention_agents=qattention_agents,
+         rotation_resolution=cfg.method.rotation_resolution,
+         camera_names=cfg.rlbench.cameras,
+     )
+     preprocess_agent = PreprocessAgent(pose_agent=rotation_agent)
+     return preprocess_agent
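A small aside on the `skill_manager_hidden_size` lookup above: it is a plain optional-key fallback, where an explicit config value wins and the width is otherwise inferred from the `anybimanual` flag. A self-contained illustration of the same pattern (the config dicts here are illustrative; note that `getattr`'s default only fires when missing-key access raises, which is why the sketch enables struct mode, as Hydra-loaded configs have it):

```python
from omegaconf import OmegaConf

def resolve_hidden_size(framework_cfg):
    # Explicit key wins; otherwise 256 for the released AnyBimanual
    # checkpoints and 128 for the original repo default.
    return int(getattr(framework_cfg, "skill_manager_hidden_size",
                       256 if framework_cfg.anybimanual else 128))

for raw in ({"anybimanual": True},
            {"anybimanual": False},
            {"anybimanual": False, "skill_manager_hidden_size": 192}):
    cfg = OmegaConf.create(raw)
    OmegaConf.set_struct(cfg, True)  # missing keys now raise AttributeError
    print(resolve_hidden_size(cfg))  # 256, then 128, then 192
```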
third_party/AnyBimanual/agents/peract_bc/perceiver_lang_io.py ADDED
@@ -0,0 +1,481 @@
+ # Perceiver IO implementation adapted for manipulation
+ # Source: https://github.com/lucidrains/perceiver-pytorch
+ # License: https://github.com/lucidrains/perceiver-pytorch/blob/main/LICENSE
+
+ import torch
+ from torch import nn
+
+ from einops import rearrange
+ from einops import repeat
+ import torch.nn.functional as F
+ from perceiver_pytorch.perceiver_pytorch import cache_fn
+ from perceiver_pytorch.perceiver_pytorch import PreNorm, FeedForward, Attention
+
+ from helpers.network_utils import (
+     DenseBlock,
+     SpatialSoftmax3D,
+     Conv3DBlock,
+     Conv3DUpsampleBlock,
+ )
+
+
+ def symmetric_kl_divergence(left, right):
+     eps = 1e-2
+     left_prob = torch.clamp(F.log_softmax(left, dim=-1), min=-10, max=10)
+     right_prob = torch.clamp(F.log_softmax(right, dim=-1), min=-10, max=10)
+
+     kl_left_to_right = F.kl_div(left_prob, right_prob.exp(), reduction="batchmean") * eps
+     kl_right_to_left = F.kl_div(right_prob, left_prob.exp(), reduction="batchmean") * eps
+
+     symmetric_kl = -(kl_left_to_right + kl_right_to_left) / 2.0
+     return symmetric_kl
+
+
+ def l1_norm(tensor):
+     return torch.sum(torch.abs(tensor)) + 1e-4 * torch.norm(tensor)
+
+
+ def l2_1_norm(tensor):
+     l2_norm_per_skill = torch.norm(tensor, dim=-1)
+     return torch.sum(l2_norm_per_skill)
+
+
+ # PerceiverIO adapted for 6-DoF manipulation
+ class PerceiverVoxelLangEncoder(nn.Module):
+     def __init__(
+         self,
+         depth,                    # number of self-attention layers
+         iterations,               # number of cross-attention iterations (PerceiverIO uses just 1)
+         voxel_size,               # N voxels per side (size: N*N*N)
+         initial_dim,              # 10 dimensions - dimension of the input sequence to be encoded
+         low_dim_size,             # 4 dimensions - proprioception: {gripper_open, left_finger, right_finger, timestep}
+         layer=0,
+         num_rotation_classes=72,  # 5-degree increments (5*72=360) for each of the 3 axes
+         num_grip_classes=2,       # open or not open
+         num_collision_classes=2,  # collisions allowed or not allowed
+         input_axis=3,             # 3D tensors have 3 axes
+         num_latents=512,          # number of latent vectors
+         im_channels=64,           # intermediate channel size
+         latent_dim=512,           # dimensions of latent vectors
+         cross_heads=1,            # number of cross-attention heads
+         latent_heads=8,           # number of latent heads
+         cross_dim_head=64,
+         latent_dim_head=64,
+         activation="relu",
+         weight_tie_layers=False,
+         pos_encoding_with_lang=True,
+         input_dropout=0.1,
+         attn_dropout=0.1,
+         decoder_dropout=0.0,
+         lang_fusion_type="seq",
+         voxel_patch_size=9,
+         voxel_patch_stride=8,
+         no_skip_connection=False,
+         no_perceiver=False,
+         no_language=False,
+         final_dim=64,
+         anybimanual=False,
+         skill_manager=None,
+         visual_aligner=None,
+     ):
+         super().__init__()
+         self.depth = depth
+         self.layer = layer
+         self.init_dim = int(initial_dim)
+         self.iterations = iterations
+         self.input_axis = input_axis
+         self.voxel_size = voxel_size
+         self.low_dim_size = low_dim_size
+         self.im_channels = im_channels
+         self.pos_encoding_with_lang = pos_encoding_with_lang
+         self.lang_fusion_type = lang_fusion_type
+         self.voxel_patch_size = voxel_patch_size
+         self.voxel_patch_stride = voxel_patch_stride
+         self.num_rotation_classes = num_rotation_classes
+         self.num_grip_classes = num_grip_classes
+         self.num_collision_classes = num_collision_classes
+         self.final_dim = final_dim
+         self.input_dropout = input_dropout
+         self.attn_dropout = attn_dropout
+         self.decoder_dropout = decoder_dropout
+         self.no_skip_connection = no_skip_connection
+         self.no_perceiver = no_perceiver
+         self.no_language = no_language
+         self.anybimanual = anybimanual
+         self.skill_manager = skill_manager
+         self.visual_aligner = visual_aligner
+         # patchified input dimensions
+         spatial_size = voxel_size // self.voxel_patch_stride  # 100/5 = 20
+
+         # 64 voxel features + 64 proprio features (+ 64 lang goal features if concatenated)
+         self.input_dim_before_seq = (
+             self.im_channels * 3
+             if self.lang_fusion_type == "concat"
+             else self.im_channels * 2
+         )
+         if self.anybimanual:
+             self.input_dim_before_seq_ = self.input_dim_before_seq * 2
+         else:
+             self.input_dim_before_seq_ = self.input_dim_before_seq
+         # CLIP language feature dimensions
+         if self.anybimanual:
+             lang_feat_dim, lang_emb_dim, lang_max_seq_len = 1024, 512, 154
+         else:
+             lang_feat_dim, lang_emb_dim, lang_max_seq_len = 1024, 512, 77
+
+         self.lang_max_seq_len = lang_max_seq_len
+         # learnable positional encoding
+         if self.pos_encoding_with_lang:
+             self.pos_encoding = nn.Parameter(
+                 torch.randn(
+                     1, lang_max_seq_len + spatial_size**3, self.input_dim_before_seq
+                 )
+             )
+         else:
+             # assert self.lang_fusion_type == 'concat', 'Only concat is supported for pos encoding without lang.'
+             self.pos_encoding = nn.Parameter(
+                 torch.randn(
+                     1,
+                     spatial_size,
+                     spatial_size,
+                     spatial_size,
+                     self.input_dim_before_seq,
+                 )
+             )
+
+         # voxel input preprocessing 1x1 conv encoder
+         self.input_preprocess = Conv3DBlock(
+             self.init_dim,
+             self.im_channels,
+             kernel_sizes=1,
+             strides=1,
+             norm=None,
+             activation=activation,
+         )
+
+         # patchify conv
+         self.patchify = Conv3DBlock(
+             self.input_preprocess.out_channels,
+             self.im_channels,
+             kernel_sizes=self.voxel_patch_size,
+             strides=self.voxel_patch_stride,
+             norm=None,
+             activation=activation,
+         )
+
+         # language preprocess
+         if self.lang_fusion_type == "concat":
+             self.lang_preprocess = nn.Linear(lang_feat_dim, self.im_channels)
+         elif self.lang_fusion_type == "seq":
+             self.lang_preprocess = nn.Linear(lang_emb_dim, self.im_channels * 2)
+
+         # proprioception
+         if self.low_dim_size > 0:
+             self.proprio_preprocess = DenseBlock(
+                 self.low_dim_size,
+                 self.im_channels,
+                 norm=None,
+                 activation=activation,
+             )
+
+         # pooling functions
+         self.local_maxp = nn.MaxPool3d(3, 2, padding=1)
+         self.global_maxp = nn.AdaptiveMaxPool3d(1)
+
+         # 1st 3D softmax
+         self.ss0 = SpatialSoftmax3D(
+             self.voxel_size, self.voxel_size, self.voxel_size, self.im_channels
+         )
+         flat_size = self.im_channels * 4
+
+         # latent vectors (that are randomly initialized)
+         self.latents = nn.Parameter(torch.randn(num_latents, latent_dim))
+
+         # encoder cross attention
+         self.cross_attend_blocks = nn.ModuleList(
+             [
+                 PreNorm(
+                     latent_dim,
+                     Attention(
+                         latent_dim,
+                         self.input_dim_before_seq_,
+                         heads=cross_heads,
+                         dim_head=cross_dim_head,
+                         dropout=input_dropout,
+                     ),
+                     context_dim=self.input_dim_before_seq_,
+                 ),
+                 PreNorm(latent_dim, FeedForward(latent_dim)),
+             ]
+         )
+
+         get_latent_attn = lambda: PreNorm(
+             latent_dim,
+             Attention(
+                 latent_dim,
+                 heads=latent_heads,
+                 dim_head=latent_dim_head,
+                 dropout=attn_dropout,
+             ),
+         )
+         get_latent_ff = lambda: PreNorm(latent_dim, FeedForward(latent_dim))
+         get_latent_attn, get_latent_ff = map(cache_fn, (get_latent_attn, get_latent_ff))
+
+         # self attention layers
+         self.layers = nn.ModuleList([])
+         cache_args = {"_cache": weight_tie_layers}
+
+         for i in range(depth):
+             self.layers.append(
+                 nn.ModuleList(
+                     [get_latent_attn(**cache_args), get_latent_ff(**cache_args)]
+                 )
+             )
+
+         # decoder cross attention
+         self.decoder_cross_attn = PreNorm(
+             self.input_dim_before_seq_,
+             Attention(
+                 self.input_dim_before_seq_,
+                 latent_dim,
+                 heads=cross_heads,
+                 dim_head=cross_dim_head,
+                 dropout=decoder_dropout,
+             ),
+             context_dim=latent_dim,
+         )
+
+         # upsample conv
+         self.up0 = Conv3DUpsampleBlock(
+             self.input_dim_before_seq_,
+             self.final_dim,
+             kernel_sizes=self.voxel_patch_size,
+             strides=self.voxel_patch_stride,
+             norm=None,
+             activation=activation,
+         )
+
+         # 2nd 3D softmax
+         self.ss1 = SpatialSoftmax3D(
+             spatial_size, spatial_size, spatial_size, self.input_dim_before_seq_
+         )
+
+         flat_size += self.input_dim_before_seq_ * 4
+
+         # final 3D softmax
+         self.final = Conv3DBlock(
+             self.im_channels
+             if (self.no_perceiver or self.no_skip_connection)
+             else self.im_channels * 2,
+             self.im_channels,
+             kernel_sizes=3,
+             strides=1,
+             norm=None,
+             activation=activation,
+         )
+
+         self.trans_decoder = Conv3DBlock(
+             self.final_dim,
+             1,
+             kernel_sizes=3,
+             strides=1,
+             norm=None,
+             activation=None,
+         )
+
+         # rotation, gripper, and collision MLP layers
+         if self.num_rotation_classes > 0:
+             self.ss_final = SpatialSoftmax3D(
+                 self.voxel_size, self.voxel_size, self.voxel_size, self.im_channels
+             )
+
+             flat_size += self.im_channels * 4
+
+             self.dense0 = DenseBlock(flat_size, 256, None, activation)
+             self.dense1 = DenseBlock(256, self.final_dim, None, activation)
+
+             self.rot_grip_collision_ff = DenseBlock(
+                 self.final_dim,
+                 self.num_rotation_classes * 3
+                 + self.num_grip_classes
+                 + self.num_collision_classes,
+                 None,
+                 None,
+             )
+
+     def encode_text(self, x):
+         with torch.no_grad():
+             text_feat, text_emb = self._clip_rn50.encode_text_with_embeddings(x)
+
+         text_feat = text_feat.detach()
+         text_emb = text_emb.detach()
+         text_mask = torch.where(x == 0, x, 1)  # [1, max_token_len]
+         return text_feat, text_emb
+
+     def forward(
+         self,
+         ins,
+         proprio,
+         lang_goal_emb,
+         lang_token_embs,
+         prev_layer_voxel_grid,
+         bounds,
+         prev_layer_bounds,
+         mask=None,
+         arm=None,
+     ):
+         # preprocess input
+         d0 = self.input_preprocess(ins)  # [B,10,100,100,100] -> [B,64,100,100,100]
+
+         # aggregated features from 1st softmax and maxpool for MLP decoders
+         feats = [self.ss0(d0.contiguous()), self.global_maxp(d0).view(ins.shape[0], -1)]
+
+         # patchify input (5x5x5 patches)
+         ins = self.patchify(d0)  # [B,64,100,100,100] -> [B,64,20,20,20]
+
+         b, c, d, h, w, device = *ins.shape, ins.device
+         axis = [d, h, w]
+         assert (
+             len(axis) == self.input_axis
+         ), "input must have the same number of axes as input_axis"
+
+         # concat proprio
+         if self.low_dim_size > 0:
+             p = self.proprio_preprocess(proprio)  # [B,4] -> [B,64]
+             p = p.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).repeat(1, 1, d, h, w)
+             ins = torch.cat([ins, p], dim=1)  # [B,128,20,20,20]
+
+         # language ablation
+         if self.no_language:
+             lang_goal_emb = torch.zeros_like(lang_goal_emb)
+             lang_token_embs = torch.zeros_like(lang_token_embs)
+
+         # option 1: tile and concat lang goal to input
+         if self.lang_fusion_type == "concat":
+             lang_emb = lang_goal_emb
+             lang_emb = lang_emb.to(dtype=ins.dtype)
+             l = self.lang_preprocess(lang_emb)
+             l = l.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).repeat(1, 1, d, h, w)
+             ins = torch.cat([ins, l], dim=1)
+
+         # channel last
+         ins = rearrange(ins, "b d ... -> b ... d")  # [B,20,20,20,128]
+
+         # add pos encoding to grid
+         if not self.pos_encoding_with_lang:
+             ins = ins + self.pos_encoding
+
+         ######################## NOTE #############################
+         # NOTE: If you add positional encodings ^here the lang embs
+         # won't have positional encodings. I accidentally forgot
+         # to turn this off for all the experiments in the paper.
+         # So I guess those models were using language embs
+         # as a bag of words :( But it doesn't matter much for
+         # RLBench tasks since we don't test for novel instructions
+         # at test time anyway. The recommended way is to add
+         # positional encodings to the final input sequence
+         # fed into the Perceiver Transformer, as done below
+         # (and also in the Colab tutorial).
+         ###########################################################
+
+         # remember the pre-flatten shape of the query grid
+         queries_orig_shape = ins.shape
+
+         # rearrange input to be channel last
+         ins = rearrange(ins, "b ... d -> b (...) d")  # [B,8000,128]
+         ins_wo_prev_layers = ins
+         # option 2: add lang token embs as a sequence
+         if self.anybimanual:
+             # decompose the shared voxel features into per-arm masks and skill tokens
+             l = self.lang_preprocess(lang_token_embs)  # [B,77,512] -> [B,77,128]
+             mask_right, mask_left = self.visual_aligner(ins)
+             L_voxel = symmetric_kl_divergence(mask_left, mask_right)
+             right_skill = self.skill_manager(mask_right, l)
+             left_skill = self.skill_manager(mask_left, l)
+             right_skill = self.lang_preprocess(right_skill)
+             left_skill = self.lang_preprocess(left_skill)
+             L_skill = (
+                 l1_norm(left_skill) + l1_norm(right_skill) +
+                 0.01 * (l2_1_norm(left_skill) + l2_1_norm(right_skill))
+             )
+             l_right = torch.cat((right_skill, l), dim=1)
+             ins_right = torch.cat((l_right, mask_right), dim=1)
+             l_left = torch.cat((left_skill, l), dim=1)
+             ins_left = torch.cat((l_left, mask_left), dim=1)
+             if arm == "right":
+                 skill = right_skill
+                 ins_ = ins_right
+             else:
+                 skill = left_skill
+                 ins_ = ins_left
+             if self.pos_encoding_with_lang:
+                 ins_ = ins_ + self.pos_encoding
+         else:
+             if self.lang_fusion_type == "seq":
+                 l = self.lang_preprocess(lang_token_embs)  # [B,77,1024] -> [B,77,128]
+                 ins = torch.cat((l, ins), dim=1)  # [B,8077,128]
+             # add pos encoding to language + flattened grid (the recommended way)
+             if self.pos_encoding_with_lang:
+                 ins = ins + self.pos_encoding
+
+         if self.anybimanual:
+             skill_l = torch.cat((skill, l), dim=1)
+             ins = torch.cat((skill_l, ins), dim=1)
+             ins = torch.cat((ins_, ins), dim=2)
+         # batchify latents
+         x = repeat(self.latents, "n d -> b n d", b=b)
+
+         cross_attn, cross_ff = self.cross_attend_blocks
+
+         for it in range(self.iterations):
+             # encoder cross attention
+             x = cross_attn(x, context=ins, mask=mask) + x
+             x = cross_ff(x) + x
+
+             # self-attention layers
+             for self_attn, self_ff in self.layers:
+                 x = self_attn(x) + x
+                 x = self_ff(x) + x
+
+         # decoder cross attention
+         latents = self.decoder_cross_attn(ins, context=x)
+         # crop out the language part of the output sequence
+         if self.lang_fusion_type == "seq":
+             latents = latents[:, self.lang_max_seq_len :]
+
+         # reshape back to voxel grid
+         latents = latents.view(
+             b, *queries_orig_shape[1:-1], latents.shape[-1]
+         )  # [B,20,20,20,64]
+         latents = rearrange(latents, "b ... d -> b d ...")  # [B,64,20,20,20]
+
+         # aggregated features from 2nd softmax and maxpool for MLP decoders
+         feats.extend(
+             [self.ss1(latents.contiguous()), self.global_maxp(latents).view(b, -1)]
+         )
+
+         # upsample
+         u0 = self.up0(latents)
+
+         # ablations
+         if self.no_skip_connection:
+             u = self.final(u0)
+         elif self.no_perceiver:
+             u = self.final(d0)
+         else:
+             u = self.final(torch.cat([d0, u0], dim=1))
+
+         # translation decoder
+         trans = self.trans_decoder(u)
+
+         # rotation, gripper, and collision MLPs
+         rot_and_grip_out = None
+         if self.num_rotation_classes > 0:
+             feats.extend(
+                 [self.ss_final(u.contiguous()), self.global_maxp(u).view(b, -1)]
+             )
+
+             dense0 = self.dense0(torch.cat(feats, dim=1))
+             dense1 = self.dense1(dense0)  # [B,72*3+2+2]
+
+             rot_and_grip_collision_out = self.rot_grip_collision_ff(dense1)
+             rot_and_grip_out = rot_and_grip_collision_out[
+                 :, : -self.num_collision_classes
+             ]
+             collision_out = rot_and_grip_collision_out[:, -self.num_collision_classes :]
+
+         return trans, rot_and_grip_out, collision_out
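`symmetric_kl_divergence` above is the voxel-mask regulariser: it averages the two directed KL terms between the clamped log-softmax distributions of the two arm masks, scales by `eps = 1e-2`, and returns the *negated* average, so if the term were added to a minimised loss it would reward divergence between the masks (in this file it is computed in `forward` but not returned). A self-contained check of that behaviour; the function body is copied from the file, and the toy tensors are illustrative:

```python
import torch
import torch.nn.functional as F

def symmetric_kl_divergence(left, right):
    eps = 1e-2
    left_prob = torch.clamp(F.log_softmax(left, dim=-1), min=-10, max=10)
    right_prob = torch.clamp(F.log_softmax(right, dim=-1), min=-10, max=10)
    kl_left_to_right = F.kl_div(left_prob, right_prob.exp(), reduction="batchmean") * eps
    kl_right_to_left = F.kl_div(right_prob, left_prob.exp(), reduction="batchmean") * eps
    return -(kl_left_to_right + kl_right_to_left) / 2.0

torch.manual_seed(0)
a, b = torch.randn(4, 16), torch.randn(4, 16)
print(symmetric_kl_divergence(a, a))  # 0: identical distributions
print(symmetric_kl_divergence(a, b))  # typically negative: larger divergence
                                      # drives the summed loss lower
```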
third_party/AnyBimanual/agents/peract_bc/qattention_peract_bc_agent.py ADDED
@@ -0,0 +1,939 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import os
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torchvision import transforms
11
+ from pytorch3d import transforms as torch3d_tf
12
+ from yarr.agents.agent import (
13
+ Agent,
14
+ ActResult,
15
+ ScalarSummary,
16
+ HistogramSummary,
17
+ ImageSummary,
18
+ Summary,
19
+ )
20
+ import matplotlib.pyplot as plt
21
+ import PIL.Image as Image
22
+ import wandb
23
+ import io
24
+ from termcolor import colored, cprint
25
+ from helpers import utils
26
+ from helpers.utils import visualise_voxel, stack_on_channel
27
+ from voxel.voxel_grid import VoxelGrid
28
+ from einops import rearrange
29
+ from helpers.clip.core.clip import build_model, load_clip
30
+
31
+ import transformers
32
+ from helpers.optim.lamb import Lamb
33
+
34
+ from torch.nn.parallel import DistributedDataParallel as DDP
35
+
36
+
37
+ class QFunction(nn.Module):
38
+ def __init__(
39
+ self,
40
+ perceiver_encoder: nn.Module,
41
+ voxelizer: VoxelGrid,
42
+ bounds_offset: float,
43
+ rotation_resolution: float,
44
+ device,
45
+ training,
46
+ ):
47
+ super(QFunction, self).__init__()
48
+ self._rotation_resolution = rotation_resolution
49
+ self._voxelizer = voxelizer
50
+ self._bounds_offset = bounds_offset
51
+ self._qnet = perceiver_encoder.to(device)
52
+
53
+ # distributed training
54
+ if training:
55
+ self._qnet = DDP(self._qnet, device_ids=[device], find_unused_parameters=True)
56
+
57
+ def _argmax_3d(self, tensor_orig):
58
+ b, c, d, h, w = tensor_orig.shape # c will be one
59
+ idxs = tensor_orig.view(b, c, -1).argmax(-1)
60
+ indices = torch.cat([((idxs // h) // d), (idxs // h) % w, idxs % w], 1)
61
+ return indices
62
+
63
+ def choose_highest_action(self, q_trans, q_rot_grip, q_collision):
64
+ coords = self._argmax_3d(q_trans)
65
+ rot_and_grip_indicies = None
66
+ ignore_collision = None
67
+ if q_rot_grip is not None:
68
+ q_rot = torch.stack(
69
+ torch.split(
70
+ q_rot_grip[:, :-2], int(360 // self._rotation_resolution), dim=1
71
+ ),
72
+ dim=1,
73
+ )
74
+ rot_and_grip_indicies = torch.cat(
75
+ [
76
+ q_rot[:, 0:1].argmax(-1),
77
+ q_rot[:, 1:2].argmax(-1),
78
+ q_rot[:, 2:3].argmax(-1),
79
+ q_rot_grip[:, -2:].argmax(-1, keepdim=True),
80
+ ],
81
+ -1,
82
+ )
83
+ ignore_collision = q_collision[:, -2:].argmax(-1, keepdim=True)
84
+ return coords, rot_and_grip_indicies, ignore_collision
85
+
86
+ def forward(
87
+ self,
88
+ rgb_pcd,
89
+ proprio,
90
+ pcd,
91
+ lang_goal_emb,
92
+ lang_token_embs,
93
+ bounds=None,
94
+ prev_bounds=None,
95
+ prev_layer_voxel_grid=None,
96
+ arm=None,
97
+ ):
98
+ # rgb_pcd will be list of list (list of [rgb, pcd])
99
+ b = rgb_pcd[0][0].shape[0]
100
+ pcd_flat = torch.cat([p.permute(0, 2, 3, 1).reshape(b, -1, 3) for p in pcd], 1)
101
+
102
+ # flatten RGBs and Pointclouds
103
+ rgb = [rp[0] for rp in rgb_pcd]
104
+ feat_size = rgb[0].shape[1]
105
+ flat_imag_features = torch.cat(
106
+ [p.permute(0, 2, 3, 1).reshape(b, -1, feat_size) for p in rgb], 1
107
+ )
108
+
109
+ # construct voxel grid
110
+ voxel_grid = self._voxelizer.coords_to_bounding_voxel_grid(
111
+ pcd_flat, coord_features=flat_imag_features, coord_bounds=bounds
112
+ )
113
+
114
+ # swap to channels fist
115
+ voxel_grid = voxel_grid.permute(0, 4, 1, 2, 3).detach()
116
+
117
+ # batch bounds if necessary
118
+ if bounds.shape[0] != b:
119
+ bounds = bounds.repeat(b, 1)
120
+
121
+ # forward pass
122
+ q_trans, q_rot_and_grip, q_ignore_collisions = self._qnet(
123
+ voxel_grid,
124
+ proprio,
125
+ lang_goal_emb,
126
+ lang_token_embs,
127
+ prev_layer_voxel_grid,
128
+ bounds,
129
+ prev_bounds,
130
+ arm=arm,
131
+ )
132
+
133
+ return q_trans, q_rot_and_grip, q_ignore_collisions, voxel_grid
134
+
135
+
136
+ class QAttentionPerActBCAgent(Agent):
137
+ def __init__(
138
+ self,
139
+ layer: int,
140
+ coordinate_bounds: list,
141
+ perceiver_encoder: nn.Module,
142
+ camera_names: list,
143
+ batch_size: int,
144
+ voxel_size: int,
145
+ bounds_offset: float,
146
+ voxel_feature_size: int,
147
+ image_crop_size: int,
148
+ num_rotation_classes: int,
149
+ rotation_resolution: float,
150
+ lr: float = 0.0001,
151
+ lr_scheduler: bool = False,
152
+ training_iterations: int = 100000,
153
+ num_warmup_steps: int = 20000,
154
+ trans_loss_weight: float = 1.0,
155
+ rot_loss_weight: float = 1.0,
156
+ grip_loss_weight: float = 1.0,
157
+ collision_loss_weight: float = 1.0,
158
+ include_low_dim_state: bool = False,
159
+ image_resolution: list = None,
160
+ lambda_weight_l2: float = 0.0,
161
+ transform_augmentation: bool = True,
162
+ transform_augmentation_xyz: list = [0.0, 0.0, 0.0],
163
+ transform_augmentation_rpy: list = [0.0, 0.0, 180.0],
164
+ transform_augmentation_rot_resolution: int = 5,
165
+ optimizer_type: str = "adam",
166
+ num_devices: int = 1,
167
+ checkpoint_name_prefix=None,
168
+ anybimanual = False,
169
+ cfg=None,
170
+ ):
171
+ self._layer = layer
172
+ self._coordinate_bounds = coordinate_bounds
173
+ self._perceiver_encoder = perceiver_encoder
174
+ self._voxel_feature_size = voxel_feature_size
175
+ self._bounds_offset = bounds_offset
176
+ self._image_crop_size = image_crop_size
177
+ self._lr = lr
178
+ self._lr_scheduler = lr_scheduler
179
+ self._training_iterations = training_iterations
180
+ self._num_warmup_steps = num_warmup_steps
181
+ self._trans_loss_weight = trans_loss_weight
182
+ self._rot_loss_weight = rot_loss_weight
183
+ self._grip_loss_weight = grip_loss_weight
184
+ self._collision_loss_weight = collision_loss_weight
185
+ self._include_low_dim_state = include_low_dim_state
186
+ self._image_resolution = image_resolution or [128, 128]
187
+ self._voxel_size = voxel_size
188
+ self._camera_names = camera_names
189
+ self._num_cameras = len(camera_names)
190
+ self._batch_size = batch_size
191
+ self._lambda_weight_l2 = lambda_weight_l2
192
+ self._transform_augmentation = transform_augmentation
193
+ self._transform_augmentation_xyz = torch.from_numpy(
194
+ np.array(transform_augmentation_xyz)
195
+ )
196
+ self._transform_augmentation_rpy = transform_augmentation_rpy
197
+ self._transform_augmentation_rot_resolution = (
198
+ transform_augmentation_rot_resolution
199
+ )
200
+ self._optimizer_type = optimizer_type
201
+ self._num_devices = num_devices
202
+ self._num_rotation_classes = num_rotation_classes
203
+ self._rotation_resolution = rotation_resolution
204
+
205
+ self._cross_entropy_loss = nn.CrossEntropyLoss(reduction="none")
206
+ checkpoint_name_prefix = checkpoint_name_prefix or "QAttentionAgent"
207
+ self._name = f"{checkpoint_name_prefix}_layer_{self._layer}"
208
+ self.anybimanual = anybimanual
209
+ self.cfg=cfg
210
+
211
+
212
+ def build(self, training: bool, device: torch.device = None):
213
+ self._training = training
214
+
215
+ if device is None:
216
+ device = torch.device("cpu")
217
+
218
+ self._device = device
219
+
220
+ self._voxelizer = VoxelGrid(
221
+ coord_bounds=self._coordinate_bounds,
222
+ voxel_size=self._voxel_size,
223
+ device=device,
224
+ batch_size=self._batch_size if training else 1,
225
+ feature_size=self._voxel_feature_size,
226
+ max_num_coords=np.prod(self._image_resolution) * self._num_cameras,
227
+ )
228
+
229
+ self._q = (
230
+ QFunction(
231
+ self._perceiver_encoder,
232
+ self._voxelizer,
233
+ self._bounds_offset,
234
+ self._rotation_resolution,
235
+ device,
236
+ training,
237
+ )
238
+ .to(device)
239
+ .train(training)
240
+ )
241
+
242
+ grid_for_crop = (
243
+ torch.arange(0, self._image_crop_size, device=device)
244
+ .unsqueeze(0)
245
+ .repeat(self._image_crop_size, 1)
246
+ .unsqueeze(-1)
247
+ )
248
+ self._grid_for_crop = torch.cat(
249
+ [grid_for_crop.transpose(1, 0), grid_for_crop], dim=2
250
+ ).unsqueeze(0)
251
+
252
+ self._coordinate_bounds = torch.tensor(
253
+ self._coordinate_bounds, device=device
254
+ ).unsqueeze(0)
255
+
256
+ if self._training:
257
+ # optimizer
258
+ if self._optimizer_type == "lamb":
259
+ self._optimizer = Lamb(
260
+ self._q.parameters(),
261
+ lr=self._lr,
262
+ weight_decay=self._lambda_weight_l2,
263
+ betas=(0.9, 0.999),
264
+ adam=False,
265
+ )
266
+ elif self._optimizer_type == "adam":
267
+ self._optimizer = torch.optim.Adam(
268
+ self._q.parameters(),
269
+ lr=self._lr,
270
+ weight_decay=self._lambda_weight_l2,
271
+ )
272
+ else:
273
+ raise Exception("Unknown optimizer type")
274
+
275
+ # learning rate scheduler
276
+ if self._lr_scheduler:
277
+ self._scheduler = (
278
+ transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
279
+ self._optimizer,
280
+ num_warmup_steps=self._num_warmup_steps,
281
+ num_training_steps=self._training_iterations,
282
+ num_cycles=self._training_iterations // 10000,
283
+ )
284
+ )
285
+
286
+ # one-hot zero tensors
287
+ self._action_trans_one_hot_zeros = torch.zeros(
288
+ (
289
+ self._batch_size,
290
+ 1,
291
+ self._voxel_size,
292
+ self._voxel_size,
293
+ self._voxel_size,
294
+ ),
295
+ dtype=int,
296
+ device=device,
297
+ )
298
+ self._action_rot_x_one_hot_zeros = torch.zeros(
299
+ (self._batch_size, self._num_rotation_classes), dtype=int, device=device
300
+ )
301
+ self._action_rot_y_one_hot_zeros = torch.zeros(
302
+ (self._batch_size, self._num_rotation_classes), dtype=int, device=device
303
+ )
304
+ self._action_rot_z_one_hot_zeros = torch.zeros(
305
+ (self._batch_size, self._num_rotation_classes), dtype=int, device=device
306
+ )
307
+ self._action_grip_one_hot_zeros = torch.zeros(
308
+ (self._batch_size, 2), dtype=int, device=device
309
+ )
310
+ self._action_ignore_collisions_one_hot_zeros = torch.zeros(
311
+ (self._batch_size, 2), dtype=int, device=device
312
+ )
313
+
314
+ # print total params
315
+ logging.info(
316
+ "# Q Params: %d"
317
+ % sum(
318
+ p.numel()
319
+ for name, p in self._q.named_parameters()
320
+ if p.requires_grad and "clip" not in name
321
+ )
322
+ )
323
+ else:
324
+ for param in self._q.parameters():
325
+ param.requires_grad = False
326
+
327
+ # load CLIP for encoding language goals during evaluation
328
+ model, _ = load_clip("RN50", jit=False)
329
+ self._clip_rn50 = build_model(model.state_dict())
330
+ self._clip_rn50 = self._clip_rn50.float().to(device)
331
+ self._clip_rn50.eval()
332
+ del model
333
+
334
+ self._voxelizer.to(device)
335
+ self._q.to(device)
336
+
337
+ def _extract_crop(self, pixel_action, observation):
338
+ # Pixel action will now be (B, 2)
339
+ # observation = stack_on_channel(observation)
340
+ h = observation.shape[-1]
341
+ top_left_corner = torch.clamp(
342
+ pixel_action - self._image_crop_size // 2, 0, h - self._image_crop_size
343
+ )
344
+ grid = self._grid_for_crop + top_left_corner.unsqueeze(1)
345
+ grid = ((grid / float(h)) * 2.0) - 1.0 # between -1 and 1
346
+ # Used for cropping the images across a batch
347
+ # swap fro y x, to x, y
348
+ grid = torch.cat((grid[:, :, :, 1:2], grid[:, :, :, 0:1]), dim=-1)
349
+ crop = F.grid_sample(observation, grid, mode="nearest", align_corners=True)
350
+ return crop
351
+
352
+ def _preprocess_inputs(self, replay_sample):
353
+ obs = []
354
+ pcds = []
355
+ self._crop_summary = []
356
+ for n in self._camera_names:
357
+ rgb = replay_sample["%s_rgb" % n]
358
+ pcd = replay_sample["%s_point_cloud" % n]
359
+
360
+ obs.append([rgb, pcd])
361
+ pcds.append(pcd)
362
+ return obs, pcds
363
+
364
+ def _act_preprocess_inputs(self, observation):
365
+ obs, pcds = [], []
366
+ for n in self._camera_names:
367
+ rgb = observation["%s_rgb" % n]
368
+ pcd = observation["%s_point_cloud" % n]
369
+
370
+ obs.append([rgb, pcd])
371
+ pcds.append(pcd)
372
+ return obs, pcds
373
+
374
+ def _get_value_from_voxel_index(self, q, voxel_idx):
375
+ b, c, d, h, w = q.shape
376
+ q_trans_flat = q.view(b, c, d * h * w)
377
+ flat_indicies = (
378
+ voxel_idx[:, 0] * d * h + voxel_idx[:, 1] * h + voxel_idx[:, 2]
379
+ )[:, None].int()
380
+ highest_idxs = flat_indicies.unsqueeze(-1).repeat(1, c, 1)
381
+ chosen_voxel_values = q_trans_flat.gather(2, highest_idxs)[
382
+ ..., 0
383
+ ] # (B, trans + rot + grip)
384
+ return chosen_voxel_values
385
+
386
+ def _get_value_from_rot_and_grip(self, rot_grip_q, rot_and_grip_idx):
387
+ q_rot = torch.stack(
388
+ torch.split(
389
+ rot_grip_q[:, :-2], int(360 // self._rotation_resolution), dim=1
390
+ ),
391
+ dim=1,
392
+ ) # B, 3, 72
393
+ q_grip = rot_grip_q[:, -2:]
394
+ rot_and_grip_values = torch.cat(
395
+ [
396
+ q_rot[:, 0].gather(1, rot_and_grip_idx[:, 0:1]),
397
+ q_rot[:, 1].gather(1, rot_and_grip_idx[:, 1:2]),
398
+ q_rot[:, 2].gather(1, rot_and_grip_idx[:, 2:3]),
399
+ q_grip.gather(1, rot_and_grip_idx[:, 3:4]),
400
+ ],
401
+ -1,
402
+ )
403
+ return rot_and_grip_values
404
+
405
+ def _celoss(self, pred, labels):
406
+ return self._cross_entropy_loss(pred, labels.argmax(-1))
407
+
408
+ def _softmax_q_trans(self, q):
409
+ q_shape = q.shape
410
+ return F.softmax(q.reshape(q_shape[0], -1), dim=1).reshape(q_shape)
411
+
412
+ def _softmax_q_rot_grip(self, q_rot_grip):
413
+ q_rot_x_flat = q_rot_grip[
414
+ :, 0 * self._num_rotation_classes : 1 * self._num_rotation_classes
415
+ ]
416
+ q_rot_y_flat = q_rot_grip[
417
+ :, 1 * self._num_rotation_classes : 2 * self._num_rotation_classes
418
+ ]
419
+ q_rot_z_flat = q_rot_grip[
420
+ :, 2 * self._num_rotation_classes : 3 * self._num_rotation_classes
421
+ ]
422
+ q_grip_flat = q_rot_grip[:, 3 * self._num_rotation_classes :]
423
+
424
+ q_rot_x_flat_softmax = F.softmax(q_rot_x_flat, dim=1)
425
+ q_rot_y_flat_softmax = F.softmax(q_rot_y_flat, dim=1)
426
+ q_rot_z_flat_softmax = F.softmax(q_rot_z_flat, dim=1)
427
+ q_grip_flat_softmax = F.softmax(q_grip_flat, dim=1)
428
+
429
+ return torch.cat(
430
+ [
431
+ q_rot_x_flat_softmax,
432
+ q_rot_y_flat_softmax,
433
+ q_rot_z_flat_softmax,
434
+ q_grip_flat_softmax,
435
+ ],
436
+ dim=1,
437
+ )
438
+
439
+ def _softmax_ignore_collision(self, q_collision):
440
+ q_collision_softmax = F.softmax(q_collision, dim=1)
441
+ return q_collision_softmax
442
+
443
+ def update(self, step: int, replay_sample: dict) -> dict:
444
+ action_trans = replay_sample["trans_action_indicies"][
445
+ :, self._layer * 3 : self._layer * 3 + 3
446
+ ].int()
447
+ action_rot_grip = replay_sample["rot_grip_action_indicies"].int()
448
+ action_gripper_pose = replay_sample["gripper_pose"]
449
+ action_ignore_collisions = replay_sample["ignore_collisions"].int()
450
+ lang_goal_emb = replay_sample["lang_goal_emb"].float()
451
+ lang_token_embs = replay_sample["lang_token_embs"].float()
452
+ prev_layer_voxel_grid = replay_sample.get("prev_layer_voxel_grid", None)
453
+ prev_layer_bounds = replay_sample.get("prev_layer_bounds", None)
454
+ device = self._device
455
+ rank = device
456
+ bounds = self._coordinate_bounds.to(device)
457
+ if self._layer > 0:
458
+ cp = replay_sample["attention_coordinate_layer_%d" % (self._layer - 1)]
459
+ bounds = torch.cat(
460
+ [cp - self._bounds_offset, cp + self._bounds_offset], dim=1
461
+ )
462
+
463
+ proprio = None
464
+ if self._include_low_dim_state:
465
+ proprio = replay_sample["low_dim_state"]
466
+
467
+ obs, pcd = self._preprocess_inputs(replay_sample)
468
+ if proprio.shape[-1] == 4:
469
+ arm = "right"
470
+ else:
471
+ arm = "left"
472
+ # batch size
473
+ bs = pcd[0].shape[0]
474
+
475
+ # SE(3) augmentation of point clouds and actions
476
+ if self._transform_augmentation:
477
+ from voxel import augmentation
478
+ action_trans, action_rot_grip, pcd = augmentation.apply_se3_augmentation(
479
+ pcd,
480
+ action_gripper_pose,
481
+ action_trans,
482
+ action_rot_grip,
483
+ bounds,
484
+ self._layer,
485
+ self._transform_augmentation_xyz,
486
+ self._transform_augmentation_rpy,
487
+ self._transform_augmentation_rot_resolution,
488
+ self._voxel_size,
489
+ self._rotation_resolution,
490
+ self._device,
491
+ )
492
+
493
+ # forward pass
494
+ q_trans, q_rot_grip, q_collision, voxel_grid = self._q(
495
+ obs,
496
+ proprio,
497
+ pcd,
498
+ lang_goal_emb,
499
+ lang_token_embs,
500
+ bounds,
501
+ prev_layer_bounds,
502
+ prev_layer_voxel_grid,
503
+ arm=arm,
504
+ )
505
+
506
+ # argmax to choose best action
507
+ (
508
+ coords,
509
+ rot_and_grip_indicies,
510
+ ignore_collision_indicies,
511
+ ) = self._q.choose_highest_action(q_trans, q_rot_grip, q_collision)
512
+
513
+ q_trans_loss, q_rot_loss, q_grip_loss, q_collision_loss = 0.0, 0.0, 0.0, 0.0
514
+
515
+ # translation one-hot
516
+ action_trans_one_hot = self._action_trans_one_hot_zeros.clone()
517
+ for b in range(bs):
518
+ gt_coord = action_trans[b, :].int()
519
+ action_trans_one_hot[b, :, gt_coord[0], gt_coord[1], gt_coord[2]] = 1
520
+
521
+ # translation loss
522
+ q_trans_flat = q_trans.view(bs, -1)
523
+ action_trans_one_hot_flat = action_trans_one_hot.view(bs, -1)
524
+ q_trans_loss = self._celoss(q_trans_flat, action_trans_one_hot_flat)
525
+
526
+ with_rot_and_grip = rot_and_grip_indicies is not None
527
+ if with_rot_and_grip:
528
+ # rotation, gripper, and collision one-hots
529
+ action_rot_x_one_hot = self._action_rot_x_one_hot_zeros.clone()
530
+ action_rot_y_one_hot = self._action_rot_y_one_hot_zeros.clone()
531
+ action_rot_z_one_hot = self._action_rot_z_one_hot_zeros.clone()
532
+ action_grip_one_hot = self._action_grip_one_hot_zeros.clone()
533
+ action_ignore_collisions_one_hot = (
534
+ self._action_ignore_collisions_one_hot_zeros.clone()
535
+ )
536
+
537
+ for b in range(bs):
538
+ gt_rot_grip = action_rot_grip[b, :].int()
539
+ action_rot_x_one_hot[b, gt_rot_grip[0]] = 1
540
+ action_rot_y_one_hot[b, gt_rot_grip[1]] = 1
541
+ action_rot_z_one_hot[b, gt_rot_grip[2]] = 1
542
+ action_grip_one_hot[b, gt_rot_grip[3]] = 1
543
+
544
+ gt_ignore_collisions = action_ignore_collisions[b, :].int()
545
+ action_ignore_collisions_one_hot[b, gt_ignore_collisions[0]] = 1
546
+
547
+ # flatten predictions
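+ # q_rot_grip lays out [x-rot bins | y-rot bins | z-rot bins | 2 gripper logits] along dim 1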
548
+ q_rot_x_flat = q_rot_grip[
549
+ :, 0 * self._num_rotation_classes : 1 * self._num_rotation_classes
550
+ ]
551
+ q_rot_y_flat = q_rot_grip[
552
+ :, 1 * self._num_rotation_classes : 2 * self._num_rotation_classes
553
+ ]
554
+ q_rot_z_flat = q_rot_grip[
555
+ :, 2 * self._num_rotation_classes : 3 * self._num_rotation_classes
556
+ ]
557
+ q_grip_flat = q_rot_grip[:, 3 * self._num_rotation_classes :]
558
+ q_ignore_collisions_flat = q_collision
559
+
560
+ # rotation loss
561
+ q_rot_loss += self._celoss(q_rot_x_flat, action_rot_x_one_hot)
562
+ q_rot_loss += self._celoss(q_rot_y_flat, action_rot_y_one_hot)
563
+ q_rot_loss += self._celoss(q_rot_z_flat, action_rot_z_one_hot)
564
+
565
+ # gripper loss
566
+ q_grip_loss += self._celoss(q_grip_flat, action_grip_one_hot)
567
+
568
+ # collision loss
569
+ q_collision_loss += self._celoss(
570
+ q_ignore_collisions_flat, action_ignore_collisions_one_hot
571
+ )
572
+
573
+ combined_losses = (
574
+ (q_trans_loss * self._trans_loss_weight)
575
+ + (q_rot_loss * self._rot_loss_weight)
576
+ + (q_grip_loss * self._grip_loss_weight)
577
+ + (q_collision_loss * self._collision_loss_weight)
578
+ )
579
+ total_loss = combined_losses.mean()
580
+ if step % 10 == 0 and rank == 0 and wandb.run is not None:
581
+ wandb.log({
582
+ 'train/grip_loss': q_grip_loss.mean(),
583
+ 'train/trans_loss': q_trans_loss.mean(),
584
+ 'train/rot_loss': q_rot_loss.mean(),
585
+ 'train/collision_loss': q_collision_loss.mean(),
586
+ 'train/total_loss': total_loss,
587
+ }, step=step)
588
+
589
+ self._optimizer.zero_grad()
590
+ total_loss.backward()
591
+ self._optimizer.step()
592
+
593
+ self._summaries = {
594
+ "losses/total_loss": total_loss,
595
+ "losses/trans_loss": q_trans_loss.mean(),
596
+ "losses/rot_loss": q_rot_loss.mean() if with_rot_and_grip else 0.0,
597
+ "losses/grip_loss": q_grip_loss.mean() if with_rot_and_grip else 0.0,
598
+ "losses/collision_loss": q_collision_loss.mean()
599
+ if with_rot_and_grip
600
+ else 0.0,
601
+ }
602
+ self._wandb_summaries = {
603
+ 'losses/total_loss': total_loss,
604
+ 'losses/trans_loss': q_trans_loss.mean(),
605
+ 'losses/rot_loss': q_rot_loss.mean() if with_rot_and_grip else 0.,
606
+ 'losses/grip_loss': q_grip_loss.mean() if with_rot_and_grip else 0.,
607
+ 'losses/collision_loss': q_collision_loss.mean() if with_rot_and_grip else 0.
608
+ }
609
+ if self._lr_scheduler:
610
+ self._scheduler.step()
611
+ self._summaries["learning_rate"] = self._scheduler.get_last_lr()[0]
612
+
613
+ self._vis_voxel_grid = voxel_grid[0]
614
+ self._vis_translation_qvalue = self._softmax_q_trans(q_trans[0])
615
+ self._vis_max_coordinate = coords[0]
616
+ self._vis_gt_coordinate = action_trans[0]
617
+
618
+ # Note: PerAct doesn't use multi-layer voxel grids like C2FARM
619
+ # stack prev_layer_voxel_grid(s) from previous layers into a list
620
+ if prev_layer_voxel_grid is None:
621
+ prev_layer_voxel_grid = [voxel_grid]
622
+ else:
623
+ prev_layer_voxel_grid = prev_layer_voxel_grid + [voxel_grid]
624
+
625
+ # stack prev_layer_bound(s) from previous layers into a list
626
+ if prev_layer_bounds is None:
627
+ prev_layer_bounds = [self._coordinate_bounds.repeat(bs, 1)]
628
+ else:
629
+ prev_layer_bounds = prev_layer_bounds + [bounds]
630
+
631
+ q_trans_vis = True  # overlay the translation Q-value heatmap in the debug render
632
+ log_freq = getattr(getattr(getattr(self, "cfg", None), "framework", None), "log_freq", None)  # safely read cfg.framework.log_freq when a config is attached
633
+ if log_freq and step % log_freq == 0 and rank == 0:
634
+ print(f"{arm}_arm_predict: {self._vis_max_coordinate}")
635
+ print(f"{arm}_gt: {self._vis_gt_coordinate}")
636
+ rendered_img = visualise_voxel(
637
+ voxel_grid[0].cpu().detach().numpy(), # [10, 100, 100, 100]
638
+ self._vis_translation_qvalue.detach().cpu().numpy() if q_trans_vis else None,
639
+ self._vis_max_coordinate.detach().cpu().numpy(),
640
+ self._vis_gt_coordinate.detach().cpu().numpy(),
641
+ voxel_size=0.045,
642
+ # voxel_size=0.1  # a larger voxel size renders a more zoomed-in view
643
+ rotation_amount=np.deg2rad(-90),
644
+ highlight_alpha=1.0,
645
+ alpha=0.4,
646
+ )
647
+ os.makedirs('recon', exist_ok=True)
648
+ # plot three images in one row with subplots:
649
+ rgb_src = obs[0][0][0].squeeze(0).permute(1, 2, 0) / 2 + 0.5
650
+
651
+ fig, axs = plt.subplots(1, 4, figsize=(9, 3))  # only the first two panels are filled; the rest stay blank
652
+ # src
653
+ axs[0].imshow(rgb_src.cpu().numpy())
654
+ axs[0].title.set_text('src')
655
+
656
+ axs[1].imshow(rendered_img)
657
+ axs[1].text(0, 40, 'predicted', color='blue')
658
+ axs[1].text(0, 80, 'gt', color='red')
659
+ for ax in axs:
660
+ ax.axis('off')
661
+ plt.tight_layout()
662
+
663
+ if rank == 0:
664
+ if wandb.run is not None:
665
+ buf = io.BytesIO()
666
+ plt.savefig(buf, format='png')
667
+ buf.seek(0)
668
+
669
+ image = Image.open(buf)
670
+ wandb.log({"eval/recon_img": wandb.Image(image)}, step=step)
671
+
672
+ buf.close()
673
+ cprint('Saved reconstruction image to wandb', 'cyan')
674
+ else:
675
+ plt.savefig(f'recon/{step}_rgb.png')
676
+ workdir = os.getcwd()
677
+ cprint(f'Saved {workdir}/recon/{step}_rgb.png locally', 'cyan')
678
+ return {
679
+ "total_loss": total_loss,
680
+ "prev_layer_voxel_grid": prev_layer_voxel_grid,
681
+ "prev_layer_bounds": prev_layer_bounds,
682
+ }
683
+
684
+ def update_wandb_summaries(self):
685
+ summaries = dict()
686
+ for k, v in self._wandb_summaries.items():
687
+ summaries[k] = v
688
+ return summaries
689
+
690
+ def act(self, step: int, observation: dict, deterministic=False) -> ActResult:
691
+ deterministic = True  # always act greedily at evaluation time, regardless of the caller's flag
692
+ bounds = self._coordinate_bounds
693
+ prev_layer_voxel_grid = observation.get("prev_layer_voxel_grid", None)
694
+ prev_layer_bounds = observation.get("prev_layer_bounds", None)
695
+ lang_goal_tokens = observation.get("lang_goal_tokens", None).long()
696
+
697
+ # extract CLIP language embs
698
+ with torch.no_grad():
699
+ lang_goal_tokens = lang_goal_tokens.to(device=self._device)
700
+ (
701
+ lang_goal_emb,
702
+ lang_token_embs,
703
+ ) = self._clip_rn50.encode_text_with_embeddings(lang_goal_tokens[0])
704
+
705
+ # voxelization resolution
706
+ res = (bounds[:, 3:] - bounds[:, :3]) / self._voxel_size
707
+ max_rot_index = int(360 // self._rotation_resolution)  # number of discrete rotation bins
708
+ proprio = None
709
+
710
+ if self._include_low_dim_state:
711
+ proprio = observation["low_dim_state"]
712
+ proprio = proprio[0].to(self._device)
713
+
714
+ obs, pcd = self._act_preprocess_inputs(observation)
715
+
716
+ # correct batch size and device
717
+ obs = [[o[0][0].to(self._device), o[1][0].to(self._device)] for o in obs]
718
+ pcd = [p[0].to(self._device) for p in pcd]
719
+ lang_goal_emb = lang_goal_emb.to(self._device)
720
+ lang_token_embs = lang_token_embs.to(self._device)
721
+ bounds = torch.as_tensor(bounds, device=self._device)
722
+ prev_layer_voxel_grid = (
723
+ prev_layer_voxel_grid.to(self._device)
724
+ if prev_layer_voxel_grid is not None
725
+ else None
726
+ )
727
+ prev_layer_bounds = (
728
+ prev_layer_bounds.to(self._device)
729
+ if prev_layer_bounds is not None
730
+ else None
731
+ )
732
+
733
+ # inference
734
+ q_trans, q_rot_grip, q_ignore_collisions, vox_grid = self._q(
735
+ obs,
736
+ proprio,
737
+ pcd,
738
+ lang_goal_emb,
739
+ lang_token_embs,
740
+ bounds,
741
+ prev_layer_bounds,
742
+ prev_layer_voxel_grid,
743
+ )
744
+
745
+ # softmax Q predictions
746
+ q_trans = self._softmax_q_trans(q_trans)
747
+ q_rot_grip = (
748
+ self._softmax_q_rot_grip(q_rot_grip)
749
+ if q_rot_grip is not None
750
+ else q_rot_grip
751
+ )
752
+ q_ignore_collisions = (
753
+ self._softmax_ignore_collision(q_ignore_collisions)
754
+ if q_ignore_collisions is not None
755
+ else q_ignore_collisions
756
+ )
757
+
758
+ # argmax Q predictions
759
+ (
760
+ coords,
761
+ rot_and_grip_indicies,
762
+ ignore_collisions,
763
+ ) = self._q.choose_highest_action(q_trans, q_rot_grip, q_ignore_collisions)
764
+
765
+ rot_grip_action = rot_and_grip_indicies if q_rot_grip is not None else None
766
+ ignore_collisions_action = (
767
+ ignore_collisions.int() if ignore_collisions is not None else None
768
+ )
769
+
770
+ coords = coords.int()
771
+ attention_coordinate = bounds[:, :3] + res * coords + res / 2
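+ # map the discrete voxel index to a world-frame point at the voxel centre (hence + res / 2)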
772
+
773
+ # stack prev_layer_voxel_grid(s) into a list
774
+ # NOTE: PerAct doesn't use multi-layer voxel grids like C2FARM
775
+ if prev_layer_voxel_grid is None:
776
+ prev_layer_voxel_grid = [vox_grid]
777
+ else:
778
+ prev_layer_voxel_grid = prev_layer_voxel_grid + [vox_grid]
779
+
780
+ if prev_layer_bounds is None:
781
+ prev_layer_bounds = [bounds]
782
+ else:
783
+ prev_layer_bounds = prev_layer_bounds + [bounds]
784
+
785
+ observation_elements = {
786
+ "attention_coordinate": attention_coordinate,
787
+ "prev_layer_voxel_grid": prev_layer_voxel_grid,
788
+ "prev_layer_bounds": prev_layer_bounds,
789
+ }
790
+ info = {
791
+ "voxel_grid_depth%d" % self._layer: vox_grid,
792
+ "q_depth%d" % self._layer: q_trans,
793
+ "voxel_idx_depth%d" % self._layer: coords,
794
+ }
795
+ self._act_voxel_grid = vox_grid[0]
796
+ self._act_max_coordinate = coords[0]
797
+ self._act_qvalues = q_trans[0].detach()
798
+ return ActResult(
799
+ (coords, rot_grip_action, ignore_collisions_action),
800
+ observation_elements=observation_elements,
801
+ info=info,
802
+ )
803
+
804
+ def update_summaries(self) -> List[Summary]:
805
+ summaries = [
806
+ ImageSummary(
807
+ "%s/update_qattention" % self._name,
808
+ transforms.ToTensor()(
809
+ visualise_voxel(
810
+ self._vis_voxel_grid.detach().cpu().numpy(),
811
+ self._vis_translation_qvalue.detach().cpu().numpy(),
812
+ self._vis_max_coordinate.detach().cpu().numpy(),
813
+ self._vis_gt_coordinate.detach().cpu().numpy(),
814
+ )
815
+ ),
816
+ )
817
+ ]
818
+
819
+ for n, v in self._summaries.items():
820
+ summaries.append(ScalarSummary("%s/%s" % (self._name, n), v))
821
+
822
+ for name, crop in self._crop_summary:
823
+ crops = (torch.cat(torch.split(crop, 3, dim=1), dim=3) + 1.0) / 2.0
824
+ summaries.extend([ImageSummary("%s/crops/%s" % (self._name, name), crops)])
825
+
826
+ for tag, param in self._q.named_parameters():
827
+ # assert not torch.isnan(param.grad.abs() <= 1.0).all()
828
+ summaries.append(
829
+ HistogramSummary("%s/gradient/%s" % (self._name, tag), param.grad)
830
+ )
831
+ summaries.append(
832
+ HistogramSummary("%s/weight/%s" % (self._name, tag), param.data)
833
+ )
834
+
835
+ return summaries
836
+
837
+ def act_summaries(self) -> List[Summary]:
838
+ return [
839
+ ImageSummary(
840
+ "%s/act_Qattention" % self._name,
841
+ transforms.ToTensor()(
842
+ visualise_voxel(
843
+ self._act_voxel_grid.cpu().numpy(),
844
+ self._act_qvalues.cpu().numpy(),
845
+ self._act_max_coordinate.cpu().numpy(),
846
+ )
847
+ ),
848
+ )
849
+ ]
850
+ def concat_weights(self, param, target_size, dims=-1):
+ # double the tensor along `dims` when that dimension is smaller than target_size
+ if param.size(dims) < target_size:
+ param = torch.cat([param, param], dims)
+ return param
854
+
855
+ def load_weights(self, savedir: str):
856
+ device = (
857
+ self._device
858
+ if not self._training
859
+ else torch.device("cuda:%d" % self._device)
860
+ )
861
+ weight_file = os.path.join(savedir, "%s.pt" % self._name)
862
+ state_dict = torch.load(weight_file, map_location=device)
863
+
864
+ # load only keys that are in the current model
865
+ merged_state_dict = self._q.state_dict()
866
+ if not self._training:
+ for k, v in state_dict.items():
+ # strip the DataParallel "module" prefix for single-process inference
+ k = k.replace("_qnet.module", "_qnet")
+ if k in merged_state_dict:
+ merged_state_dict[k] = v
+ else:
+ if "_voxelizer" not in k:
+ logging.warning("key %s not found in checkpoint" % k)
875
+ else:
876
+ for k, v in state_dict.items():
877
+ if not self._training:
878
+ k = k.replace("_qnet.module", "_qnet")
879
+ elif k == "_qnet.module.pos_encoding":
880
+ # extend only checkpoints whose pos_encoding lacks the longer language context,
+ # i.e. is neither of the merged sizes (8077 / 8154) and shorter than 154 rows
+ if (v.shape[1] != 8077 and v.shape[1] != 8154) and v.shape[1] < 154:
881
+ if self.anybimanual:
882
+ lang_max_seq_len = 154
883
+ else:
884
+ lang_max_seq_len = 77
885
+ spatial_size = v.shape[1]
886
+ input_dim_before_seq = v.shape[-1]
887
+ flattened_v = v.view(1, -1, input_dim_before_seq) # (1, spatial_size**3, self.input_dim_before_seq)
888
+ new_pos_encoding = torch.randn(1, lang_max_seq_len, input_dim_before_seq, device=device)
889
+ merged_pos_encoding = torch.cat([flattened_v, new_pos_encoding], dim=1) # (1, lang_max_seq_len + spatial_size**3, self.input_dim_before_seq)
890
+ merged_state_dict["_qnet.module.pos_encoding"] = merged_pos_encoding
891
+ else:
892
+ merged_state_dict["_qnet.module.pos_encoding"] = v
893
+ elif k.startswith("_qnet.module.cross_attend_blocks"):
894
+ if self.anybimanual:
895
+ if v.size(-1) == 128:
896
+ merged_state_dict[k] = self.concat_weights(v, 256)
897
+ elif k.startswith("_qnet.module.decoder_cross_attn"):
898
+ if self.anybimanual:
899
+ if v.size(0) == 128:
900
+ merged_state_dict[k] = self.concat_weights(v, 256, 0)
901
+ merged_state_dict[k] = self.concat_weights(v, 256, 0)
902
+ if v.size(-1) == 128:
903
+ merged_state_dict[k] = self.concat_weights(v, 256)
904
+ merged_state_dict[k] = self.concat_weights(v, 256)
905
+ elif k == "_qnet.module.up0.conv_up.0.conv3d.weight":
906
+ if self.anybimanual:
907
+ if v.size(1) == 128:
908
+ merged_state_dict[k] = self.concat_weights(v, 256, 1)
909
+ elif k.startswith("_qnet.module.dense0"):
910
+ if self.anybimanual:
911
+ if v.size(-1) == 1024:
912
+ merged_state_dict[k] = torch.cat([v, v[:, :512]], dim=-1)
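+ # heuristic weight inflation: the wider dense0 layer re-uses the checkpoint's first 512 columns to fill its extra inputs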
913
+ elif k in merged_state_dict:
914
+ merged_state_dict[k] = v
915
+ else:
916
+ if "_voxelizer" not in k:
917
+ logging.warning("key %s not found in checkpoint" % k)
918
+
919
+ if not self._training:
920
+ # reshape voxelizer weights
921
+ b = merged_state_dict["_voxelizer._ones_max_coords"].shape[0]
922
+ merged_state_dict["_voxelizer._ones_max_coords"] = merged_state_dict[
923
+ "_voxelizer._ones_max_coords"
924
+ ][0:1]
925
+ flat_shape = merged_state_dict["_voxelizer._flat_output"].shape[0]
926
+ merged_state_dict["_voxelizer._flat_output"] = merged_state_dict[
927
+ "_voxelizer._flat_output"
928
+ ][0 : flat_shape // b]
929
+ merged_state_dict["_voxelizer._tiled_batch_indices"] = merged_state_dict[
930
+ "_voxelizer._tiled_batch_indices"
931
+ ][0:1]
932
+ merged_state_dict["_voxelizer._index_grid"] = merged_state_dict[
933
+ "_voxelizer._index_grid"
934
+ ][0:1]
935
+ self._q.load_state_dict(merged_state_dict)
936
+ print("loaded weights from %s" % weight_file)
937
+
938
+ def save_weights(self, savedir: str):
939
+ torch.save(self._q.state_dict(), os.path.join(savedir, "%s.pt" % self._name))
third_party/AnyBimanual/agents/peract_bc/qattention_stack_agent.py ADDED
@@ -0,0 +1,132 @@
1
+ from typing import List
2
+
3
+ import torch
4
+ from yarr.agents.agent import Agent, ActResult, Summary
5
+
6
+ import numpy as np
7
+
8
+ from helpers import utils
9
+ from agents.peract_bc.qattention_peract_bc_agent import QAttentionPerActBCAgent
10
+
11
+ NAME = "QAttentionStackAgent"
12
+
13
+
14
+ class QAttentionStackAgent(Agent):
15
+ def __init__(
16
+ self,
17
+ qattention_agents: List[QAttentionPerActBCAgent],
18
+ rotation_resolution: float,
19
+ camera_names: List[str],
20
+ rotation_prediction_depth: int = 0,
21
+ ):
22
+ super(QAttentionStackAgent, self).__init__()
23
+ self._qattention_agents = qattention_agents
24
+ self._rotation_resolution = rotation_resolution
25
+ self._camera_names = camera_names
26
+ self._rotation_prediction_depth = rotation_prediction_depth
27
+
28
+ def build(self, training: bool, device=None) -> None:
29
+ self._device = device
30
+ if self._device is None:
31
+ self._device = torch.device("cpu")
32
+ for qa in self._qattention_agents:
33
+ qa.build(training, device)
34
+
35
+ def update(self, step: int, replay_sample: dict) -> dict:
36
+ total_losses = 0.0
38
+ for qa in self._qattention_agents:
39
+ update_dict = qa.update(step, replay_sample)
40
+ replay_sample.update(update_dict)
41
+ total_losses += update_dict["total_loss"]
42
+ return {
43
+ "total_losses": total_losses,
44
+ }
45
+
46
+ def act(self, step: int, observation: dict, deterministic=False) -> ActResult:
47
+ observation_elements = {}
48
+ translation_results, rot_grip_results, ignore_collisions_results = [], [], []
49
+ infos = {}
50
+ for depth, qagent in enumerate(self._qattention_agents):
51
+ act_results = qagent.act(step, observation, deterministic)
52
+ attention_coordinate = (
53
+ act_results.observation_elements["attention_coordinate"].cpu().numpy()
54
+ )
55
+ observation_elements[
56
+ "attention_coordinate_layer_%d" % depth
57
+ ] = attention_coordinate[0]
58
+
59
+ translation_idxs, rot_grip_idxs, ignore_collisions_idxs = act_results.action
60
+ translation_results.append(translation_idxs)
61
+ if rot_grip_idxs is not None:
62
+ rot_grip_results.append(rot_grip_idxs)
63
+ if ignore_collisions_idxs is not None:
64
+ ignore_collisions_results.append(ignore_collisions_idxs)
65
+
66
+ observation["attention_coordinate"] = act_results.observation_elements[
67
+ "attention_coordinate"
68
+ ]
69
+ observation["prev_layer_voxel_grid"] = act_results.observation_elements[
70
+ "prev_layer_voxel_grid"
71
+ ]
72
+ observation["prev_layer_bounds"] = act_results.observation_elements[
73
+ "prev_layer_bounds"
74
+ ]
75
+
76
+ for n in self._camera_names:
77
+ px, py = utils.point_to_pixel_index(
78
+ attention_coordinate[0],
79
+ observation["%s_camera_extrinsics" % n][0, 0].cpu().numpy(),
80
+ observation["%s_camera_intrinsics" % n][0, 0].cpu().numpy(),
81
+ )
82
+ pc_t = torch.tensor(
83
+ [[[py, px]]], dtype=torch.float32, device=self._device
84
+ )
85
+ observation["%s_pixel_coord" % n] = pc_t
86
+ observation_elements["%s_pixel_coord" % n] = [py, px]
87
+
88
+ infos.update(act_results.info)
89
+
90
+ rgai = torch.cat(rot_grip_results, 1)[0].cpu().numpy()
91
+ ignore_collisions = float(
92
+ torch.cat(ignore_collisions_results, 1)[0].cpu().numpy()
93
+ )
94
+ observation_elements["trans_action_indicies"] = (
95
+ torch.cat(translation_results, 1)[0].cpu().numpy()
96
+ )
97
+ observation_elements["rot_grip_action_indicies"] = rgai
98
+ continuous_action = np.concatenate(
99
+ [
100
+ act_results.observation_elements["attention_coordinate"]
101
+ .cpu()
102
+ .numpy()[0],
103
+ utils.discrete_euler_to_quaternion(
104
+ rgai[-4:-1], self._rotation_resolution
105
+ ),
106
+ rgai[-1:],
107
+ [ignore_collisions],
108
+ ]
109
+ )
110
+ return ActResult(
111
+ continuous_action, observation_elements=observation_elements, info=infos
112
+ )
113
+
114
+ def update_summaries(self) -> List[Summary]:
115
+ summaries = []
116
+ for qa in self._qattention_agents:
117
+ summaries.extend(qa.update_summaries())
118
+ return summaries
119
+
120
+ def act_summaries(self) -> List[Summary]:
121
+ s = []
122
+ for qa in self._qattention_agents:
123
+ s.extend(qa.act_summaries())
124
+ return s
125
+
126
+ def load_weights(self, savedir: str):
127
+ for qa in self._qattention_agents:
128
+ qa.load_weights(savedir)
129
+
130
+ def save_weights(self, savedir: str):
131
+ for qa in self._qattention_agents:
132
+ qa.save_weights(savedir)
third_party/AnyBimanual/agents/peract_bc/skill_manager.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import transformers
4
+ from agents.peract_bimanual.trajectory_gpt2 import GPT2Model
5
+ import torch.nn.functional as F
+
+
+ class SkillManager(nn.Module):
+ """Predicts a skill embedding as a softmax mixture over learned skill prototypes."""
7
+ def __init__(
8
+ self,
9
+ num_classes,
10
+ embedding_matrix=None,
11
+ voxel_dim=128,
12
+ lang_dim=128,
13
+ hidden_size=128,
14
+ output_dim=18,
15
+ max_voxels=8000,
16
+ max_lang_tokens=77,
17
+ **kwargs):
18
+ super().__init__()
19
+
20
+ self.hidden_size = hidden_size
21
+ self.output_dim = output_dim
22
+
23
+ # GPT-2 configuration
24
+ config = transformers.GPT2Config(
25
+ vocab_size=1, # not used
26
+ n_embd=hidden_size,
27
+ n_head=4,
28
+ n_ctx=1077,
29
+ )
30
+
31
+ self.max_voxels = max_voxels
32
+ self.max_lang_tokens = max_lang_tokens
33
+ self.embed_voxel = nn.Linear(voxel_dim, hidden_size)
34
+ self.embed_lang = nn.Linear(lang_dim, hidden_size)
35
+ self.transformer = GPT2Model(config)
36
+ self.embed_ln = nn.LayerNorm(hidden_size)
37
+ self.predict_logits = nn.Linear(hidden_size, output_dim)
38
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
39
+ self.num_class = num_classes
40
+ if embedding_matrix is not None:
41
+ self.embeddings_matrix = embedding_matrix.to(self.device)
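+ # each row of embedding_matrix appears to hold one skill prototype, flattened from 77 tokens x 512 dims (see the view(-1, 77, 512) in forward)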
42
+
43
+ def forward(self, voxel_embedding, language_embedding):
44
+ batch_size = voxel_embedding.shape[0]
45
+ voxel_embeddings = self.embed_voxel(voxel_embedding) # [b, 8000, hidden_size]
46
+ language_embeddings = self.embed_lang(language_embedding) # [b, 77, hidden_size]
47
+ voxel_embeddings = voxel_embeddings.permute(0, 2, 1) # [b, hidden_size, 8000]
48
+ voxel_embeddings = F.avg_pool1d(voxel_embeddings, kernel_size=16, stride=16) # [b, hidden_size, 500]
+ voxel_embeddings = voxel_embeddings.permute(0, 2, 1) # [b, 500, hidden_size]
+ inputs = torch.cat([language_embeddings, voxel_embeddings], dim=1) # [b, 577, hidden_size]
51
+ stacked_inputs = self.embed_ln(inputs)
52
+ # no attention mask is needed: every position is a real token (no padding),
+ # so the transformer below is called with attention_mask=None
59
+ transformer_outputs = self.transformer(
60
+ inputs_embeds=stacked_inputs,
61
+ attention_mask=None,
62
+ )
63
+
64
+ hidden_state = transformer_outputs.last_hidden_state # [b, 577, hidden_size]
65
+ aggregated_hidden = hidden_state.mean(dim=1) # [b, hidden_size]
66
+ logits = self.predict_logits(aggregated_hidden) # [b, output_dim]
67
+ probs = F.softmax(logits, dim=1)
68
+ skill = torch.matmul(probs, self.embeddings_matrix.to(probs.device))
69
+ skill = skill.view(-1, 77, 512)  # reshape the mixed prototype into 77 language-token embeddings of dim 512
70
+ return skill
third_party/AnyBimanual/agents/peract_bc/trajectory_gpt2.py ADDED
@@ -0,0 +1,775 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch OpenAI GPT-2 model."""
17
+
18
+ import os
19
+ from dataclasses import dataclass
20
+ from typing import List, Optional, Tuple
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+ from torch.nn import CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN
27
+ from transformers.file_utils import (
28
+ ModelOutput,
29
+ add_code_sample_docstrings,
30
+ add_start_docstrings,
31
+ add_start_docstrings_to_model_forward,
32
+ replace_return_docstrings,
33
+ )
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPastAndCrossAttentions,
36
+ )
37
+ from transformers.modeling_utils import (
38
+ Conv1D,
39
+ PreTrainedModel,
40
+ SequenceSummary,
41
+ find_pruneable_heads_and_indices,
42
+ prune_conv1d_layer,
43
+ )
44
+ from transformers.utils import logging
45
+ from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
46
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+ _CONFIG_FOR_DOC = "GPT2Config"
51
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
52
+
53
+ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
54
+ "gpt2",
55
+ "gpt2-medium",
56
+ "gpt2-large",
57
+ "gpt2-xl",
58
+ "distilgpt2",
59
+ # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
60
+ ]
61
+
62
+
63
+ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
64
+ """Load tf checkpoints in a pytorch model"""
65
+ try:
66
+ import re
67
+
68
+ import tensorflow as tf
69
+ except ImportError:
70
+ logger.error(
71
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
72
+ "https://www.tensorflow.org/install/ for installation instructions."
73
+ )
74
+ raise
75
+ tf_path = os.path.abspath(gpt2_checkpoint_path)
76
+ logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
77
+ # Load weights from TF model
78
+ init_vars = tf.train.list_variables(tf_path)
79
+ names = []
80
+ arrays = []
81
+ for name, shape in init_vars:
82
+ logger.info("Loading TF weight {} with shape {}".format(name, shape))
83
+ array = tf.train.load_variable(tf_path, name)
84
+ names.append(name)
85
+ arrays.append(array.squeeze())
86
+
87
+ for name, array in zip(names, arrays):
88
+ name = name[6:] # skip "model/"
89
+ name = name.split("/")
90
+ pointer = model
91
+ for m_name in name:
92
+ if re.fullmatch(r"[A-Za-z]+\d+", m_name):
93
+ scope_names = re.split(r"(\d+)", m_name)
94
+ else:
95
+ scope_names = [m_name]
96
+ if scope_names[0] == "w" or scope_names[0] == "g":
97
+ pointer = getattr(pointer, "weight")
98
+ elif scope_names[0] == "b":
99
+ pointer = getattr(pointer, "bias")
100
+ elif scope_names[0] == "wpe" or scope_names[0] == "wte":
101
+ pointer = getattr(pointer, scope_names[0])
102
+ pointer = getattr(pointer, "weight")
103
+ else:
104
+ pointer = getattr(pointer, scope_names[0])
105
+ if len(scope_names) >= 2:
106
+ num = int(scope_names[1])
107
+ pointer = pointer[num]
108
+ try:
109
+ assert (
110
+ pointer.shape == array.shape
111
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
112
+ except AssertionError as e:
113
+ e.args += (pointer.shape, array.shape)
114
+ raise
115
+ logger.info("Initialize PyTorch weight {}".format(name))
116
+ pointer.data = torch.from_numpy(array)
117
+ return model
118
+
119
+
120
+ class Attention(nn.Module):
121
+ def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
122
+ super().__init__()
123
+
124
+ n_state = nx # in Attention: n_state=768 (nx=n_embd)
125
+ # [switch nx => n_state from Block to Attention to keep identical to TF implem]
126
+ assert n_state % config.n_head == 0
127
+ self.register_buffer(
128
+ "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
129
+ )
130
+ self.register_buffer("masked_bias", torch.tensor(-1e4))
131
+ self.n_head = config.n_head
132
+ self.split_size = n_state
133
+ self.scale = scale
134
+ self.is_cross_attention = is_cross_attention
135
+ if self.is_cross_attention:
136
+ self.c_attn = Conv1D(2 * n_state, nx)
137
+ self.q_attn = Conv1D(n_state, nx)
138
+ else:
139
+ self.c_attn = Conv1D(3 * n_state, nx)
140
+ self.c_proj = Conv1D(n_state, nx)
141
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
142
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
143
+ self.pruned_heads = set()
144
+
145
+ def prune_heads(self, heads):
146
+ if len(heads) == 0:
147
+ return
148
+ heads, index = find_pruneable_heads_and_indices(
149
+ heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
150
+ )
151
+ index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
152
+
153
+ # Prune conv1d layers
154
+ self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
155
+ self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
156
+
157
+ # Update hyper params
158
+ self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
159
+ self.n_head = self.n_head - len(heads)
160
+ self.pruned_heads = self.pruned_heads.union(heads)
161
+
162
+ def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
163
+ w = torch.matmul(q, k)
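+ # k arrives pre-transposed from split_heads(k=True) as (batch, head, head_features, seq), so this computes q @ k^T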
164
+ if self.scale:
165
+ w = w / (float(v.size(-1)) ** 0.5)
166
+ nd, ns = w.size(-2), w.size(-1)
167
+
168
+ if not self.is_cross_attention:
169
+ # if only "normal" attention layer implements causal mask
170
+ mask = self.bias[:, :, ns - nd: ns, :ns]
171
+ w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))
172
+
173
+ if attention_mask is not None:
174
+ # Apply the attention mask
175
+ w = w + attention_mask
176
+
177
+ w = nn.Softmax(dim=-1)(w)
178
+ w = self.attn_dropout(w)
179
+
180
+ # Mask heads if we want to
181
+ if head_mask is not None:
182
+ w = w * head_mask
183
+
184
+ outputs = [torch.matmul(w, v)]
185
+ if output_attentions:
186
+ outputs.append(w)
187
+ return outputs
188
+
189
+ def merge_heads(self, x):
190
+ x = x.permute(0, 2, 1, 3).contiguous()
191
+ new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
192
+ return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
193
+
194
+ def split_heads(self, x, k=False):
195
+ new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
196
+ x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
197
+ if k:
198
+ return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length)
199
+ else:
200
+ return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
201
+
202
+ def forward(
203
+ self,
204
+ hidden_states,
205
+ layer_past=None,
206
+ attention_mask=None,
207
+ head_mask=None,
208
+ encoder_hidden_states=None,
209
+ encoder_attention_mask=None,
210
+ use_cache=False,
211
+ output_attentions=False,
212
+ ):
213
+ if encoder_hidden_states is not None:
214
+ assert hasattr(
215
+ self, "q_attn"
216
+ ), "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `Attention(..., is_cross_attention=True)`."
217
+ query = self.q_attn(hidden_states)
218
+ key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
219
+ attention_mask = encoder_attention_mask
220
+ else:
221
+ query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
222
+
223
+ query = self.split_heads(query)
224
+ key = self.split_heads(key, k=True)
225
+ value = self.split_heads(value)
226
+ if layer_past is not None:
227
+ past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below
228
+ key = torch.cat((past_key, key), dim=-1)
229
+ value = torch.cat((past_value, value), dim=-2)
230
+
231
+ if use_cache is True:
232
+ present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
233
+ else:
234
+ present = (None,)
235
+
236
+ attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
237
+ a = attn_outputs[0]
238
+
239
+ a = self.merge_heads(a)
240
+ a = self.c_proj(a)
241
+ a = self.resid_dropout(a)
242
+
243
+ outputs = [a, present] + attn_outputs[1:]
244
+ return outputs # a, present, (attentions)
245
+
246
+
247
+ class MLP(nn.Module):
248
+ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
249
+ super().__init__()
250
+ nx = config.n_embd
251
+ self.c_fc = Conv1D(n_state, nx)
252
+ self.c_proj = Conv1D(nx, n_state)
253
+ self.act = ACT2FN[config.activation_function]
254
+ self.dropout = nn.Dropout(config.resid_pdrop)
255
+
256
+ def forward(self, x):
257
+ h = self.act(self.c_fc(x))
258
+ h2 = self.c_proj(h)
259
+ return self.dropout(h2)
260
+
261
+
262
+ class AdapterMLP(nn.Module):
263
+ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
264
+ super().__init__()
265
+ nx = config.n_embd
266
+ self.c_fc = Conv1D(n_state, nx)
267
+ self.c_proj = Conv1D(nx, n_state)
268
+ self.act = ACT2FN[config.activation_function]
269
+ self.dropout = nn.Dropout(config.resid_pdrop)
270
+
271
+ def forward(self, x):
272
+ h = self.act(self.c_fc(x))
273
+ h2 = self.c_proj(h)
274
+ return self.dropout(h2)
275
+
276
+
277
+ class Block(nn.Module):
278
+ def __init__(self, n_ctx, config, scale=False):
279
+ super().__init__()
280
+ hidden_size = config.n_embd
281
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
282
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
283
+ self.attn = Attention(hidden_size, n_ctx, config, scale)
284
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
285
+ # self.adapter_ln = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
286
+ if config.add_cross_attention:
287
+ self.crossattention = Attention(hidden_size, n_ctx, config, scale, is_cross_attention=True)
288
+ self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
289
+ self.mlp = MLP(inner_dim, config)
290
+ # self.adapter_mlp = AdapterMLP(512, config) # ADAPTER
291
+
292
+ def forward(
293
+ self,
294
+ hidden_states,
295
+ layer_past=None,
296
+ attention_mask=None,
297
+ head_mask=None,
298
+ encoder_hidden_states=None,
299
+ encoder_attention_mask=None,
300
+ use_cache=False,
301
+ output_attentions=False,
302
+ ):
303
+ attn_outputs = self.attn(
304
+ self.ln_1(hidden_states),
305
+ layer_past=layer_past,
306
+ attention_mask=attention_mask,
307
+ head_mask=head_mask,
308
+ use_cache=use_cache,
309
+ output_attentions=output_attentions,
310
+ )
311
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
312
+ outputs = attn_outputs[1:]
313
+ # residual connection
314
+ hidden_states = attn_output + hidden_states
315
+
316
+ if encoder_hidden_states is not None:
317
+ # add one self-attention block for cross-attention
318
+ assert hasattr(
319
+ self, "crossattention"
320
+ ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
321
+ cross_attn_outputs = self.crossattention(
322
+ self.ln_cross_attn(hidden_states),
323
+ attention_mask=attention_mask,
324
+ head_mask=head_mask,
325
+ encoder_hidden_states=encoder_hidden_states,
326
+ encoder_attention_mask=encoder_attention_mask,
327
+ output_attentions=output_attentions,
328
+ )
329
+ attn_output = cross_attn_outputs[0]
330
+ # residual connection
331
+ hidden_states = hidden_states + attn_output
332
+ outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights
333
+
334
+ feed_forward_hidden_states = self.mlp(self.ln_2(hidden_states))
335
+ # residual connection
336
+ hidden_states = hidden_states + feed_forward_hidden_states
337
+ # hidden_states = hidden_states + self.adapter_ln(self.adapter_mlp(hidden_states))
338
+
339
+ outputs = [hidden_states] + outputs
340
+ return outputs # hidden_states, present, (attentions, cross_attentions)
341
+
342
+
343
+ class GPT2PreTrainedModel(PreTrainedModel):
344
+ """
345
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
346
+ models.
347
+ """
348
+
349
+ config_class = GPT2Config
350
+ load_tf_weights = load_tf_weights_in_gpt2
351
+ base_model_prefix = "transformer"
352
+
353
+ def __init__(self, *inputs, **kwargs):
354
+ super().__init__(*inputs, **kwargs)
355
+
356
+ def _init_weights(self, module):
357
+ """Initialize the weights."""
358
+ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
359
+ # Slightly different from the TF version which uses truncated_normal for initialization
360
+ # cf https://github.com/pytorch/pytorch/pull/5617
361
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
362
+ if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
363
+ module.bias.data.zero_()
364
+ elif isinstance(module, nn.LayerNorm):
365
+ module.bias.data.zero_()
366
+ module.weight.data.fill_(1.0)
367
+ # module.weight.data.fill_(.01) # KL: Adapter change
368
+
369
+
370
+ @dataclass
371
+ class GPT2DoubleHeadsModelOutput(ModelOutput):
372
+ """
373
+ Base class for outputs of models predicting if two sentences are consecutive or not.
374
+ Args:
375
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
376
+ Language modeling loss.
377
+ mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
378
+ Multiple choice classification loss.
379
+ logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
380
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
381
+ mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
382
+ Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
383
+ past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
384
+ List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
385
+ batch_size, num_heads, sequence_length, embed_size_per_head)`).
386
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
387
+ :obj:`past_key_values` input) to speed up sequential decoding.
388
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
389
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
390
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
391
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
392
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
393
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
394
+ sequence_length, sequence_length)`.
395
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
396
+ heads.
397
+ """
398
+
399
+ loss: Optional[torch.FloatTensor] = None
400
+ mc_loss: Optional[torch.FloatTensor] = None
401
+ logits: torch.FloatTensor = None
402
+ mc_logits: torch.FloatTensor = None
403
+ past_key_values: Optional[List[torch.FloatTensor]] = None
404
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
405
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
406
+
407
+
408
+ GPT2_START_DOCSTRING = r"""
409
+ This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
410
+ methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
411
+ pruning heads etc.)
412
+ This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
413
+ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
414
+ general usage and behavior.
415
+ Parameters:
416
+ config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
417
+ Initializing with a config file does not load the weights associated with the model, only the
418
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
419
+ weights.
420
+ """
421
+
422
+ GPT2_INPUTS_DOCSTRING = r"""
423
+ Args:
424
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
425
+ :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
426
+ ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
427
+ sequence tokens in the vocabulary.
428
+ If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
429
+ passed as ``input_ids``.
430
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
431
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
432
+ details.
433
+ `What are input IDs? <../glossary.html#input-ids>`__
434
+ past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
435
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
436
+ :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
437
+ have their past given to this model should not be passed as ``input_ids`` as they have already been
438
+ computed.
439
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
440
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
441
+ - 1 for tokens that are **not masked**,
442
+ - 0 for tokens that are **masked**.
443
+ `What are attention masks? <../glossary.html#attention-mask>`__
444
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
445
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
446
+ 1]``:
447
+ - 0 corresponds to a `sentence A` token,
448
+ - 1 corresponds to a `sentence B` token.
449
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
450
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
451
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
452
+ config.max_position_embeddings - 1]``.
453
+ `What are position IDs? <../glossary.html#position-ids>`_
454
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
455
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
456
+ - 1 indicates the head is **not masked**,
457
+ - 0 indicates the head is **masked**.
458
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
459
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
460
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
461
+ vectors than the model's internal embedding lookup matrix.
462
+ If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
463
+ :obj:`past_key_values`).
464
+ use_cache (:obj:`bool`, `optional`):
465
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
466
+ decoding (see :obj:`past_key_values`).
467
+ output_attentions (:obj:`bool`, `optional`):
468
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
469
+ tensors for more detail.
470
+ output_hidden_states (:obj:`bool`, `optional`):
471
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
472
+ more detail.
473
+ return_dict (:obj:`bool`, `optional`):
474
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
475
+ """
476
+ PARALLELIZE_DOCSTRING = r"""
477
+ Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
478
+ it will evenly distribute blocks across all devices.
479
+ Args:
480
+ device_map (:obj:`Dict[int, list]`, optional, defaults to None):
481
+ A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
482
+ automatically mapped to the first device (for esoteric reasons). That means that the first device should
483
+ have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
484
+ following number of attention modules:
485
+ - gpt2: 12
486
+ - gpt2-medium: 24
487
+ - gpt2-large: 36
488
+ - gpt2-xl: 48
489
+ Example::
490
+ # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
491
+ model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
492
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
493
+ 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
494
+ 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
495
+ 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}
496
+ model.parallelize(device_map)
497
+ """
498
+ DEPARALLELIZE_DOCSTRING = r"""
499
+ Moves the model to cpu from a model parallel state.
500
+ Example::
501
+ # On a 4 GPU machine with gpt2-large:
502
+ model = GPT2LMHeadModel.from_pretrained('gpt2-large')
503
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
504
+ 1: [8, 9, 10, 11, 12, 13, 14, 15],
505
+ 2: [16, 17, 18, 19, 20, 21, 22, 23],
506
+ 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
507
+ model.parallelize(device_map) # Splits the model across several devices
508
+ model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
509
+ """
510
+
511
+
512
+ @add_start_docstrings(
513
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
514
+ GPT2_START_DOCSTRING,
515
+ )
516
+ class GPT2Model(GPT2PreTrainedModel):
517
+ def __init__(self, config):
518
+ super().__init__(config)
519
+
520
+ self.wte = nn.Embedding(config.vocab_size, config.n_embd)
521
+ # self.wpe = nn.Embedding(config.n_positions, config.n_embd)
522
+ self.drop = nn.Dropout(config.embd_pdrop)
523
+ self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
524
+ self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
525
+
526
+ self.init_weights()
527
+ # Model parallel
528
+ self.model_parallel = False
529
+ self.device_map = None
530
+
531
+ self.use_layers = None
532
+
533
+ def set_layers(self, num_layers):
+ # cap how many transformer blocks forward() executes
+ assert 1 <= num_layers <= len(self.h)
+ num_layers -= 1
+ self.use_layers = num_layers
538
+
539
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
540
+ def parallelize(self, device_map=None):
541
+ # Check validity of device_map
542
+ self.device_map = (
543
+ get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
544
+ )
545
+ assert_device_map(self.device_map, len(self.h))
546
+ self.model_parallel = True
547
+ self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
548
+ self.last_device = "cuda:" + str(max(self.device_map.keys()))
549
+ self.wte = self.wte.to(self.first_device)
550
+ # self.wpe does not exist in this adapted copy (position embeddings are disabled in __init__), so only wte is moved
551
+ # Load onto devices
552
+ for k, v in self.device_map.items():
553
+ for block in v:
554
+ cuda_device = "cuda:" + str(k)
555
+ self.h[block] = self.h[block].to(cuda_device)
556
+ # ln_f to last
557
+ self.ln_f = self.ln_f.to(self.last_device)
558
+
559
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
560
+ def deparallelize(self):
561
+ self.model_parallel = False
562
+ self.device_map = None
563
+ self.first_device = "cpu"
564
+ self.last_device = "cpu"
565
+ self.wte = self.wte.to("cpu")
566
+ self.wpe = self.wpe.to("cpu")
567
+ for index in range(len(self.h)):
568
+ self.h[index] = self.h[index].to("cpu")
569
+ self.ln_f = self.ln_f.to("cpu")
570
+ torch.cuda.empty_cache()
571
+
572
+ def get_input_embeddings(self):
573
+ return self.wte
574
+
575
+ def set_input_embeddings(self, new_embeddings):
576
+ self.wte = new_embeddings
577
+
578
+ def _prune_heads(self, heads_to_prune):
579
+ """
580
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
581
+ """
582
+ for layer, heads in heads_to_prune.items():
583
+ self.h[layer].attn.prune_heads(heads)
584
+
585
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
586
+ @add_code_sample_docstrings(
587
+ processor_class=_TOKENIZER_FOR_DOC,
588
+ checkpoint="gpt2",
589
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
590
+ config_class=_CONFIG_FOR_DOC,
591
+ )
592
+ def forward(
593
+ self,
594
+ input_ids=None,
595
+ past_key_values=None,
596
+ attention_mask=None,
597
+ token_type_ids=None,
598
+ position_ids=None,
599
+ head_mask=None,
600
+ inputs_embeds=None,
601
+ encoder_hidden_states=None,
602
+ encoder_attention_mask=None,
603
+ use_cache=None,
604
+ output_attentions=None,
605
+ output_hidden_states=None,
606
+ return_dict=None,
607
+ ):
608
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
609
+ output_hidden_states = (
610
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
611
+ )
612
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
613
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
614
+
615
+ if input_ids is not None and inputs_embeds is not None:
616
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
617
+ elif input_ids is not None:
618
+ input_shape = input_ids.size()
619
+ input_ids = input_ids.view(-1, input_shape[-1])
620
+ batch_size = input_ids.shape[0]
621
+ elif inputs_embeds is not None:
622
+ input_shape = inputs_embeds.size()[:-1]
623
+ batch_size = inputs_embeds.shape[0]
624
+ else:
625
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
626
+
627
+ if token_type_ids is not None:
628
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
629
+ if position_ids is not None:
630
+ position_ids = position_ids.view(-1, input_shape[-1])
631
+
632
+ if past_key_values is None:
633
+ past_length = 0
634
+ past_key_values = [None] * len(self.h)
635
+ else:
636
+ past_length = past_key_values[0][0].size(-2)
637
+ if position_ids is None:
638
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
639
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
640
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
641
+
642
+ # Attention mask.
643
+ if attention_mask is not None:
644
+ assert batch_size > 0, "batch_size has to be defined and > 0"
645
+ attention_mask = attention_mask.view(batch_size, -1)
646
+ # We create a 3D attention mask from a 2D tensor mask.
647
+ # Sizes are [batch_size, 1, 1, to_seq_length]
648
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
649
+ # this attention mask is more simple than the triangular masking of causal attention
650
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
651
+ attention_mask = attention_mask[:, None, None, :]
652
+
653
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
654
+ # masked positions, this operation will create a tensor which is 0.0 for
655
+ # positions we want to attend and -10000.0 for masked positions.
656
+ # Since we are adding it to the raw scores before the softmax, this is
657
+ # effectively the same as removing these entirely.
658
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
659
+ attention_mask = (1.0 - attention_mask) * -10000.0
660
+
661
+ # If a 2D or 3D attention mask is provided for the cross-attention
662
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
663
+ if self.config.add_cross_attention and encoder_hidden_states is not None:
664
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
665
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
666
+ if encoder_attention_mask is None:
667
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
668
+ encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
669
+ else:
670
+ encoder_attention_mask = None
671
+
672
+ # Prepare head mask if needed
673
+ # 1.0 in head_mask indicate we keep the head
674
+ # attention_probs has shape bsz x n_heads x N x N
675
+ # head_mask has shape n_layer x batch x n_heads x N x N
676
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
677
+
678
+ if inputs_embeds is None:
679
+ inputs_embeds = self.wte(input_ids)
680
+ # position_embeds = self.wpe(position_ids)
681
+ hidden_states = inputs_embeds # + position_embeds
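+ # position embeddings are deliberately disabled in this adapted copy (wpe is commented out in __init__); callers bake positional information into inputs_embeds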
682
+
683
+ if token_type_ids is not None:
684
+ token_type_embeds = self.wte(token_type_ids)
685
+ hidden_states = hidden_states + token_type_embeds
686
+
687
+ hidden_states = self.drop(hidden_states)
688
+
689
+ output_shape = input_shape + (hidden_states.size(-1),)
690
+
691
+ presents = () if use_cache else None
692
+ all_self_attentions = () if output_attentions else None
693
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
694
+ all_hidden_states = () if output_hidden_states else None
695
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
696
+
697
+ if self.use_layers is not None and i >= self.use_layers:
698
+ break
699
+
700
+ # Model parallel
701
+ if self.model_parallel:
702
+ torch.cuda.set_device(hidden_states.device)
703
+ # Ensure layer_past is on same device as hidden_states (might not be correct)
704
+ if layer_past is not None:
705
+ layer_past = layer_past.to(hidden_states.device)
706
+ # Ensure that attention_mask is always on the same device as hidden_states
707
+ if attention_mask is not None:
708
+ attention_mask = attention_mask.to(hidden_states.device)
709
+ if isinstance(head_mask, torch.Tensor):
710
+ head_mask = head_mask.to(hidden_states.device)
711
+ if output_hidden_states:
712
+ all_hidden_states = all_hidden_states + (hidden_states,)
713
+
714
+ if getattr(self.config, "gradient_checkpointing", False):
715
+
716
+ def create_custom_forward(module):
717
+ def custom_forward(*inputs):
718
+ # checkpointing only works with tuple returns, not with lists
719
+ return tuple(output for output in module(*inputs, use_cache, output_attentions))
720
+
721
+ return custom_forward
722
+
723
+ outputs = torch.utils.checkpoint.checkpoint(
724
+ create_custom_forward(block),
725
+ hidden_states,
726
+ layer_past,
727
+ attention_mask,
728
+ head_mask[i],
729
+ encoder_hidden_states,
730
+ encoder_attention_mask,
731
+ )
732
+ else:
733
+ outputs = block(
734
+ hidden_states,
735
+ layer_past=layer_past,
736
+ attention_mask=attention_mask,
737
+ head_mask=head_mask[i],
738
+ encoder_hidden_states=encoder_hidden_states,
739
+ encoder_attention_mask=encoder_attention_mask,
740
+ use_cache=use_cache,
741
+ output_attentions=output_attentions,
742
+ )
743
+
744
+ hidden_states, present = outputs[:2]
745
+ if use_cache is True:
746
+ presents = presents + (present,)
747
+
748
+ if output_attentions:
749
+ all_self_attentions = all_self_attentions + (outputs[2],)
750
+ if self.config.add_cross_attention:
751
+ all_cross_attentions = all_cross_attentions + (outputs[3],)
752
+
753
+ # Model Parallel: If it's the last layer for that device, put things on the next device
754
+ if self.model_parallel:
755
+ for k, v in self.device_map.items():
756
+ if i == v[-1] and "cuda:" + str(k) != self.last_device:
757
+ hidden_states = hidden_states.to("cuda:" + str(k + 1))
758
+
759
+ hidden_states = self.ln_f(hidden_states)
760
+
761
+ hidden_states = hidden_states.view(*output_shape)
762
+ # Add last hidden state
763
+ if output_hidden_states:
764
+ all_hidden_states = all_hidden_states + (hidden_states,)
765
+
766
+ if not return_dict:
767
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
768
+
769
+ return BaseModelOutputWithPastAndCrossAttentions(
770
+ last_hidden_state=hidden_states,
771
+ past_key_values=presents,
772
+ hidden_states=all_hidden_states,
773
+ attentions=all_self_attentions,
774
+ cross_attentions=all_cross_attentions,
775
+ )
third_party/AnyBimanual/agents/peract_bc/visual_aligner.py ADDED
@@ -0,0 +1,39 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class VisualAligner(nn.Module):
+     def __init__(self, input_dim=128, hidden_dim=256, mask_dim=128):
+         super(VisualAligner, self).__init__()
+
+         self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
+
+         self.conv_res1 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
+         self.conv_res2 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
+
+         # one mask head per arm
+         self.conv2_right = nn.Conv1d(in_channels=hidden_dim, out_channels=mask_dim, kernel_size=3, padding=1)
+         self.conv2_left = nn.Conv1d(in_channels=hidden_dim, out_channels=mask_dim, kernel_size=3, padding=1)
+
+         self.activation = nn.ReLU()
+
+     def forward(self, ins):
+         # [B, N, C] -> [B, C, N] for Conv1d
+         ins = ins.transpose(1, 2)
+
+         features = self.activation(self.conv1(ins))
+
+         # residual refinement block
+         residual = features
+         features = self.activation(self.conv_res1(features))
+         features = self.conv_res2(features)
+         features = features + residual
+
+         # non-negative per-arm masks
+         mask_right = self.activation(self.conv2_right(features))
+         mask_left = self.activation(self.conv2_left(features))
+
+         # back to [B, N, C]
+         mask_right = mask_right.transpose(1, 2)
+         mask_left = mask_left.transpose(1, 2)
+         ins = ins.transpose(1, 2)
+
+         # element-wise gating of the shared input into right/left streams
+         masked_ins1 = ins * mask_right
+         masked_ins2 = ins * mask_left
+
+         return masked_ins1, masked_ins2
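
A minimal usage sketch for the module above (batch and sequence sizes are illustrative; the only hard constraint is that the channel dimension matches input_dim/mask_dim, 128 by default):

    import torch
    from agents.peract_bc.visual_aligner import VisualAligner

    aligner = VisualAligner(input_dim=128, hidden_dim=256, mask_dim=128)
    tokens = torch.randn(2, 8000, 128)           # [B, N, C] voxel-token sequence
    right_stream, left_stream = aligner(tokens)  # per-arm masked copies of the input
    assert right_stream.shape == left_stream.shape == tokens.shape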
third_party/AnyBimanual/agents/peract_bimanual/__init__.py ADDED
@@ -0,0 +1 @@
+ import agents.peract_bimanual.launch_utils
third_party/AnyBimanual/agents/peract_bimanual/launch_utils.py ADDED
@@ -0,0 +1,117 @@
+ # Adapted from ARM
+ # Source: https://github.com/stepjam/ARM
+ # License: https://github.com/stepjam/ARM/LICENSE
+ from helpers.preprocess_agent import PreprocessAgent
+
+ from agents.peract_bimanual.perceiver_lang_io import PerceiverVoxelLangEncoder
+ from agents.peract_bimanual.qattention_peract_bc_agent import QAttentionPerActBCAgent
+ from agents.peract_bimanual.qattention_stack_agent import QAttentionStackAgent
+ from agents.peract_bimanual.skill_manager import SkillManager
+ from agents.peract_bimanual.visual_aligner import VisualAligner
+ from omegaconf import DictConfig
+ import pickle
+ import torch
+ import os
+
+
+ def create_agent(cfg: DictConfig):
+     depth_0bounds = cfg.rlbench.scene_bounds
+     cam_resolution = cfg.rlbench.camera_resolution
+
+     num_rotation_classes = int(360.0 // cfg.method.rotation_resolution)
+     qattention_agents = []
+
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+     pkl_path = os.path.join(current_dir, "../../lang_token.pkl")
+     pkl_path = os.path.abspath(pkl_path)
+     with open(pkl_path, "rb") as f:
+         embeddings_dict = pickle.load(f)
+     flattened_embeddings = []
+     for key in embeddings_dict.keys():
+         embedding = torch.tensor(embeddings_dict[key])
+         flattened_embedding = embedding.view(-1)
+         flattened_embeddings.append(flattened_embedding)
+     embeddings_matrix = torch.stack(flattened_embeddings)
+
+     skill_manager = SkillManager(num_classes=18, embedding_matrix=embeddings_matrix)
+     visual_aligner = VisualAligner()
+     for depth, vox_size in enumerate(cfg.method.voxel_sizes):
+         last = depth == len(cfg.method.voxel_sizes) - 1
+         if cfg.framework.use_skill:
+             perceiver_encoder = PerceiverVoxelLangEncoder(
+                 depth=cfg.method.transformer_depth,
+                 iterations=cfg.method.transformer_iterations,
+                 voxel_size=vox_size,
+                 initial_dim=3 + 3 + 1 + 3,
+                 low_dim_size=cfg.method.low_dim_size,
+                 layer=depth,
+                 num_rotation_classes=num_rotation_classes if last else 0,
+                 num_grip_classes=2 if last else 0,
+                 num_collision_classes=2 if last else 0,
+                 input_axis=3,
+                 num_latents=cfg.method.num_latents,
+                 latent_dim=cfg.method.latent_dim,
+                 cross_heads=cfg.method.cross_heads,
+                 latent_heads=cfg.method.latent_heads,
+                 cross_dim_head=cfg.method.cross_dim_head,
+                 latent_dim_head=cfg.method.latent_dim_head,
+                 weight_tie_layers=False,
+                 activation=cfg.method.activation,
+                 pos_encoding_with_lang=cfg.method.pos_encoding_with_lang,
+                 input_dropout=cfg.method.input_dropout,
+                 attn_dropout=cfg.method.attn_dropout,
+                 decoder_dropout=cfg.method.decoder_dropout,
+                 lang_fusion_type=cfg.method.lang_fusion_type,
+                 voxel_patch_size=cfg.method.voxel_patch_size,
+                 voxel_patch_stride=cfg.method.voxel_patch_stride,
+                 no_skip_connection=cfg.method.no_skip_connection,
+                 no_perceiver=cfg.method.no_perceiver,
+                 no_language=cfg.method.no_language,
+                 final_dim=cfg.method.final_dim,
+                 anybimanual=cfg.framework.anybimanual,
+                 skill_manager=skill_manager,
+                 visual_aligner=visual_aligner,
+             )
+
+         qattention_agent = QAttentionPerActBCAgent(
+             layer=depth,
+             coordinate_bounds=depth_0bounds,
+             perceiver_encoder=perceiver_encoder,
+             camera_names=cfg.rlbench.cameras,
+             voxel_size=vox_size,
+             bounds_offset=cfg.method.bounds_offset[depth - 1] if depth > 0 else None,
+             image_crop_size=cfg.method.image_crop_size,
+             lr=cfg.method.lr,
+             training_iterations=cfg.framework.training_iterations,
+             lr_scheduler=cfg.method.lr_scheduler,
+             num_warmup_steps=cfg.method.num_warmup_steps,
+             trans_loss_weight=cfg.method.trans_loss_weight,
+             rot_loss_weight=cfg.method.rot_loss_weight,
+             grip_loss_weight=cfg.method.grip_loss_weight,
+             collision_loss_weight=cfg.method.collision_loss_weight,
+             include_low_dim_state=True,
+             image_resolution=cam_resolution,
+             batch_size=cfg.replay.batch_size,
+             voxel_feature_size=3,
+             lambda_weight_l2=cfg.method.lambda_weight_l2,
+             num_rotation_classes=num_rotation_classes,
+             rotation_resolution=cfg.method.rotation_resolution,
+             transform_augmentation=cfg.method.transform_augmentation.apply_se3,
+             transform_augmentation_xyz=cfg.method.transform_augmentation.aug_xyz,
+             transform_augmentation_rpy=cfg.method.transform_augmentation.aug_rpy,
+             transform_augmentation_rot_resolution=cfg.method.transform_augmentation.aug_rot_resolution,
+             optimizer_type=cfg.method.optimizer,
+             num_devices=cfg.ddp.num_devices,
+             anybimanual=cfg.framework.anybimanual,
+             load_exists_weights=cfg.framework.load_existing_weights,
+             frozen=cfg.framework.frozen,
+             cfg=cfg,
+             aug_type=cfg.framework.augmentation_type,
+         )
+         qattention_agents.append(qattention_agent)
+
+     rotation_agent = QAttentionStackAgent(
+         qattention_agents=qattention_agents,
+         rotation_resolution=cfg.method.rotation_resolution,
+         camera_names=cfg.rlbench.cameras,
+     )
+     preprocess_agent = PreprocessAgent(pose_agent=rotation_agent)
+     return preprocess_agent
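
create_agent above only assumes that lang_token.pkl holds a dict whose values torch.tensor() accepts; each value is flattened and stacked into an 18-row matrix for SkillManager. A hypothetical sketch of writing a compatible file (the skill names and the 77x512 shape are illustrative placeholders, not the project's real values):

    import pickle
    import numpy as np

    # Hypothetical contents: 18 skill entries, each an array-like embedding.
    embeddings = {"skill_%02d" % i: np.random.randn(77, 512).astype("float32")
                  for i in range(18)}
    with open("lang_token.pkl", "wb") as f:
        pickle.dump(embeddings, f)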
third_party/AnyBimanual/agents/peract_bimanual/perceiver_lang_io.py ADDED
@@ -0,0 +1,628 @@
+ # Perceiver IO implementation adapted for manipulation
+ # Source: https://github.com/lucidrains/perceiver-pytorch
+ # License: https://github.com/lucidrains/perceiver-pytorch/blob/main/LICENSE
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ from einops import repeat
+ import numpy as np
+ from perceiver_pytorch.perceiver_pytorch import cache_fn
+ from perceiver_pytorch.perceiver_pytorch import PreNorm, FeedForward, Attention
+
+ from helpers.network_utils import (
+     DenseBlock,
+     SpatialSoftmax3D,
+     Conv3DBlock,
+     Conv3DUpsampleBlock,
+ )
+
+ def symmetric_kl_divergence(left, right):
+     eps = 1e-2
+     left_prob = torch.clamp(F.log_softmax(left, dim=-1), min=-10, max=10)
+     right_prob = torch.clamp(F.log_softmax(right, dim=-1), min=-10, max=10)
+
+     kl_left_to_right = F.kl_div(left_prob, right_prob.exp(), reduction="batchmean") * eps
+     kl_right_to_left = F.kl_div(right_prob, left_prob.exp(), reduction="batchmean") * eps
+
+     symmetric_kl = -(kl_left_to_right + kl_right_to_left) / 2.0
+     return symmetric_kl
+
+ def l1_norm(tensor):
+     return torch.sum(torch.abs(tensor)) + 1e-4 * torch.norm(tensor)
+
+ def l2_1_norm(tensor):
+     l2_norm_per_skill = torch.norm(tensor, dim=-1)
+     return torch.sum(l2_norm_per_skill)
+
+ torch.autograd.set_detect_anomaly(True)
+
+ # PerceiverIO adapted for 6-DoF manipulation
+ class PerceiverVoxelLangEncoder(nn.Module):
+     def __init__(
+         self,
+         depth,  # number of self-attention layers
+         iterations,  # number of cross-attention iterations (PerceiverIO uses just 1)
+         voxel_size,  # N voxels per side (size: N*N*N)
+         initial_dim,  # 10 dimensions - dimension of the input sequence to be encoded
+         low_dim_size,  # 4 dimensions - proprioception: {gripper_open, left_finger, right_finger, timestep}
+         layer=0,
+         num_rotation_classes=72,  # 5 degree increments (5*72=360) for each of the 3 axes
+         num_grip_classes=2,  # open or not open
+         num_collision_classes=2,  # collisions allowed or not allowed
+         input_axis=3,  # 3D tensors have 3 axes
+         num_latents=512,  # number of latent vectors
+         im_channels=64,  # intermediate channel size
+         latent_dim=512,  # dimensions of latent vectors
+         cross_heads=1,  # number of cross-attention heads
+         latent_heads=8,  # number of latent heads
+         cross_dim_head=64,
+         latent_dim_head=64,
+         activation="relu",
+         weight_tie_layers=False,
+         pos_encoding_with_lang=True,
+         input_dropout=0.1,
+         attn_dropout=0.1,
+         decoder_dropout=0.0,
+         lang_fusion_type="seq",
+         voxel_patch_size=9,
+         voxel_patch_stride=8,
+         no_skip_connection=False,
+         no_perceiver=False,
+         no_language=False,
+         final_dim=64,
+         anybimanual=False,
+         skill_manager=None,
+         visual_aligner=None,
+     ):
+         super().__init__()
+         self.depth = depth
+         self.layer = layer
+         self.init_dim = int(initial_dim)
+         self.iterations = iterations
+         self.input_axis = input_axis
+         self.voxel_size = voxel_size
+         self.low_dim_size = low_dim_size
+         self.im_channels = im_channels
+         self.pos_encoding_with_lang = pos_encoding_with_lang
+         self.lang_fusion_type = lang_fusion_type
+         self.voxel_patch_size = voxel_patch_size
+         self.voxel_patch_stride = voxel_patch_stride
+         self.num_rotation_classes = num_rotation_classes
+         self.num_grip_classes = num_grip_classes
+         self.num_collision_classes = num_collision_classes
+         self.final_dim = final_dim
+         self.input_dropout = input_dropout
+         self.attn_dropout = attn_dropout
+         self.decoder_dropout = decoder_dropout
+         self.no_skip_connection = no_skip_connection
+         self.no_perceiver = no_perceiver
+         self.no_language = no_language
+         self.anybimanual = anybimanual
+         self.skill_manager = skill_manager
+         self.visual_aligner = visual_aligner
+
+         # patchified input dimensions
+         spatial_size = voxel_size // self.voxel_patch_stride  # 100 / 5 = 20
+         # 64 voxel features + 64 proprio features (+ 64 lang goal features if concatenated)
+         self.input_dim_before_seq = (
+             self.im_channels * 3
+             if self.lang_fusion_type == "concat"
+             else self.im_channels * 2
+         )
+         if self.anybimanual:
+             self.input_dim_before_seq_ = self.input_dim_before_seq * 2
+         else:
+             self.input_dim_before_seq_ = self.input_dim_before_seq
+
+         # CLIP language feature dimensions
+         if self.anybimanual:
+             lang_feat_dim, lang_emb_dim, lang_max_seq_len = 1024, 512, 154
+         else:
+             lang_feat_dim, lang_emb_dim, lang_max_seq_len = 1024, 512, 77
+
+         # learnable positional encoding
+         # peract2: pos_encoding_with_lang = True / peract: False?
+         if self.pos_encoding_with_lang:
+             self.pos_encoding = nn.Parameter(
+                 torch.randn(
+                     1, lang_max_seq_len + spatial_size**3, self.input_dim_before_seq
+                 )
+             )
+         else:
+             # assert self.lang_fusion_type == 'concat', 'Only concat is supported for pos encoding without lang.'
+             self.pos_encoding = nn.Parameter(
+                 torch.randn(
+                     1,
+                     spatial_size,
+                     spatial_size,
+                     spatial_size,
+                     self.input_dim_before_seq,
+                 )
+             )
+
+         # voxel input preprocessing 1x1 conv encoder
+         self.input_preprocess = Conv3DBlock(
+             self.init_dim,
+             self.im_channels,
+             kernel_sizes=1,
+             strides=1,
+             norm=None,
+             activation=activation,
+         )
+
+         # patchify conv
+         self.patchify = Conv3DBlock(
+             self.input_preprocess.out_channels,
+             self.im_channels,
+             kernel_sizes=self.voxel_patch_size,
+             strides=self.voxel_patch_stride,
+             norm=None,
+             activation=activation,
+         )
+
+         # language preprocess
+         if self.lang_fusion_type == "concat":
+             self.lang_preprocess = nn.Linear(lang_feat_dim, self.im_channels)
+         elif self.lang_fusion_type == "seq":
+             self.lang_preprocess = nn.Linear(lang_emb_dim, self.im_channels * 2)
+
+         # proprioception
+         if self.low_dim_size > 0:
+             self.proprio_preprocess = DenseBlock(
+                 self.low_dim_size,
+                 self.im_channels,
+                 norm=None,
+                 activation=activation,
+             )
+
+         # pooling functions
+         self.local_maxp = nn.MaxPool3d(3, 2, padding=1)
+         self.global_maxp = nn.AdaptiveMaxPool3d(1)
+
+         # 1st 3D softmax
+         self.ss0 = SpatialSoftmax3D(
+             self.voxel_size, self.voxel_size, self.voxel_size, self.im_channels
+         )
+         flat_size = self.im_channels * 4
+
+         # latent vectors (that are randomly initialized)
+         self.latents = nn.Parameter(torch.randn(num_latents, latent_dim))
+
+         if self.anybimanual:
+             # per-arm encoder cross attention
+             self.cross_attend_blocks = nn.ModuleList(
+                 [
+                     PreNorm(
+                         latent_dim,
+                         Attention(
+                             latent_dim,
+                             self.input_dim_before_seq_,
+                             heads=cross_heads,
+                             dim_head=cross_dim_head,
+                             dropout=input_dropout,
+                         ),
+                         context_dim=self.input_dim_before_seq_,
+                     ),
+                     PreNorm(
+                         latent_dim,
+                         Attention(
+                             latent_dim,
+                             self.input_dim_before_seq_,
+                             heads=cross_heads,
+                             dim_head=cross_dim_head,
+                             dropout=input_dropout,
+                         ),
+                         context_dim=self.input_dim_before_seq_,
+                     ),
+                     PreNorm(latent_dim, FeedForward(latent_dim)),
+                     PreNorm(latent_dim, FeedForward(latent_dim)),
+                 ]
+             )
+         else:
+             # encoder cross attention
+             self.cross_attend_blocks = nn.ModuleList(
+                 [
+                     PreNorm(
+                         latent_dim,
+                         Attention(
+                             latent_dim,
+                             self.input_dim_before_seq_,
+                             heads=cross_heads,
+                             dim_head=cross_dim_head,
+                             dropout=input_dropout,
+                         ),
+                         context_dim=self.input_dim_before_seq_,
+                     ),
+                     PreNorm(latent_dim, FeedForward(latent_dim)),
+                     PreNorm(latent_dim, FeedForward(latent_dim)),
+                 ]
+             )
+
+         get_latent_attn = lambda: PreNorm(
+             latent_dim,
+             Attention(
+                 latent_dim,
+                 heads=latent_heads,
+                 dim_head=latent_dim_head,
+                 dropout=attn_dropout,
+             ),
+         )
+         get_latent_ff = lambda: PreNorm(latent_dim, FeedForward(latent_dim))
+         get_latent_attn, get_latent_ff = map(cache_fn, (get_latent_attn, get_latent_ff))
+
+         # self attention layers
+         self.layers = nn.ModuleList([])
+         cache_args = {"_cache": weight_tie_layers}
+
+         for i in range(depth):
+             self.layers.append(
+                 nn.ModuleList(
+                     [get_latent_attn(**cache_args), get_latent_ff(**cache_args),
+                      get_latent_attn(**cache_args), get_latent_ff(**cache_args)]
+                 )
+             )
+
+         self.combined_latent_attn = get_latent_attn(**cache_args)
+         self.combined_latent_ff = get_latent_ff(**cache_args)
+
+         # decoder cross attention
+         self.decoder_cross_attn_right = PreNorm(
+             self.input_dim_before_seq_,
+             Attention(
+                 self.input_dim_before_seq_,
+                 latent_dim,
+                 heads=cross_heads,
+                 dim_head=cross_dim_head,
+                 dropout=decoder_dropout,
+             ),
+             context_dim=latent_dim,
+         )
+
+         self.decoder_cross_attn_left = PreNorm(
+             self.input_dim_before_seq_,
+             Attention(
+                 self.input_dim_before_seq_,
+                 latent_dim,
+                 heads=cross_heads,
+                 dim_head=cross_dim_head,
+                 dropout=decoder_dropout,
+             ),
+             context_dim=latent_dim,
+         )
+
+         # upsample conv
+         self.up0 = Conv3DUpsampleBlock(
+             self.input_dim_before_seq_,
+             self.final_dim,
+             kernel_sizes=self.voxel_patch_size,
+             strides=self.voxel_patch_stride,
+             norm=None,
+             activation=activation,
+         )
+
+         # 2nd 3D softmax
+         self.ss1 = SpatialSoftmax3D(
+             spatial_size, spatial_size, spatial_size, self.input_dim_before_seq_
+         )
+
+         flat_size += self.input_dim_before_seq_ * 4
+
+         # final 3D softmax
+         self.final = Conv3DBlock(
+             self.im_channels
+             if (self.no_perceiver or self.no_skip_connection)
+             else self.im_channels * 2,
+             self.im_channels,
+             kernel_sizes=3,
+             strides=1,
+             norm=None,
+             activation=activation,
+         )
+
+         self.right_trans_decoder = Conv3DBlock(
+             self.final_dim,
+             1,
+             kernel_sizes=3,
+             strides=1,
+             norm=None,
+             activation=None,
+         )
+
+         self.left_trans_decoder = Conv3DBlock(
+             self.final_dim,
+             1,
+             kernel_sizes=3,
+             strides=1,
+             norm=None,
+             activation=None,
+         )
+
+         # rotation, gripper, and collision MLP layers
+         if self.num_rotation_classes > 0:
+             self.ss_final = SpatialSoftmax3D(
+                 self.voxel_size, self.voxel_size, self.voxel_size, self.im_channels
+             )
+
+             flat_size += self.im_channels * 4
+
+             self.right_dense0 = DenseBlock(flat_size, 256, None, activation)
+             self.right_dense1 = DenseBlock(256, self.final_dim, None, activation)
+
+             self.left_dense0 = DenseBlock(flat_size, 256, None, activation)
+             self.left_dense1 = DenseBlock(256, self.final_dim, None, activation)
+
+             self.right_rot_grip_collision_ff = DenseBlock(
+                 self.final_dim,
+                 self.num_rotation_classes * 3
+                 + self.num_grip_classes
+                 + self.num_collision_classes,
+                 None,
+                 None,
+             )
+
+             self.left_rot_grip_collision_ff = DenseBlock(
+                 self.final_dim,
+                 self.num_rotation_classes * 3
+                 + self.num_grip_classes
+                 + self.num_collision_classes,
+                 None,
+                 None,
+             )
+
+     def encode_text(self, x):
+         with torch.no_grad():
+             text_feat, text_emb = self._clip_rn50.encode_text_with_embeddings(x)
+
+         text_feat = text_feat.detach()
+         text_emb = text_emb.detach()
+         text_mask = torch.where(x == 0, x, 1)  # [1, max_token_len]
+         return text_feat, text_emb
+
+     def forward(
+         self,
+         ins,
+         proprio,
+         lang_goal_emb,
+         lang_token_embs,
+         prev_layer_voxel_grid,
+         bounds,
+         prev_layer_bounds,
+         mask=None,
+     ):
+         # preprocess input
+         ins_numpy = str(ins.cpu().numpy())  # (debug leftover; unused)
+         d0 = self.input_preprocess(ins)  # [B,10,100,100,100] -> [B,64,100,100,100]
+
+         # aggregated features from 1st softmax and maxpool for MLP decoders
+         feats = [self.ss0(d0.contiguous()), self.global_maxp(d0).view(ins.shape[0], -1)]
+
+         # patchify input (5x5x5 patches)
+         ins = self.patchify(d0)  # [B,64,100,100,100] -> [B,64,20,20,20]
+
+         b, c, d, h, w, device = *ins.shape, ins.device
+         axis = [d, h, w]
+         assert (
+             len(axis) == self.input_axis
+         ), "input must have the same number of axis as input_axis"
+
+         # concat proprio
+         if self.low_dim_size > 0:
+             p = self.proprio_preprocess(proprio)  # [B,8] -> [B,64]
+             p = p.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).repeat(1, 1, d, h, w)
+             ins = torch.cat([ins, p], dim=1)  # [B,128,20,20,20]
+
+         # language ablation
+         if self.no_language:
+             lang_goal_emb = torch.zeros_like(lang_goal_emb)
+             lang_token_embs = torch.zeros_like(lang_token_embs)
+
+         # option 1: tile and concat lang goal to input
+         if self.lang_fusion_type == "concat":
+             lang_emb = lang_goal_emb
+             lang_emb = lang_emb.to(dtype=ins.dtype)
+             l = self.lang_preprocess(lang_emb)
+             l = l.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).repeat(1, 1, d, h, w)
+             ins = torch.cat([ins, l], dim=1)
+
+         # channel last
+         ins = rearrange(ins, "b d ... -> b ... d")  # [B,20,20,20,128]
+
+         # add pos encoding to grid
+         if not self.pos_encoding_with_lang:
+             ins = ins + self.pos_encoding
+
+         ######################## NOTE #############################
+         # NOTE: If you add positional encodings ^here the lang embs
+         # won't have positional encodings. I accidentally forgot
+         # to turn this off for all the experiments in the paper.
+         # So I guess those models were using language embs
+         # as a bag of words :( But it doesn't matter much for
+         # RLBench tasks since we don't test for novel instructions
+         # at test time anyway. The recommended way is to add
+         # positional encodings to the final input sequence
+         # fed into the Perceiver Transformer, as done below
+         # (and also in the Colab tutorial).
+         ###########################################################
+
+         # remember the shape before flattening the spatial axes
+         queries_orig_shape = ins.shape
+
+         # rearrange input to be channel last
+         ins = rearrange(ins, "b ... d -> b (...) d")  # [B,8000,128]
+         ins_wo_prev_layers = ins
+
+         # option 2: add lang token embs as a sequence
+         if self.anybimanual:
+             if self.lang_fusion_type == "seq":
+                 l = self.lang_preprocess(lang_token_embs)  # [B,77,512] -> [B,77,128]
+                 mask_right, mask_left = self.visual_aligner(ins)
+                 L_voxel = symmetric_kl_divergence(mask_left, mask_right)
+                 right_skill = self.skill_manager(mask_right, l)
+                 left_skill = self.skill_manager(mask_left, l)
+                 right_skill = self.lang_preprocess(right_skill)
+                 left_skill = self.lang_preprocess(left_skill)
+                 L_skill = (
+                     l1_norm(left_skill) + l1_norm(right_skill) +
+                     0.01 * (l2_1_norm(left_skill) + l2_1_norm(right_skill))
+                 )
+                 l_right = torch.cat((right_skill, l), dim=1)
+                 ins_right = torch.cat((l_right, mask_right), dim=1)
+                 l_left = torch.cat((left_skill, l), dim=1)
+                 ins_left = torch.cat((l_left, mask_left), dim=1)
+                 if self.pos_encoding_with_lang:
+                     ins_right = ins_right + self.pos_encoding
+                     ins_left = ins_left + self.pos_encoding
+         else:
+             if self.lang_fusion_type == "seq":
+                 # print(lang_token_embs.requires_grad)  # False
+                 l = self.lang_preprocess(lang_token_embs)  # [B,77,512] -> [B,77,128]
+                 # print(l.requires_grad)  # True
+                 ins = torch.cat((l, ins), dim=1)
+                 # add pos encoding to language + flattened grid (the recommended way)
+                 if self.pos_encoding_with_lang:
+                     ins = ins + self.pos_encoding
+
+         # batchify latents
+         if self.anybimanual:
+             x = repeat(self.latents, "n d -> b n d", b=b)
+             cross_attn_right, cross_attn_left, cross_ff_right, cross_ff_left = self.cross_attend_blocks
+         else:
+             x = repeat(self.latents, "n d -> b n d", b=b)
+             cross_attn, cross_ff_right, cross_ff_left = self.cross_attend_blocks
+
+         if self.anybimanual:
+             ins_r = torch.cat((l_right, ins), dim=1)
+             ins_l = torch.cat((l_left, ins), dim=1)
+             ins_right = torch.cat((ins_right, ins_r), dim=2)
+             ins_left = torch.cat((ins_left, ins_l), dim=2)
+
+         for it in range(self.iterations):
+             # encoder cross attention
+             if self.anybimanual:
+                 x_r, x_l = x.chunk(2, dim=1)
+                 x_right = cross_attn_right(x_r, context=ins_right, mask=mask) + x_r
+                 x_left = cross_attn_left(x_l, context=ins_left, mask=mask) + x_l
+             else:
+                 x = cross_attn(x, context=ins, mask=mask) + x
+                 x_right, x_left = x.chunk(2, dim=1)
+             x_right = cross_ff_right(x_right) + x_right
+             x_left = cross_ff_left(x_left) + x_left
+
+             # self-attention layers
+             for self_attn_right, self_ff_right, self_attn_left, self_ff_left in self.layers:
+
+                 x_right = self_attn_right(x_right) + x_right
+                 x_right = self_ff_right(x_right) + x_right
+
+                 x_left = self_attn_left(x_left) + x_left
+                 x_left = self_ff_left(x_left) + x_left
+
+             x = torch.concat([x_right, x_left], dim=1)
+             x = self.combined_latent_attn(x) + x
+             x = self.combined_latent_ff(x) + x
+
+         x_right, x_left = x.chunk(2, dim=1)
+
+         # decoder cross attention
+         if self.anybimanual:
+             latents_right = self.decoder_cross_attn_right(ins_right, context=x_right)
+             latents_left = self.decoder_cross_attn_left(ins_left, context=x_left)
+             # crop out the language part of the output sequence
+             if self.lang_fusion_type == "seq":
+                 latents_right = latents_right[:, l_right.shape[1] :]
+                 latents_left = latents_left[:, l_left.shape[1] :]
+         else:
+             latents_right = self.decoder_cross_attn_right(ins, context=x_right)
+             latents_left = self.decoder_cross_attn_left(ins, context=x_left)
+             # crop out the language part of the output sequence
+             if self.lang_fusion_type == "seq":
+                 latents_right = latents_right[:, l.shape[1] :]
+                 latents_left = latents_left[:, l.shape[1] :]
+
+         # reshape back to voxel grid
+         latents_right = latents_right.view(
+             b, *queries_orig_shape[1:-1], latents_right.shape[-1]
+         )  # [B,20,20,20,64]
+         latents_right = rearrange(latents_right, "b ... d -> b d ...")  # [B,64,20,20,20]
+
+         # reshape back to voxel grid
+         latents_left = latents_left.view(
+             b, *queries_orig_shape[1:-1], latents_left.shape[-1]
+         )  # [B,20,20,20,64]
+         latents_left = rearrange(latents_left, "b ... d -> b d ...")  # [B,64,20,20,20]
+
+         # aggregated features from 2nd softmax and maxpool for MLP decoders
+         feats_right = feats.copy()
+         feats_left = feats  # note: aliases feats (extended in place below)
+
+         feats_right.extend(
+             [self.ss1(latents_right.contiguous()), self.global_maxp(latents_right).view(b, -1)]
+         )
+         feats_left.extend(
+             [self.ss1(latents_left.contiguous()), self.global_maxp(latents_left).view(b, -1)]
+         )
+
+         # upsample
+         u0_right = self.up0(latents_right)
+         u0_left = self.up0(latents_left)
+
+         # ablations
+         if self.no_skip_connection:
+             u_right = self.final(u0_right)
+             u_left = self.final(u0_left)
+         elif self.no_perceiver:
+             u_right = self.final(d0)
+             u_left = self.final(d0)
+         else:
+             u_right = self.final(torch.cat([d0, u0_right], dim=1))
+             u_left = self.final(torch.cat([d0, u0_left], dim=1))
+
+         # translation decoder
+         right_trans = self.right_trans_decoder(u_right)
+         left_trans = self.left_trans_decoder(u_left)
+
+         # rotation, gripper, and collision MLPs
+         rot_and_grip_out = None
+         if self.num_rotation_classes > 0:
+             feats_right.extend(
+                 [self.ss_final(u_right.contiguous()), self.global_maxp(u_right).view(b, -1)]
+             )
+
+             right_dense0 = self.right_dense0(torch.cat(feats_right, dim=1))
+             right_dense1 = self.right_dense1(right_dense0)  # [B,72*3+2+2]
+
+             right_rot_and_grip_collision_out = self.right_rot_grip_collision_ff(
+                 right_dense1
+             )
+             right_rot_and_grip_out = right_rot_and_grip_collision_out[
+                 :, : -self.num_collision_classes
+             ]
+             right_collision_out = right_rot_and_grip_collision_out[
+                 :, -self.num_collision_classes :
+             ]
+
+             feats_left.extend(
+                 [self.ss_final(u_left.contiguous()), self.global_maxp(u_left).view(b, -1)]
+             )
+
+             left_dense0 = self.left_dense0(torch.cat(feats_left, dim=1))
+             left_dense1 = self.left_dense1(left_dense0)  # [B,72*3+2+2]
+
+             left_rot_and_grip_collision_out = self.left_rot_grip_collision_ff(
+                 left_dense1
+             )
+             left_rot_and_grip_out = left_rot_and_grip_collision_out[
+                 :, : -self.num_collision_classes
+             ]
+             left_collision_out = left_rot_and_grip_collision_out[
+                 :, -self.num_collision_classes :
+             ]
+
+         if not self.anybimanual:
+             L_skill = 0
+             L_voxel = 0
+         return (
+             right_trans,
+             right_rot_and_grip_out,
+             right_collision_out,
+             left_trans,
+             left_rot_and_grip_out,
+             left_collision_out,
+         ), L_skill, L_voxel
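
The auxiliary terms returned above come from the helpers at the top of this file. A minimal sketch of evaluating them standalone (shapes are illustrative; the import assumes the module path used elsewhere in this repo):

    import torch
    from agents.peract_bimanual.perceiver_lang_io import (
        symmetric_kl_divergence, l1_norm, l2_1_norm)

    left = torch.randn(2, 10, 128)   # e.g. per-arm mask activations
    right = torch.randn(2, 10, 128)

    # As written, symmetric_kl_divergence returns the *negated* scaled mean of
    # the two directional KL terms, so minimizing it drives the two
    # distributions apart (per-arm specialization).
    print(symmetric_kl_divergence(left, right))

    # Sparsity terms of the kind that enter L_skill in forward() above.
    print(l1_norm(left) + 0.01 * l2_1_norm(left))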
third_party/AnyBimanual/agents/peract_bimanual/qattention_peract_bc_agent.py ADDED
@@ -0,0 +1,1317 @@
1
+ import copy
2
+ import logging
3
+ import os
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torchvision import transforms
11
+ from pytorch3d import transforms as torch3d_tf
12
+ from yarr.agents.agent import (
13
+ Agent,
14
+ ActResult,
15
+ ScalarSummary,
16
+ HistogramSummary,
17
+ ImageSummary,
18
+ Summary,
19
+ )
20
+ import io
21
+ import PIL.Image as Image
22
+ import matplotlib.pyplot as plt
23
+ from helpers import utils
24
+ from helpers.utils import visualise_voxel, stack_on_channel
25
+ from voxel import augmentation_ab
26
+ from voxel.voxel_grid import VoxelGrid
27
+ from voxel.augmentation import apply_se3_augmentation
28
+ from einops import rearrange
29
+ from helpers.clip.core.clip import build_model, load_clip
30
+
31
+ import transformers
32
+ from helpers.optim.lamb import Lamb
33
+ import wandb
34
+ from termcolor import colored, cprint
35
+ from torch.nn.parallel import DistributedDataParallel as DDP
36
+ NAME = "QAttentionAgent"
37
+ import plotly.graph_objects as go
38
+
39
+ class QFunction(nn.Module):
40
+ def __init__(
41
+ self,
42
+ perceiver_encoder: nn.Module,
43
+ voxelizer: VoxelGrid,
44
+ bounds_offset: float,
45
+ rotation_resolution: float,
46
+ device,
47
+ training,
48
+ ):
49
+ super(QFunction, self).__init__()
50
+ self._rotation_resolution = rotation_resolution
51
+ self._voxelizer = voxelizer
52
+ self._bounds_offset = bounds_offset
53
+ self._qnet = perceiver_encoder.to(device)
54
+ # distributed training
55
+ if training:
56
+ self._qnet = DDP(self._qnet, device_ids=[device], find_unused_parameters=True)
57
+
58
+ def _argmax_3d(self, tensor_orig):
59
+ b, c, d, h, w = tensor_orig.shape # c will be one
60
+ idxs = tensor_orig.view(b, c, -1).argmax(-1)
61
+ indices = torch.cat([((idxs // h) // d), (idxs // h) % w, idxs % w], 1)
62
+ return indices
63
+
64
+ def choose_highest_action(self, q_trans, q_rot_grip, q_collision):
65
+ coords = self._argmax_3d(q_trans)
66
+ rot_and_grip_indicies = None
67
+ ignore_collision = None
68
+ if q_rot_grip is not None:
69
+ q_rot = torch.stack(
70
+ torch.split(
71
+ q_rot_grip[:, :-2], int(360 // self._rotation_resolution), dim=1
72
+ ),
73
+ dim=1,
74
+ )
75
+ rot_and_grip_indicies = torch.cat(
76
+ [
77
+ q_rot[:, 0:1].argmax(-1),
78
+ q_rot[:, 1:2].argmax(-1),
79
+ q_rot[:, 2:3].argmax(-1),
80
+ q_rot_grip[:, -2:].argmax(-1, keepdim=True),
81
+ ],
82
+ -1,
83
+ )
84
+ ignore_collision = q_collision[:, -2:].argmax(-1, keepdim=True)
85
+ return coords, rot_and_grip_indicies, ignore_collision
86
+
87
+ def forward(
88
+ self,
89
+ rgb_pcd,
90
+ proprio,
91
+ pcd,
92
+ lang_goal_emb,
93
+ lang_token_embs,
94
+ bounds=None,
95
+ prev_bounds=None,
96
+ prev_layer_voxel_grid=None,
97
+ ):
98
+ # rgb_pcd will be list of list (list of [rgb, pcd])
99
+ b = rgb_pcd[0][0].shape[0]
100
+ pcd_flat = torch.cat([p.permute(0, 2, 3, 1).reshape(b, -1, 3) for p in pcd], 1)
101
+
102
+ # flatten RGBs and Pointclouds
103
+ rgb = [rp[0] for rp in rgb_pcd]
104
+ feat_size = rgb[0].shape[1]
105
+ flat_imag_features = torch.cat(
106
+ [p.permute(0, 2, 3, 1).reshape(b, -1, feat_size) for p in rgb], 1
107
+ )
108
+
109
+ # construct voxel grid
110
+ voxel_grid = self._voxelizer.coords_to_bounding_voxel_grid(
111
+ pcd_flat, coord_features=flat_imag_features, coord_bounds=bounds
112
+ )
113
+
114
+ # swap to channels fist
115
+ voxel_grid = voxel_grid.permute(0, 4, 1, 2, 3).detach()
116
+
117
+ # print(voxel_grid.shape) # [b, 10, 100, 100, 100]
118
+ # batch bounds if necessary
119
+ if bounds.shape[0] != b:
120
+ bounds = bounds.repeat(b, 1)
121
+ # print(lang_goal_emb.shape) # [B, 1024]
122
+ # forward pass
123
+
124
+ #TO DO: return more information
125
+ split_pred, L_skill, L_voxel = self._qnet(
126
+ voxel_grid,
127
+ proprio,
128
+ lang_goal_emb,
129
+ lang_token_embs,
130
+ prev_layer_voxel_grid,
131
+ bounds,
132
+ prev_bounds,
133
+ )
134
+ return split_pred, voxel_grid, L_skill, L_voxel
135
+
136
+
137
+ class QAttentionPerActBCAgent(Agent):
138
+ def __init__(
139
+ self,
140
+ layer: int,
141
+ coordinate_bounds: list,
142
+ perceiver_encoder: nn.Module,
143
+ camera_names: list,
144
+ batch_size: int,
145
+ voxel_size: int,
146
+ bounds_offset: float,
147
+ voxel_feature_size: int,
148
+ image_crop_size: int,
149
+ num_rotation_classes: int,
150
+ rotation_resolution: float,
151
+ lr: float = 0.0001,
152
+ lr_scheduler: bool = False,
153
+ training_iterations: int = 100000,
154
+ num_warmup_steps: int = 20000,
155
+ trans_loss_weight: float = 1.0,
156
+ rot_loss_weight: float = 1.0,
157
+ grip_loss_weight: float = 1.0,
158
+ collision_loss_weight: float = 1.0,
159
+ include_low_dim_state: bool = False,
160
+ image_resolution: list = None,
161
+ lambda_weight_l2: float = 0.0,
162
+ transform_augmentation: bool = True,
163
+ transform_augmentation_xyz: list = [0.0, 0.0, 0.0],
164
+ transform_augmentation_rpy: list = [0.0, 0.0, 180.0],
165
+ transform_augmentation_rot_resolution: int = 5,
166
+ optimizer_type: str = "adam",
167
+ num_devices: int = 1,
168
+ anybimanual = False,
169
+ load_exists_weights = False,
170
+ frozen = False,
171
+ cfg = None,
172
+ aug_type = "standard",
173
+ ):
174
+ self.frozen = frozen
175
+ self.load = load_exists_weights
176
+ self._layer = layer
177
+ self._coordinate_bounds = coordinate_bounds
178
+ self._perceiver_encoder = perceiver_encoder
179
+ self._voxel_feature_size = voxel_feature_size
180
+ self._bounds_offset = bounds_offset
181
+ self._image_crop_size = image_crop_size
182
+ self._lr = lr
183
+ self._lr_scheduler = lr_scheduler
184
+ self._training_iterations = training_iterations
185
+ self._num_warmup_steps = num_warmup_steps
186
+ self._trans_loss_weight = trans_loss_weight
187
+ self._rot_loss_weight = rot_loss_weight
188
+ self._grip_loss_weight = grip_loss_weight
189
+ self._collision_loss_weight = collision_loss_weight
190
+ self._include_low_dim_state = include_low_dim_state
191
+ self._image_resolution = image_resolution or [128, 128]
192
+ self._voxel_size = voxel_size
193
+ self._camera_names = camera_names
194
+ self._num_cameras = len(camera_names)
195
+ self._batch_size = batch_size
196
+ self._lambda_weight_l2 = lambda_weight_l2
197
+ self._transform_augmentation = transform_augmentation
198
+ self._transform_augmentation_xyz = torch.from_numpy(
199
+ np.array(transform_augmentation_xyz)
200
+ )
201
+ self._transform_augmentation_rpy = transform_augmentation_rpy
202
+ self._transform_augmentation_rot_resolution = (
203
+ transform_augmentation_rot_resolution
204
+ )
205
+ self._optimizer_type = optimizer_type
206
+ self._num_devices = num_devices
207
+ self._num_rotation_classes = num_rotation_classes
208
+ self._rotation_resolution = rotation_resolution
209
+
210
+ self._cross_entropy_loss = nn.CrossEntropyLoss(reduction="none")
211
+ self._name = NAME + "_layer" + str(self._layer)
212
+ self.anybimanual = anybimanual
213
+ self.aug_type = aug_type
214
+ self.cfg = cfg
215
+ def build(self, training: bool, device: torch.device = None):
216
+ self._training = training
217
+
218
+ if device is None:
219
+ device = torch.device("cpu")
220
+
221
+ self._device = device
222
+
223
+ self._voxelizer = VoxelGrid(
224
+ coord_bounds=self._coordinate_bounds,
225
+ voxel_size=self._voxel_size,
226
+ device=device,
227
+ batch_size=self._batch_size if training else 1,
228
+ feature_size=self._voxel_feature_size,
229
+ max_num_coords=np.prod(self._image_resolution) * self._num_cameras,
230
+ )
231
+
232
+ self._q = (
233
+ QFunction(
234
+ self._perceiver_encoder,
235
+ self._voxelizer,
236
+ self._bounds_offset,
237
+ self._rotation_resolution,
238
+ device,
239
+ training,
240
+ )
241
+ .to(device)
242
+ .train(training)
243
+ )
244
+
245
+ grid_for_crop = (
246
+ torch.arange(0, self._image_crop_size, device=device)
247
+ .unsqueeze(0)
248
+ .repeat(self._image_crop_size, 1)
249
+ .unsqueeze(-1)
250
+ )
251
+ self._grid_for_crop = torch.cat(
252
+ [grid_for_crop.transpose(1, 0), grid_for_crop], dim=2
253
+ ).unsqueeze(0)
254
+
255
+ self._coordinate_bounds = torch.tensor(
256
+ self._coordinate_bounds, device=device
257
+ ).unsqueeze(0)
258
+
259
+ if self._training:
260
+ # optimizer
261
+ if self._optimizer_type == "lamb":
262
+ self._optimizer = Lamb(
263
+ self._q.parameters(),
264
+ lr=self._lr,
265
+ weight_decay=self._lambda_weight_l2,
266
+ betas=(0.9, 0.999),
267
+ adam=False,
268
+ )
269
+ elif self._optimizer_type == "adam":
270
+ self._optimizer = torch.optim.Adam(
271
+ self._q.parameters(),
272
+ lr=self._lr,
273
+ weight_decay=self._lambda_weight_l2,
274
+ )
275
+ else:
276
+ raise Exception("Unknown optimizer type")
277
+
278
+ # learning rate scheduler
279
+ if self._lr_scheduler:
280
+ self._scheduler = (
281
+ transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
282
+ self._optimizer,
283
+ num_warmup_steps=self._num_warmup_steps,
284
+ num_training_steps=self._training_iterations,
285
+ num_cycles=self._training_iterations // 10000,
286
+ )
287
+ )
288
+
289
+ # one-hot zero tensors
290
+ self._action_trans_one_hot_zeros = torch.zeros(
291
+ (
292
+ self._batch_size,
293
+ 1,
294
+ self._voxel_size,
295
+ self._voxel_size,
296
+ self._voxel_size,
297
+ ),
298
+ dtype=int,
299
+ device=device,
300
+ )
301
+ self._action_rot_x_one_hot_zeros = torch.zeros(
302
+ (self._batch_size, self._num_rotation_classes), dtype=int, device=device
303
+ )
304
+ self._action_rot_y_one_hot_zeros = torch.zeros(
305
+ (self._batch_size, self._num_rotation_classes), dtype=int, device=device
306
+ )
307
+ self._action_rot_z_one_hot_zeros = torch.zeros(
308
+ (self._batch_size, self._num_rotation_classes), dtype=int, device=device
309
+ )
310
+ self._action_grip_one_hot_zeros = torch.zeros(
311
+ (self._batch_size, 2), dtype=int, device=device
312
+ )
313
+ self._action_ignore_collisions_one_hot_zeros = torch.zeros(
314
+ (self._batch_size, 2), dtype=int, device=device
315
+ )
316
+
317
+ # print total params
318
+ logging.info(
319
+ "# Q Params: %d"
320
+ % sum(
321
+ p.numel()
322
+ for name, p in self._q.named_parameters()
323
+ if p.requires_grad and "clip" not in name
324
+ )
325
+ )
326
+ # for name, p in self._q.named_parameters():
327
+ # print(f"Param: {name}, requires_grad: {p.requires_grad}")
328
+ else:
329
+ for param in self._q.parameters():
330
+ param.requires_grad = False
331
+
332
+ # load CLIP for encoding language goals during evaluation
333
+ model, _ = load_clip("RN50", jit=False)
334
+ self._clip_rn50 = build_model(model.state_dict())
335
+ self._clip_rn50 = self._clip_rn50.float().to(device)
336
+ self._clip_rn50.eval()
337
+ del model
338
+
339
+ self._voxelizer.to(device)
340
+ self._q.to(device)
341
+
342
+ def _extract_crop(self, pixel_action, observation):
343
+ # Pixel action will now be (B, 2)
344
+ # observation = stack_on_channel(observation)
345
+ h = observation.shape[-1]
346
+ top_left_corner = torch.clamp(
347
+ pixel_action - self._image_crop_size // 2, 0, h - self._image_crop_size
348
+ )
349
+ grid = self._grid_for_crop + top_left_corner.unsqueeze(1)
350
+ grid = ((grid / float(h)) * 2.0) - 1.0 # between -1 and 1
351
+ # Used for cropping the images across a batch
352
+ # swap fro y x, to x, y
353
+ grid = torch.cat((grid[:, :, :, 1:2], grid[:, :, :, 0:1]), dim=-1)
354
+ crop = F.grid_sample(observation, grid, mode="nearest", align_corners=True)
355
+ return crop
356
+
357
+ def _preprocess_inputs(self, replay_sample):
358
+ obs = []
359
+ pcds = []
360
+ rgbs = []
361
+ self._crop_summary = []
362
+ for n in self._camera_names:
363
+ rgb = replay_sample["%s_rgb" % n]
364
+ pcd = replay_sample["%s_point_cloud" % n]
365
+
366
+ obs.append([rgb, pcd])
367
+ pcds.append(pcd)
368
+ rgbs.append(rgb)
369
+ return obs, pcds, rgbs
370
+
371
+ def _act_preprocess_inputs(self, observation):
372
+ obs, pcds = [], []
373
+ for n in self._camera_names:
374
+ rgb = observation["%s_rgb" % n]
375
+ pcd = observation["%s_point_cloud" % n]
376
+
377
+ obs.append([rgb, pcd])
378
+ pcds.append(pcd)
379
+ return obs, pcds
380
+
381
+ def _get_value_from_voxel_index(self, q, voxel_idx):
382
+ b, c, d, h, w = q.shape
383
+ q_trans_flat = q.view(b, c, d * h * w)
384
+ flat_indicies = (
385
+ voxel_idx[:, 0] * d * h + voxel_idx[:, 1] * h + voxel_idx[:, 2]
386
+ )[:, None].int()
387
+ highest_idxs = flat_indicies.unsqueeze(-1).repeat(1, c, 1)
388
+ chosen_voxel_values = q_trans_flat.gather(2, highest_idxs)[
389
+ ..., 0
390
+ ] # (B, trans + rot + grip)
391
+ return chosen_voxel_values
392
+
393
+ def _get_value_from_rot_and_grip(self, rot_grip_q, rot_and_grip_idx):
394
+ q_rot = torch.stack(
395
+ torch.split(
396
+ rot_grip_q[:, :-2], int(360 // self._rotation_resolution), dim=1
397
+ ),
398
+ dim=1,
399
+ ) # B, 3, 72
400
+ q_grip = rot_grip_q[:, -2:]
401
+ rot_and_grip_values = torch.cat(
402
+ [
403
+ q_rot[:, 0].gather(1, rot_and_grip_idx[:, 0:1]),
404
+ q_rot[:, 1].gather(1, rot_and_grip_idx[:, 1:2]),
405
+ q_rot[:, 2].gather(1, rot_and_grip_idx[:, 2:3]),
406
+ q_grip.gather(1, rot_and_grip_idx[:, 3:4]),
407
+ ],
408
+ -1,
409
+ )
410
+ return rot_and_grip_values
411
+
412
+ def _celoss(self, pred, labels):
413
+ return self._cross_entropy_loss(pred, labels.argmax(-1))
414
+
415
+ def _softmax_q_trans(self, q):
416
+ q_shape = q.shape
417
+ return F.softmax(q.reshape(q_shape[0], -1), dim=1).reshape(q_shape)
418
+
419
+ def _softmax_q_rot_grip(self, q_rot_grip):
420
+ q_rot_x_flat = q_rot_grip[
421
+ :, 0 * self._num_rotation_classes : 1 * self._num_rotation_classes
422
+ ]
423
+ q_rot_y_flat = q_rot_grip[
424
+ :, 1 * self._num_rotation_classes : 2 * self._num_rotation_classes
425
+ ]
426
+ q_rot_z_flat = q_rot_grip[
427
+ :, 2 * self._num_rotation_classes : 3 * self._num_rotation_classes
428
+ ]
429
+ q_grip_flat = q_rot_grip[:, 3 * self._num_rotation_classes :]
430
+
431
+ q_rot_x_flat_softmax = F.softmax(q_rot_x_flat, dim=1)
432
+ q_rot_y_flat_softmax = F.softmax(q_rot_y_flat, dim=1)
433
+ q_rot_z_flat_softmax = F.softmax(q_rot_z_flat, dim=1)
434
+ q_grip_flat_softmax = F.softmax(q_grip_flat, dim=1)
435
+
436
+ return torch.cat(
437
+ [
438
+ q_rot_x_flat_softmax,
439
+ q_rot_y_flat_softmax,
440
+ q_rot_z_flat_softmax,
441
+ q_grip_flat_softmax,
442
+ ],
443
+ dim=1,
444
+ )
445
+
446
+ def _softmax_ignore_collision(self, q_collision):
447
+ q_collision_softmax = F.softmax(q_collision, dim=1)
448
+ return q_collision_softmax
449
+
450
+ def update(self, step: int, replay_sample: dict) -> dict:
451
+ if step > 50:
452
+ for name, param in self._q.named_parameters():
453
+ if 'fc1_right' in name:
454
+ param.requires_grad = False
455
+ if 'fc1_left' in name:
456
+ param.requires_grad = False
457
+ right_action_trans = replay_sample["right_trans_action_indicies"][
458
+ ..., self._layer * 3 : self._layer * 3 + 3
459
+ ].int()
460
+ right_action_rot_grip = replay_sample["right_rot_grip_action_indicies"].int()
461
+ right_action_gripper_pose = replay_sample["right_gripper_pose"]
462
+ right_action_ignore_collisions = replay_sample["right_ignore_collisions"].int()
463
+
464
+ left_action_trans = replay_sample["left_trans_action_indicies"][
465
+ ..., self._layer * 3 : self._layer * 3 + 3
466
+ ].int()
467
+ left_action_rot_grip = replay_sample["left_rot_grip_action_indicies"].int()
468
+ left_action_gripper_pose = replay_sample["left_gripper_pose"]
469
+ left_action_ignore_collisions = replay_sample["left_ignore_collisions"].int()
470
+
471
+ lang_goal_emb = replay_sample["lang_goal_emb"].float()
472
+ lang_token_embs = replay_sample["lang_token_embs"].float()
473
+ prev_layer_voxel_grid = replay_sample.get("prev_layer_voxel_grid", None)
474
+ prev_layer_bounds = replay_sample.get("prev_layer_bounds", None)
475
+ device = self._device
476
+
477
+ rank = device
478
+ bounds = self._coordinate_bounds.to(device)
479
+ if self._layer > 0:
480
+ right_cp = replay_sample[
481
+ "right_attention_coordinate_layer_%d" % (self._layer - 1)
482
+ ]
483
+
484
+ left_cp = replay_sample[
485
+ "left_attention_coordinate_layer_%d" % (self._layer - 1)
486
+ ]
487
+
488
+ right_bounds = torch.cat(
489
+ [right_cp - self._bounds_offset, right_cp + self._bounds_offset], dim=1
490
+ )
491
+ left_bounds = torch.cat(
492
+ [left_cp - self._bounds_offset, left_cp + self._bounds_offset], dim=1
493
+ )
494
+
495
+ else:
496
+ right_bounds = bounds
497
+ left_bounds = bounds
498
+
499
+ right_proprio = None
500
+ left_proprio = None
501
+ if self._include_low_dim_state:
502
+ right_proprio = replay_sample["right_low_dim_state"]
503
+ left_proprio = replay_sample["left_low_dim_state"]
504
+
505
+ # ..TODO::
506
+ # Can we add the coordinates of both robots?
507
+ #
508
+
509
+ obs, pcd, rgbs = self._preprocess_inputs(replay_sample)
510
+
511
+ # batch size
512
+ bs = pcd[0].shape[0]
513
+
514
+ # We can move the point cloud w.r.t to the other robot's cooridinate system
515
+ # similar to apply_se3_augmentation
516
+ # SE(3) augmentation of point clouds and actions
517
+ if self._transform_augmentation:
518
+ from voxel import augmentation, augmentation_ab
519
+ if self.aug_type == "ab":
520
+ (
521
+ right_action_trans,
522
+ right_action_rot_grip,
523
+ left_action_trans,
524
+ left_action_rot_grip,
525
+ pcd,
526
+ ) = augmentation_ab.bimanual_apply_se3_augmentation(
527
+ pcd,
528
+ right_action_gripper_pose,
529
+ right_action_trans,
530
+ right_action_rot_grip,
531
+ left_action_gripper_pose,
532
+ left_action_trans,
533
+ left_action_rot_grip,
534
+ bounds,
535
+ self._layer,
536
+ self._transform_augmentation_xyz,
537
+ self._transform_augmentation_rpy,
538
+ self._transform_augmentation_rot_resolution,
539
+ self._voxel_size,
540
+ self._rotation_resolution,
541
+ self._device,
542
+ )
543
+ else:
544
+ (
545
+ right_action_trans,
546
+ right_action_rot_grip,
547
+ left_action_trans,
548
+ left_action_rot_grip,
549
+ pcd,
550
+ ) = augmentation.bimanual_apply_se3_augmentation(
551
+ pcd,
552
+ right_action_gripper_pose,
553
+ right_action_trans,
554
+ right_action_rot_grip,
555
+ left_action_gripper_pose,
556
+ left_action_trans,
557
+ left_action_rot_grip,
558
+ bounds,
559
+ self._layer,
560
+ self._transform_augmentation_xyz,
561
+ self._transform_augmentation_rpy,
562
+ self._transform_augmentation_rot_resolution,
563
+ self._voxel_size,
564
+ self._rotation_resolution,
565
+ self._device,
566
+ )
567
+ else:
568
+ right_action_trans = right_action_trans.int()
569
+ left_action_trans = left_action_trans.int()
570
+
571
+ proprio = torch.cat((right_proprio, left_proprio), dim=1)
572
+ right_action = (
573
+ right_action_trans,
574
+ right_action_rot_grip,
575
+ right_action_ignore_collisions,
576
+ )
577
+ left_action = (
578
+ left_action_trans,
579
+ left_action_rot_grip,
580
+ left_action_ignore_collisions,
581
+ )
582
+ # forward pass
583
+ q, voxel_grid, L_skill, L_voxel = self._q(
584
+ obs,
585
+ proprio,
586
+ pcd,
587
+ lang_goal_emb,
588
+ lang_token_embs,
589
+ bounds,
590
+ prev_layer_bounds,
591
+ prev_layer_voxel_grid,
592
+
593
+ )
594
+
595
+ (
596
+ right_q_trans,
597
+ right_q_rot_grip,
598
+ right_q_collision,
599
+ left_q_trans,
600
+ left_q_rot_grip,
601
+ left_q_collision,
602
+ ) = q
603
+
604
+ # argmax to choose best action
605
+ (
606
+ right_coords,
607
+ right_rot_and_grip_indicies,
608
+ right_ignore_collision_indicies,
609
+ ) = self._q.choose_highest_action(
610
+ right_q_trans, right_q_rot_grip, right_q_collision
611
+ )
612
+
613
+ (
614
+ left_coords,
615
+ left_rot_and_grip_indicies,
616
+ left_ignore_collision_indicies,
617
+ ) = self._q.choose_highest_action(
618
+ left_q_trans, left_q_rot_grip, left_q_collision
619
+ )
620
+
621
+ right_q_trans_loss, right_q_rot_loss, right_q_grip_loss, right_q_collision_loss = 0.0, 0.0, 0.0, 0.0
622
+ left_q_trans_loss, left_q_rot_loss, left_q_grip_loss, left_q_collision_loss = 0.0, 0.0, 0.0, 0.0
623
+
624
+ # translation one-hot
625
+ right_action_trans_one_hot = self._action_trans_one_hot_zeros.clone().detach()
626
+ left_action_trans_one_hot = self._action_trans_one_hot_zeros.clone().detach()
627
+ for b in range(bs):
628
+ right_gt_coord = right_action_trans[b, :].int()
629
+ right_action_trans_one_hot[
630
+ b, :, right_gt_coord[0], right_gt_coord[1], right_gt_coord[2]
631
+ ] = 1
632
+ left_gt_coord = left_action_trans[b, :].int()
633
+ left_action_trans_one_hot[
634
+ b, :, left_gt_coord[0], left_gt_coord[1], left_gt_coord[2]
635
+ ] = 1
636
+
637
+ # translation loss
638
+ right_q_trans_flat = right_q_trans.view(bs, -1)
639
+ right_action_trans_one_hot_flat = right_action_trans_one_hot.view(bs, -1)
640
+ right_q_trans_loss = self._celoss(
641
+ right_q_trans_flat, right_action_trans_one_hot_flat
642
+ )
643
+ left_q_trans_flat = left_q_trans.view(bs, -1)
644
+ left_action_trans_one_hot_flat = left_action_trans_one_hot.view(bs, -1)
645
+ left_q_trans_loss = self._celoss(
646
+ left_q_trans_flat, left_action_trans_one_hot_flat
647
+ )
648
+
649
+ q_trans_loss = right_q_trans_loss + left_q_trans_loss
650
+
651
+ with_rot_and_grip = (
652
+ len(right_rot_and_grip_indicies) > 0 and len(left_rot_and_grip_indicies) > 0
653
+ )
654
+ if with_rot_and_grip:
655
+ # rotation, gripper, and collision one-hots
656
+ right_action_rot_x_one_hot = self._action_rot_x_one_hot_zeros.clone()
657
+ right_action_rot_y_one_hot = self._action_rot_y_one_hot_zeros.clone()
658
+ right_action_rot_z_one_hot = self._action_rot_z_one_hot_zeros.clone()
659
+ right_action_grip_one_hot = self._action_grip_one_hot_zeros.clone()
660
+ right_action_ignore_collisions_one_hot = (
661
+ self._action_ignore_collisions_one_hot_zeros.clone()
662
+ )
663
+
664
+ left_action_rot_x_one_hot = self._action_rot_x_one_hot_zeros.clone()
665
+ left_action_rot_y_one_hot = self._action_rot_y_one_hot_zeros.clone()
666
+ left_action_rot_z_one_hot = self._action_rot_z_one_hot_zeros.clone()
667
+ left_action_grip_one_hot = self._action_grip_one_hot_zeros.clone()
668
+ left_action_ignore_collisions_one_hot = (
669
+ self._action_ignore_collisions_one_hot_zeros.clone()
670
+ )
671
+
672
+ for b in range(bs):
673
+ right_gt_rot_grip = right_action_rot_grip[b, :].int()
674
+ right_action_rot_x_one_hot[b, right_gt_rot_grip[0]] = 1
675
+ right_action_rot_y_one_hot[b, right_gt_rot_grip[1]] = 1
676
+ right_action_rot_z_one_hot[b, right_gt_rot_grip[2]] = 1
677
+ right_action_grip_one_hot[b, right_gt_rot_grip[3]] = 1
678
+
679
+ right_gt_ignore_collisions = right_action_ignore_collisions[b, :].int()
680
+ right_action_ignore_collisions_one_hot[
681
+ b, right_gt_ignore_collisions[0]
682
+ ] = 1
683
+
684
+ left_gt_rot_grip = left_action_rot_grip[b, :].int()
685
+ left_action_rot_x_one_hot[b, left_gt_rot_grip[0]] = 1
686
+ left_action_rot_y_one_hot[b, left_gt_rot_grip[1]] = 1
687
+ left_action_rot_z_one_hot[b, left_gt_rot_grip[2]] = 1
688
+ left_action_grip_one_hot[b, left_gt_rot_grip[3]] = 1
689
+
690
+ left_gt_ignore_collisions = left_action_ignore_collisions[b, :].int()
691
+ left_action_ignore_collisions_one_hot[
692
+ b, left_gt_ignore_collisions[0]
693
+ ] = 1
694
+
695
+ # flatten predictions
696
+ right_q_rot_x_flat = right_q_rot_grip[
697
+ :, 0 * self._num_rotation_classes : 1 * self._num_rotation_classes
698
+ ]
699
+ right_q_rot_y_flat = right_q_rot_grip[
700
+ :, 1 * self._num_rotation_classes : 2 * self._num_rotation_classes
701
+ ]
702
+ right_q_rot_z_flat = right_q_rot_grip[
703
+ :, 2 * self._num_rotation_classes : 3 * self._num_rotation_classes
704
+ ]
705
+ right_q_grip_flat = right_q_rot_grip[:, 3 * self._num_rotation_classes :]
706
+ right_q_ignore_collisions_flat = right_q_collision
707
+
708
+ left_q_rot_x_flat = left_q_rot_grip[
709
+ :, 0 * self._num_rotation_classes : 1 * self._num_rotation_classes
710
+ ]
711
+ left_q_rot_y_flat = left_q_rot_grip[
712
+ :, 1 * self._num_rotation_classes : 2 * self._num_rotation_classes
713
+ ]
714
+ left_q_rot_z_flat = left_q_rot_grip[
715
+ :, 2 * self._num_rotation_classes : 3 * self._num_rotation_classes
716
+ ]
717
+ left_q_grip_flat = left_q_rot_grip[:, 3 * self._num_rotation_classes :]
718
+ left_q_ignore_collisions_flat = left_q_collision
719
+
720
+
721
+ # rotation loss
722
+ right_q_rot_loss += self._celoss(right_q_rot_x_flat, right_action_rot_x_one_hot)
723
+ right_q_rot_loss += self._celoss(right_q_rot_y_flat, right_action_rot_y_one_hot)
724
+ right_q_rot_loss += self._celoss(right_q_rot_z_flat, right_action_rot_z_one_hot)
725
+
726
+ left_q_rot_loss += self._celoss(left_q_rot_x_flat, left_action_rot_x_one_hot)
727
+ left_q_rot_loss += self._celoss(left_q_rot_y_flat, left_action_rot_y_one_hot)
728
+ left_q_rot_loss += self._celoss(left_q_rot_z_flat, left_action_rot_z_one_hot)
729
+
730
+ # gripper loss
731
+ right_q_grip_loss += self._celoss(right_q_grip_flat, right_action_grip_one_hot)
732
+ left_q_grip_loss += self._celoss(left_q_grip_flat, left_action_grip_one_hot)
733
+
734
+ # collision loss
735
+ right_q_collision_loss += self._celoss(
736
+ right_q_ignore_collisions_flat, right_action_ignore_collisions_one_hot
737
+ )
738
+ left_q_collision_loss += self._celoss(
739
+ left_q_ignore_collisions_flat, left_action_ignore_collisions_one_hot
740
+ )
741
+
742
+
743
+ q_trans_loss = right_q_trans_loss + left_q_trans_loss
744
+ q_rot_loss = right_q_rot_loss + left_q_rot_loss
745
+ q_grip_loss = right_q_grip_loss + left_q_grip_loss
746
+ q_collision_loss = right_q_collision_loss + left_q_collision_loss
747
+
748
+ combined_losses = (
749
+ (q_trans_loss * self._trans_loss_weight)
750
+ + (q_rot_loss * self._rot_loss_weight)
751
+ + (q_grip_loss * self._grip_loss_weight)
752
+ + (q_collision_loss * self._collision_loss_weight)
753
+ + 0.0001 * L_skill
754
+ + 0.01 * L_voxel
755
+ )
756
+ total_loss = combined_losses.mean()
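+ # NOTE: the four Q terms are balanced by the configured *_loss_weight values,
+ # while the auxiliary L_skill and L_voxel terms use fixed coefficients
+ # (1e-4 and 1e-2); those constants, not the config, set their influence.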
757
+
758
+ if step % 10 == 0 and rank == 0 and wandb.run is not None:
759
+ wandb.log({
760
+ 'train/grip_loss': q_grip_loss.mean(),
761
+ 'train/trans_loss': q_trans_loss.mean(),
762
+ 'train/rot_loss': q_rot_loss.mean(),
763
+ 'train/collision_loss': q_collision_loss.mean(),
764
+ 'train/total_loss': total_loss,
765
+ }, step=step)
766
+
767
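+ # NOTE: anomaly detection is a debugging aid; enabling it on every update adds
+ # noticeable overhead to each backward pass and would normally be switched off
+ # for long training runs.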
+ torch.autograd.set_detect_anomaly(True)
768
+ self._optimizer.zero_grad()
769
+ total_loss.backward()
770
+ self._optimizer.step()
771
+ torch.cuda.empty_cache()
772
+
773
+ self._summaries = {
774
+ "losses/total_loss": total_loss,
775
+ "losses/trans_loss": q_trans_loss.mean(),
776
+ "losses/rot_loss": q_rot_loss.mean() if with_rot_and_grip else 0.0,
777
+ "losses/grip_loss": q_grip_loss.mean() if with_rot_and_grip else 0.0,
778
+
779
+ "losses/right/trans_loss": q_trans_loss.mean(),
780
+ "losses/right/rot_loss": q_rot_loss.mean() if with_rot_and_grip else 0.0,
781
+ "losses/right/grip_loss": q_grip_loss.mean() if with_rot_and_grip else 0.0,
782
+ "losses/right/collision_loss": q_collision_loss.mean() if with_rot_and_grip else 0.0,
783
+
784
+ "losses/left/trans_loss": q_trans_loss.mean(),
785
+ "losses/left/rot_loss": q_rot_loss.mean() if with_rot_and_grip else 0.0,
786
+ "losses/left/grip_loss": q_grip_loss.mean() if with_rot_and_grip else 0.0,
787
+ "losses/left/collision_loss": q_collision_loss.mean() if with_rot_and_grip else 0.0,
788
+
789
+ "losses/collision_loss": q_collision_loss.mean()
790
+ if with_rot_and_grip
791
+ else 0.0,
792
+ }
793
+
794
+ self._wandb_summaries = {
795
+ 'losses/total_loss': total_loss,
796
+ 'losses/trans_loss': q_trans_loss.mean(),
797
+ 'losses/rot_loss': q_rot_loss.mean() if with_rot_and_grip else 0.,
798
+ 'losses/grip_loss': q_grip_loss.mean() if with_rot_and_grip else 0.,
799
+ 'losses/collision_loss': q_collision_loss.mean() if with_rot_and_grip else 0.
800
+ }
801
+
802
+ if self._lr_scheduler:
803
+ self._scheduler.step()
804
+ self._summaries["learning_rate"] = self._scheduler.get_last_lr()[0]
805
+
806
+ self._vis_voxel_grid = voxel_grid[0]
807
+ self._right_vis_translation_qvalue = self._softmax_q_trans(right_q_trans[0])
808
+ self._right_vis_max_coordinate = right_coords[0]
809
+ self._right_vis_gt_coordinate = right_action_trans[0]
810
+
811
+ self._left_vis_translation_qvalue = self._softmax_q_trans(left_q_trans[0])
812
+ self._left_vis_max_coordinate = left_coords[0]
813
+ self._left_vis_gt_coordinate = left_action_trans[0]
814
+
815
+
816
+ # Note: PerAct doesn't use multi-layer voxel grids like C2FARM
817
+ # stack prev_layer_voxel_grid(s) from previous layers into a list
818
+ if prev_layer_voxel_grid is None:
819
+ prev_layer_voxel_grid = [voxel_grid]
820
+ else:
821
+ prev_layer_voxel_grid = prev_layer_voxel_grid + [voxel_grid]
822
+
823
+ # stack prev_layer_bound(s) from previous layers into a list
824
+ if prev_layer_bounds is None:
825
+ prev_layer_bounds = [self._coordinate_bounds.repeat(bs, 1)]
826
+ else:
827
+ prev_layer_bounds = prev_layer_bounds + [bounds]
828
+
829
+ q_trans_vis = True
830
+ log_freq = getattr(getattr(getattr(self, "cfg", None), "framework", None), "log_freq", None)
831
+ if log_freq and step % log_freq == 0 and rank == 0:
832
+ print("right_predict: ",self._right_vis_max_coordinate)
833
+ print("right_gt: ",self._right_vis_gt_coordinate)
834
+ print("left_predict: ",self._left_vis_max_coordinate)
835
+ print("left_gt: ",self._left_vis_gt_coordinate)
836
+ rendered_img_right = visualise_voxel(
837
+ voxel_grid[0].cpu().detach().numpy(), # [10, 100, 100, 100]
838
+ self._right_vis_translation_qvalue.detach().cpu().numpy() if q_trans_vis else None,
839
+ self._right_vis_max_coordinate.detach().cpu().numpy(),
840
+ self._right_vis_gt_coordinate.detach().cpu().numpy(),
841
+ voxel_size=0.045,
842
+ # voxel_size=0.1, # more focus ??
843
+ rotation_amount=np.deg2rad(-90),
844
+ highlight_alpha=1.0,
845
+ alpha=0.4,
846
+ )
847
+ rendered_img_left = visualise_voxel(
848
+ voxel_grid[0].cpu().detach().numpy(), # [10, 100, 100, 100]
849
+ self._left_vis_translation_qvalue.detach().cpu().numpy() if q_trans_vis else None,
850
+ self._left_vis_max_coordinate.detach().cpu().numpy(),
851
+ self._left_vis_gt_coordinate.detach().cpu().numpy(),
852
+ voxel_size=0.045,
853
+ # voxel_size=0.1, # more focus ??
854
+ rotation_amount=np.deg2rad(-90),
855
+ highlight_alpha=1.0,
856
+ alpha=0.4,
857
+ )
858
+ os.makedirs('recon', exist_ok=True)
859
+ # plot three images in one row with subplots:
860
+ rgb_src = obs[0][0][0].squeeze(0).permute(1, 2, 0) / 2 + 0.5  # un-normalise RGB from [-1, 1] to [0, 1] for display
861
+
862
+ fig, axs = plt.subplots(1, 4, figsize=(9, 3))  # 4 panels allocated; only 3 are used, the last stays blank
863
+ # src
864
+ axs[0].imshow(rgb_src.cpu().numpy())
865
+ axs[0].title.set_text('src')
866
+
867
+ axs[1].imshow(rendered_img_right)
868
+ axs[1].text(0, 40, 'predicted', color='blue')
869
+ axs[1].text(0, 80, 'gt', color='red')
870
+ axs[2].imshow(rendered_img_left)
871
+ axs[2].text(0, 40, 'predicted', color='blue')
872
+ axs[2].text(0, 80, 'gt', color='red')
873
+ for ax in axs:
874
+ ax.axis('off')
875
+ plt.tight_layout()
876
+
877
+ if rank == 0:
878
+ if wandb.run is not None:
879
+ buf = io.BytesIO()
880
+ plt.savefig(buf, format='png')
881
+ buf.seek(0)
882
+
883
+ image = Image.open(buf)
884
+ wandb.log({"eval/recon_img": wandb.Image(image)}, step=step)
885
+
886
+ buf.close()
887
+ cprint('Saved to wandb', 'cyan')
888
+ else:
889
+ plt.savefig(f'recon/{step}_rgb.png')
890
+ workdir = os.getcwd()
891
+ cprint(f'Saved {workdir}/recon/{step}_rgb.png locally', 'cyan')
892
+
893
+ return {
894
+ "total_loss": total_loss,
895
+ "prev_layer_voxel_grid": prev_layer_voxel_grid,
896
+ "prev_layer_bounds": prev_layer_bounds,
897
+ }
898
+
899
+ def act(self, step: int, observation: dict, deterministic=False) -> ActResult:
900
+ deterministic = True  # always act deterministically at evaluation time, overriding the caller's flag
901
+ bounds = self._coordinate_bounds
902
+ prev_layer_voxel_grid = observation.get("prev_layer_voxel_grid", None)
903
+ prev_layer_bounds = observation.get("prev_layer_bounds", None)
904
+ lang_goal_tokens = observation["lang_goal_tokens"].long()  # required key; a missing goal should fail loudly
905
+
906
+ # extract CLIP language embs
907
+ with torch.no_grad():
908
+ lang_goal_tokens = lang_goal_tokens.to(device=self._device)
909
+ (
910
+ lang_goal_emb,
911
+ lang_token_embs,
912
+ ) = self._clip_rn50.encode_text_with_embeddings(lang_goal_tokens[0])
913
+
914
+ # voxelization resolution
915
+ res = (bounds[:, 3:] - bounds[:, :3]) / self._voxel_size
916
+ max_rot_index = int(360 // self._rotation_resolution)
917
+ right_proprio = None
918
+ left_proprio = None
919
+
920
+ if self._include_low_dim_state:
921
+ right_proprio = observation["right_low_dim_state"]
922
+ left_proprio = observation["left_low_dim_state"]
923
+ right_proprio = right_proprio[0].to(self._device)
924
+ left_proprio = left_proprio[0].to(self._device)
925
+
926
+ obs, pcd = self._act_preprocess_inputs(observation)
927
+
928
+ # correct batch size and device
929
+ obs = [[o[0][0].to(self._device), o[1][0].to(self._device)] for o in obs]
930
+
931
+ pcd = [p[0].to(self._device) for p in pcd]
932
+ lang_goal_emb = lang_goal_emb.to(self._device)
933
+ lang_token_embs = lang_token_embs.to(self._device)
934
+ bounds = torch.as_tensor(bounds, device=self._device)
935
+ prev_layer_voxel_grid = (
936
+ prev_layer_voxel_grid.to(self._device)
937
+ if prev_layer_voxel_grid is not None
938
+ else None
939
+ )
940
+ prev_layer_bounds = (
941
+ prev_layer_bounds.to(self._device)
942
+ if prev_layer_bounds is not None
943
+ else None
944
+ )
945
+
946
+ proprio = torch.cat((right_proprio, left_proprio), dim=1)
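+ # NOTE: both arms' low-dim states are concatenated into one proprioception
+ # vector; this matches the doubled proprio_preprocess input handled in
+ # load_weights below (a 4-dim single-arm state becomes 8-dim for two arms).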
947
+ # inference
948
+ (
949
+ right_q_trans,
950
+ right_q_rot_grip,
951
+ right_q_ignore_collisions,
952
+ left_q_trans,
953
+ left_q_rot_grip,
954
+ left_q_ignore_collisions,
955
+ ), vox_grid = self._q(
956
+ obs,
957
+ proprio,
958
+ pcd,
959
+ lang_goal_emb,
960
+ lang_token_embs,
961
+ bounds,
962
+ prev_layer_bounds,
963
+ prev_layer_voxel_grid
964
+ )
965
+
966
+ # softmax Q predictions
967
+ right_q_trans = self._softmax_q_trans(right_q_trans)
968
+ left_q_trans = self._softmax_q_trans(left_q_trans)
969
+
970
+ if right_q_rot_grip is not None:
971
+ right_q_rot_grip = self._softmax_q_rot_grip(right_q_rot_grip)
972
+
973
+ if left_q_rot_grip is not None:
974
+ left_q_rot_grip = self._softmax_q_rot_grip(left_q_rot_grip)
975
+
976
+ if right_q_ignore_collisions is not None:
977
+ right_q_ignore_collisions = self._softmax_ignore_collision(
978
+ right_q_ignore_collisions
979
+ )
980
+
981
+ if left_q_ignore_collisions is not None:
982
+ left_q_ignore_collisions = self._softmax_ignore_collision(
983
+ left_q_ignore_collisions
984
+ )
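+ # NOTE: softmax is monotonic, so it does not change the argmax taken below; it is
+ # applied mainly so the q-volumes exposed in the info dict and visualisations
+ # read as probabilities.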
985
+
986
+ # argmax Q predictions
987
+ (
988
+ right_coords,
989
+ right_rot_and_grip_indicies,
990
+ right_ignore_collisions,
991
+ ) = self._q.choose_highest_action(
992
+ right_q_trans, right_q_rot_grip, right_q_ignore_collisions
993
+ )
994
+ (
995
+ left_coords,
996
+ left_rot_and_grip_indicies,
997
+ left_ignore_collisions,
998
+ ) = self._q.choose_highest_action(
999
+ left_q_trans, left_q_rot_grip, left_q_ignore_collisions
1000
+ )
1001
+
1002
+ if right_q_rot_grip is not None:
1003
+ right_rot_grip_action = right_rot_and_grip_indicies
1004
+ if right_q_ignore_collisions is not None:
1005
+ right_ignore_collisions_action = right_ignore_collisions.int()
1006
+
1007
+ if left_q_rot_grip is not None:
1008
+ left_rot_grip_action = left_rot_and_grip_indicies
1009
+ if left_q_ignore_collisions is not None:
1010
+ left_ignore_collisions_action = left_ignore_collisions.int()
1011
+
1012
+ right_coords = right_coords.int()
1013
+ left_coords = left_coords.int()
1014
+
1015
+ right_attention_coordinate = bounds[:, :3] + res * right_coords + res / 2
1016
+ left_attention_coordinate = bounds[:, :3] + res * left_coords + res / 2
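+ # NOTE: this maps a discrete voxel index back to a continuous world-frame point
+ # at the voxel centre. Sketch, assuming bounds [-0.5, 0.5] per axis and a
+ # 100-voxel grid (res = 0.01): index 50 -> -0.5 + 0.01 * 50 + 0.005 = 0.005.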
1017
+
1018
+ # stack prev_layer_voxel_grid(s) into a list
1019
+ # NOTE: PerAct doesn't use multi-layer voxel grids like C2FARM
1020
+ if prev_layer_voxel_grid is None:
1021
+ prev_layer_voxel_grid = [vox_grid]
1022
+ else:
1023
+ prev_layer_voxel_grid = prev_layer_voxel_grid + [vox_grid]
1024
+
1025
+ if prev_layer_bounds is None:
1026
+ prev_layer_bounds = [bounds]
1027
+ else:
1028
+ prev_layer_bounds = prev_layer_bounds + [bounds]
1029
+
1030
+ observation_elements = {
1031
+ "right_attention_coordinate": right_attention_coordinate,
1032
+ "left_attention_coordinate": left_attention_coordinate,
1033
+ "prev_layer_voxel_grid": prev_layer_voxel_grid,
1034
+ "prev_layer_bounds": prev_layer_bounds,
1035
+ }
1036
+ info = {
1037
+ "voxel_grid_depth%d" % self._layer: vox_grid,
1038
+ "right_q_depth%d" % self._layer: right_q_trans,
1039
+ "right_voxel_idx_depth%d" % self._layer: right_coords,
1040
+ "left_q_depth%d" % self._layer: left_q_trans,
1041
+ "left_voxel_idx_depth%d" % self._layer: left_coords,
1042
+ }
1043
+ self._act_voxel_grid = vox_grid[0]
1044
+ self._right_act_max_coordinate = right_coords[0]
1045
+ self._right_act_qvalues = right_q_trans[0].detach()
1046
+ self._left_act_max_coordinate = left_coords[0]
1047
+ self._left_act_qvalues = left_q_trans[0].detach()
1048
+
1049
+ action = (
1050
+ right_coords,
1051
+ right_rot_grip_action,
1052
+ right_ignore_collisions,
1053
+ left_coords,
1054
+ left_rot_grip_action,
1055
+ left_ignore_collisions,
1056
+ )
1057
+
1058
+ return ActResult(action, observation_elements=observation_elements, info=info)
1059
+
1060
+ def update_summaries(self) -> List[Summary]:
1061
+ # voxel_grid = self._vis_voxel_grid.detach().cpu().numpy()
1062
+ summaries = []
1063
+ # summaries.append(
1064
+ # ImageSummary(
1065
+ # "%s/right_update_qattention" % self._name,
1066
+ # transforms.ToTensor()(
1067
+ # visualise_voxel(
1068
+ # voxel_grid,
1069
+ # self._right_vis_translation_qvalue.detach().cpu().numpy(),
1070
+ # self._right_vis_max_coordinate.detach().cpu().numpy(),
1071
+ # self._right_vis_gt_coordinate.detach().cpu().numpy(),
1072
+ # )
1073
+ # ),
1074
+ # )
1075
+ # )
1076
+ # summaries.append(
1077
+ # ImageSummary(
1078
+ # "%s/left_update_qattention" % self._name,
1079
+ # transforms.ToTensor()(
1080
+ # visualise_voxel(
1081
+ # voxel_grid,
1082
+ # self._left_vis_translation_qvalue.detach().cpu().numpy(),
1083
+ # self._left_vis_max_coordinate.detach().cpu().numpy(),
1084
+ # self._left_vis_gt_coordinate.detach().cpu().numpy(),
1085
+ # )
1086
+ # ),
1087
+ # )
1088
+ # )
1089
+ for n, v in self._summaries.items():
1090
+ summaries.append(ScalarSummary("%s/%s" % (self._name, n), v))
1091
+
1092
+ for name, crop in self._crop_summary:
1093
+ crops = (torch.cat(torch.split(crop, 3, dim=1), dim=3) + 1.0) / 2.0
1094
+ summaries.extend([ImageSummary("%s/crops/%s" % (self._name, name), crops)])
1095
+
1096
+ for tag, param in self._q.named_parameters():
1097
+ # assert not torch.isnan(param.grad.abs() <= 1.0).all()
1098
+ summaries.append(
1099
+ HistogramSummary("%s/gradient/%s" % (self._name, tag), param.grad)
1100
+ )
1101
+ summaries.append(
1102
+ HistogramSummary("%s/weight/%s" % (self._name, tag), param.data)
1103
+ )
1104
+
1105
+ return summaries
1106
+
1107
+ def update_wandb_summaries(self):
1108
+ summaries = dict()
1109
+
1110
+ for k, v in self._wandb_summaries.items():
1111
+ summaries[k] = v
1112
+ return summaries
1113
+
1114
+ def act_summaries(self) -> List[Summary]:
1115
+ # voxel_grid = self._act_voxel_grid.cpu().numpy()
1116
+ # right_q_attention = self._right_act_qvalues.cpu().numpy()
1117
+ # right_highlight_coordinate = self._right_act_max_coordinate.cpu().numpy()
1118
+ # right_visualization = visualise_voxel(
1119
+ # voxel_grid, right_q_attention, right_highlight_coordinate
1120
+ # )
1121
+
1122
+ # left_q_attention = self._left_act_qvalues.cpu().numpy()
1123
+ # left_highlight_coordinate = self._left_act_max_coordinate.cpu().numpy()
1124
+ # left_visualization = visualise_voxel(
1125
+ # voxel_grid, left_q_attention, left_highlight_coordinate
1126
+ # )
1127
+
1128
+ # return [
1129
+ # ImageSummary(
1130
+ # f"{self._name}/right_act_Qattention",
1131
+ # transforms.ToTensor()(right_visualization),
1132
+ # ),
1133
+ # ImageSummary(
1134
+ # f"{self._name}/left_act_Qattention",
1135
+ # transforms.ToTensor()(left_visualization),
1136
+ # ),
1137
+ # ]
1138
+ return []
1139
+
1140
+ def concat_weights(self, param, target_size, dims=-1):
1141
+ if param.size(dims) < target_size:  # check the dimension actually being widened
1142
+ param = torch.cat([param, param], dims)
1143
+ return param
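+ # NOTE: concat_weights widens a pretrained single-arm weight by tiling it along
+ # one dimension, e.g. a [128, 64] tensor becomes [256, 64] via
+ # concat_weights(w, 256, 0); load_weights below uses this to adapt 128-dim
+ # unimanual checkpoints to the 256-dim bimanual network.
+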
1144
+ def load_weights(self, savedir: str):
1145
+ device = (
1146
+ self._device
1147
+ if not self._training
1148
+ else torch.device("cuda:%d" % self._device)
1149
+ )
1150
+
1151
+ weight_file = os.path.join(savedir, "%s.pt" % self._name)
1152
+ state_dict = torch.load(weight_file, map_location=device)
1153
+ merged_state_dict = self._q.state_dict()
1154
+
1155
+ if not self._training:
1156
+ for k, v in state_dict.items():
1157
+ # already inside the eval-only branch, so strip the DataParallel "module" prefix directly
1158
+ k = k.replace("_qnet.module", "_qnet")
1159
+ if k in merged_state_dict:
1160
+ merged_state_dict[k] = v
1161
+ else:
1162
+ if "_voxelizer" not in k:
1163
+ logging.warning("key %s not found in checkpoint" % k)
1164
+ else:
1165
+ for k, v in state_dict.items():
1166
+ if not self._training:  # always False in this training branch; the rename below never runs
1167
+ k = k.replace("_qnet.module", "_qnet")
1168
+ # cross_attn
1169
+ if k.startswith("_qnet.module.decoder_cross_attn"):
1170
+ right_key = k.replace("_qnet.module.decoder_cross_attn", "_qnet.module.decoder_cross_attn_right")
1171
+ merged_state_dict[right_key] = v
1172
+ left_key = k.replace("_qnet.module.decoder_cross_attn", "_qnet.module.decoder_cross_attn_left")
1173
+ merged_state_dict[left_key] = v
1174
+ if self.anybimanual:
1175
+ if v.size(0) == 128:
1176
+ merged_state_dict[right_key] = self.concat_weights(v, 256, 0)
1177
+ merged_state_dict[left_key] = self.concat_weights(v, 256, 0)
1178
+ if v.size(-1) == 128:
1179
+ merged_state_dict[right_key] = self.concat_weights(v, 256)
1180
+ merged_state_dict[left_key] = self.concat_weights(v, 256)
1181
+ elif k == "_qnet.module.up0.conv_up.0.conv3d.weight":
1182
+ if self.anybimanual:
1183
+ if v.size(1) == 128:
1184
+ merged_state_dict[k] = self.concat_weights(v, 256, 1)
1185
+ else:
1186
+ merged_state_dict[k] = v
1187
+ # trans_decoder
1188
+ elif k.startswith("_qnet.module.trans_decoder"):
1189
+ right_key = k.replace("_qnet.module.trans_decoder", "_qnet.module.right_trans_decoder")
1190
+ merged_state_dict[right_key] = v
1191
+
1192
+ left_key = k.replace("_qnet.module.trans_decoder", "_qnet.module.left_trans_decoder")
1193
+ merged_state_dict[left_key] = v
1194
+ # dense0
1195
+ elif k.startswith("_qnet.module.dense0"):
1196
+ right_key = k.replace("_qnet.module.dense0", "_qnet.module.right_dense0")
1197
+ merged_state_dict[right_key] = v
1198
+
1199
+ left_key = k.replace("_qnet.module.dense0", "_qnet.module.left_dense0")
1200
+ merged_state_dict[left_key] = v
1201
+ if self.anybimanual:
1202
+ if v.size(-1) == 1024:
1203
+ merged_state_dict[right_key] = torch.cat([v, v[:, :512]], dim=-1)
1204
+ merged_state_dict[left_key] = torch.cat([v, v[:, :512]], dim=-1)
1205
+ # dense1
1206
+ elif k.startswith("_qnet.module.dense1"):
1207
+ right_key = k.replace("_qnet.module.dense1", "_qnet.module.right_dense1")
1208
+ merged_state_dict[right_key] = v
1209
+
1210
+ left_key = k.replace("_qnet.module.dense1", "_qnet.module.left_dense1")
1211
+ merged_state_dict[left_key] = v
1212
+ # collision
1213
+ elif k.startswith("_qnet.module.rot_grip_collision_ff"):
1214
+ right_key = k.replace("_qnet.module.rot_grip_collision_ff", "_qnet.module.right_rot_grip_collision_ff")
1215
+ merged_state_dict[right_key] = v
1216
+
1217
+ left_key = k.replace("_qnet.module.rot_grip_collision_ff", "_qnet.module.left_rot_grip_collision_ff")
1218
+ merged_state_dict[left_key] = v
1219
+ elif k.startswith("_qnet.module.cross_attend_blocks"):
1220
+ if self.anybimanual:
1221
+ if k.startswith("_qnet.module.cross_attend_blocks.0"):
1222
+ merged_state_dict[k] = v
1223
+ k_1 = k.replace("_qnet.module.cross_attend_blocks.0","_qnet.module.cross_attend_blocks.1")
1224
+ merged_state_dict[k_1] = v
1225
+ if self.anybimanual:
1226
+ if v.size(-1) == 128:
1227
+ merged_state_dict[k_1] = self.concat_weights(v, 256)
1228
+ else:
1229
+ k_2 = k.replace("_qnet.module.cross_attend_blocks.1","_qnet.module.cross_attend_blocks.2")
1230
+ k_3 = k.replace("_qnet.module.cross_attend_blocks.1","_qnet.module.cross_attend_blocks.3")
1231
+ merged_state_dict[k_2] = v
1232
+ merged_state_dict[k_3] = v
1233
+ if self.anybimanual:
1234
+ if v.size(-1) == 128:
1235
+ merged_state_dict[k_2] = self.concat_weights(v, 256)
1236
+ merged_state_dict[k_3] = self.concat_weights(v, 256)
1237
+ else:
1238
+ if k.startswith("_qnet.module.cross_attend_blocks.0"):
1239
+ merged_state_dict[k] = v
1240
+ else:
1241
+ merged_state_dict[k] = v
1242
+ k_2 = k.replace("_qnet.module.cross_attend_blocks.1","_qnet.module.cross_attend_blocks.2")
1243
+ merged_state_dict[k_2] = v
1244
+ if self.anybimanual:
1245
+ if v.size(-1) == 128:
1246
+ merged_state_dict[k_2] = self.concat_weights(v, 256)
1247
+ if self.anybimanual:
1248
+ if v.size(-1) == 128:
1249
+ merged_state_dict[k] = self.concat_weights(v, 256)
1250
+ # proprio
1251
+ elif k == '_qnet.module.proprio_preprocess.linear.weight':
1252
+ if v.shape[1] != 8:
1253
+ new_v = torch.cat([v, v], dim=1)
1254
+ merged_state_dict['_qnet.module.proprio_preprocess.linear.weight'] = new_v
1255
+ else:
1256
+ merged_state_dict[k] = v
1257
+ elif k == '_qnet.module.proprio_preprocess.linear.bias':
1258
+ merged_state_dict['_qnet.module.proprio_preprocess.linear.bias'] = v
1259
+ # pos_with_lang
1260
+ elif k == "_qnet.module.pos_encoding":
1261
+ if v.shape[1] < 154:  # (v.shape[1] != 8077 or v.shape[1] != 8154) is always True, so only this bound matters
1262
+ if self.anybimanual:
1263
+ lang_max_seq_len = 154
1264
+ else:
1265
+ lang_max_seq_len = 77
1266
+ spatial_size = v.shape[1]
1267
+ input_dim_before_seq = v.shape[-1]
1268
+ flattened_v = v.view(1, -1, input_dim_before_seq) # (1, spatial_size**3, self.input_dim_before_seq)
1269
+ new_pos_encoding = torch.randn(1, lang_max_seq_len, input_dim_before_seq, device=device)
1270
+ merged_pos_encoding = torch.cat([flattened_v, new_pos_encoding], dim=1) # (1, lang_max_seq_len + spatial_size**3, self.input_dim_before_seq)
1271
+ merged_state_dict["_qnet.module.pos_encoding"] = merged_pos_encoding
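+ # NOTE: the checkpoint's spatial position encodings are kept verbatim and
+ # freshly initialised encodings are appended for the language-token span, so
+ # only the new language positions start untrained.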
1272
+ else:
1273
+ merged_state_dict["_qnet.module.pos_encoding"] = v
1274
+ elif k in merged_state_dict:
1275
+ merged_state_dict[k] = v
1276
+
1277
+ # else:
1278
+ # if "_voxelizer" not in k:
1279
+ # logging.warning("key %s not found in checkpoint" % k)
1280
+
1281
+
1282
+ if not self._training:
1283
+ b = merged_state_dict["_voxelizer._ones_max_coords"].shape[0]
1284
+ merged_state_dict["_voxelizer._ones_max_coords"] = merged_state_dict[
1285
+ "_voxelizer._ones_max_coords"
1286
+ ][0:1]
1287
+ flat_shape = merged_state_dict["_voxelizer._flat_output"].shape[0]
1288
+ merged_state_dict["_voxelizer._flat_output"] = merged_state_dict[
1289
+ "_voxelizer._flat_output"
1290
+ ][0 : flat_shape // b]
1291
+ merged_state_dict["_voxelizer._tiled_batch_indices"] = merged_state_dict[
1292
+ "_voxelizer._tiled_batch_indices"
1293
+ ][0:1]
1294
+ merged_state_dict["_voxelizer._index_grid"] = merged_state_dict[
1295
+ "_voxelizer._index_grid"
1296
+ ][0:1]
1297
+ self._q.load_state_dict(merged_state_dict)
1298
+
1299
+ if self.frozen:
1300
+ print("Freezing parameters from PerAct")
1301
+ for name, param in self._q.named_parameters():
1302
+ if name in state_dict:
1303
+ param.requires_grad = False
1304
+
1305
+ logging.info(
1306
+ "# Q Params: %d"
1307
+ % sum(
1308
+ p.numel()
1309
+ for name, p in self._q.named_parameters()
1310
+ if p.requires_grad and "clip" not in name
1311
+ )
1312
+ )
1313
+ print("loaded weights from %s" % weight_file)
1314
+
1315
+
1316
+ def save_weights(self, savedir: str):
1317
+ torch.save(self._q.state_dict(), os.path.join(savedir, "%s.pt" % self._name))
third_party/AnyBimanual/agents/peract_bimanual/qattention_stack_agent.py ADDED
@@ -0,0 +1,209 @@
1
+ from typing import List
2
+
3
+ import torch
4
+ from yarr.agents.agent import Agent, ActResult, Summary
5
+
6
+ import numpy as np
7
+
8
+ from helpers import utils
9
+ from agents.peract_bimanual.qattention_peract_bc_agent import QAttentionPerActBCAgent
10
+
11
+ NAME = "QAttentionStackAgent"
12
+
13
+
14
+ class QAttentionStackAgent(Agent):
15
+ def __init__(
16
+ self,
17
+ qattention_agents: List[QAttentionPerActBCAgent],
18
+ rotation_resolution: float,
19
+ camera_names: List[str],
20
+ rotation_prediction_depth: int = 0,
21
+ ):
22
+ super(QAttentionStackAgent, self).__init__()
23
+ self._qattention_agents = qattention_agents
24
+ self._rotation_resolution = rotation_resolution
25
+ self._camera_names = camera_names
26
+ self._rotation_prediction_depth = rotation_prediction_depth
27
+
28
+ def build(self, training: bool, device=None) -> None:
29
+ self._device = device
30
+ if self._device is None:
31
+ self._device = torch.device("cpu")
32
+ for qa in self._qattention_agents:
33
+ qa.build(training, device)
34
+
35
+ def update(self, step: int, replay_sample: dict) -> dict:
36
+ priorities = 0
37
+ total_losses = 0.0
38
+ for qa in self._qattention_agents:
39
+ update_dict = qa.update(step, replay_sample)
40
+ replay_sample.update(update_dict)
41
+ total_losses += update_dict["total_loss"]
42
+ return {
43
+ "total_losses": total_losses,
44
+ }
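+ # NOTE: each depth writes its update_dict back into replay_sample, so deeper
+ # q-attention layers can condition on the voxel grids and bounds produced by
+ # shallower ones; with a single-depth PerAct stack the loop runs once.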
45
+
46
+ def act(self, step: int, observation: dict, deterministic=False) -> ActResult:
47
+ observation_elements = {}
48
+ (
49
+ right_translation_results,
50
+ right_rot_grip_results,
51
+ right_ignore_collisions_results,
52
+ ) = ([], [], [])
53
+ (
54
+ left_translation_results,
55
+ left_rot_grip_results,
56
+ left_ignore_collisions_results,
57
+ ) = ([], [], [])
58
+
59
+ infos = {}
60
+ for depth, qagent in enumerate(self._qattention_agents):
61
+ act_results = qagent.act(step, observation, deterministic)
62
+ right_attention_coordinate = (
63
+ act_results.observation_elements["right_attention_coordinate"]
64
+ .cpu()
65
+ .numpy()
66
+ )
67
+ left_attention_coordinate = (
68
+ act_results.observation_elements["left_attention_coordinate"]
69
+ .cpu()
70
+ .numpy()
71
+ )
72
+ observation_elements[
73
+ "right_attention_coordinate_layer_%d" % depth
74
+ ] = right_attention_coordinate[0]
75
+ observation_elements[
76
+ "left_attention_coordinate_layer_%d" % depth
77
+ ] = left_attention_coordinate[0]
78
+
79
+ (
80
+ right_translation_idxs,
81
+ right_rot_grip_idxs,
82
+ right_ignore_collisions_idxs,
83
+ left_translation_idxs,
84
+ left_rot_grip_idxs,
85
+ left_ignore_collisions_idxs,
86
+ ) = act_results.action
87
+
88
+ right_translation_results.append(right_translation_idxs)
89
+ if right_rot_grip_idxs is not None:
90
+ right_rot_grip_results.append(right_rot_grip_idxs)
91
+ if right_ignore_collisions_idxs is not None:
92
+ right_ignore_collisions_results.append(right_ignore_collisions_idxs)
93
+
94
+ left_translation_results.append(left_translation_idxs)
95
+ if left_rot_grip_idxs is not None:
96
+ left_rot_grip_results.append(left_rot_grip_idxs)
97
+ if left_ignore_collisions_idxs is not None:
98
+ left_ignore_collisions_results.append(left_ignore_collisions_idxs)
99
+
100
+ observation[
101
+ "right_attention_coordinate"
102
+ ] = act_results.observation_elements["right_attention_coordinate"]
103
+ observation["left_attention_coordinate"] = act_results.observation_elements[
104
+ "left_attention_coordinate"
105
+ ]
106
+
107
+ observation["prev_layer_voxel_grid"] = act_results.observation_elements[
108
+ "prev_layer_voxel_grid"
109
+ ]
110
+ observation["prev_layer_bounds"] = act_results.observation_elements[
111
+ "prev_layer_bounds"
112
+ ]
113
+
114
+ for n in self._camera_names:
115
+ extrinsics = observation["%s_camera_extrinsics" % n][0, 0].cpu().numpy()
116
+ intrinsics = observation["%s_camera_intrinsics" % n][0, 0].cpu().numpy()
117
+ px, py = utils.point_to_pixel_index(
118
+ right_attention_coordinate[0], extrinsics, intrinsics
119
+ )
120
+ pc_t = torch.tensor(
121
+ [[[py, px]]], dtype=torch.float32, device=self._device
122
+ )
123
+ observation[f"right_{n}_pixel_coord"] = pc_t
124
+ observation_elements[f"right_{n}_pixel_coord"] = [py, px]
125
+
126
+ px, py = utils.point_to_pixel_index(
127
+ left_attention_coordinate[0], extrinsics, intrinsics
128
+ )
129
+ pc_t = torch.tensor(
130
+ [[[py, px]]], dtype=torch.float32, device=self._device
131
+ )
132
+ observation[f"left_{n}_pixel_coord"] = pc_t
133
+ observation_elements[f"left_{n}_pixel_coord"] = [py, px]
134
+ infos.update(act_results.info)
135
+
136
+ right_rgai = torch.cat(right_rot_grip_results, 1)[0].cpu().numpy()
137
+ # ..todo:: utils.correct_rotation_instability does nothing so we can ignore it
138
+ # right_rgai = utils.correct_rotation_instability(right_rgai, self._rotation_resolution)
139
+ right_ignore_collisions = (
140
+ torch.cat(right_ignore_collisions_results, 1)[0].cpu().numpy()
141
+ )
142
+ right_trans_action_indicies = (
143
+ torch.cat(right_translation_results, 1)[0].cpu().numpy()
144
+ )
145
+
146
+ observation_elements[
147
+ "right_trans_action_indicies"
148
+ ] = right_trans_action_indicies[:3]
149
+ observation_elements["right_rot_grip_action_indicies"] = right_rgai[:4]
150
+
151
+ left_rgai = torch.cat(left_rot_grip_results, 1)[0].cpu().numpy()
152
+ left_ignore_collisions = (
153
+ torch.cat(left_ignore_collisions_results, 1)[0].cpu().numpy()
154
+ )
155
+ left_trans_action_indicies = (
156
+ torch.cat(left_translation_results, 1)[0].cpu().numpy()
157
+ )
158
+
159
+ observation_elements["left_trans_action_indicies"] = left_trans_action_indicies[
160
+ 3:
161
+ ]
162
+ observation_elements["left_rot_grip_action_indicies"] = left_rgai[4:]
163
+
164
+ continuous_action = np.concatenate(
165
+ [
166
+ right_attention_coordinate[0],
167
+ utils.discrete_euler_to_quaternion(
168
+ right_rgai[-4:-1], self._rotation_resolution
169
+ ),
170
+ right_rgai[-1:],
171
+ right_ignore_collisions,
172
+ left_attention_coordinate[0],
173
+ utils.discrete_euler_to_quaternion(
174
+ left_rgai[-4:-1], self._rotation_resolution
175
+ ),
176
+ left_rgai[-1:],
177
+ left_ignore_collisions,
178
+ ]
179
+ )
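+ # NOTE: per arm the continuous action packs an xyz world coordinate (3), a
+ # quaternion recovered from the discrete euler bins (4), the gripper state (1)
+ # and the ignore-collision flag (1): a 9-dim right block then a 9-dim left block.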
180
+ return ActResult(
181
+ continuous_action, observation_elements=observation_elements, info=infos
182
+ )
183
+
184
+ def update_summaries(self) -> List[Summary]:
185
+ summaries = []
186
+ for qa in self._qattention_agents:
187
+ summaries.extend(qa.update_summaries())
188
+ return summaries
189
+
190
+ def update_wandb_summaries(self):
191
+ summaries = {}
192
+ for qa in self._qattention_agents:
193
+ summaries.update(qa.update_wandb_summaries())
194
+ return summaries
195
+
+ def act_summaries(self) -> List[Summary]:
196
+ s = []
197
+ for qa in self._qattention_agents:
198
+ s.extend(qa.act_summaries())
199
+ return s
200
+
201
+ def load_weights(self, savedir: str):
202
+ for qa in self._qattention_agents:
203
+ # print(dir(qa))  # debug output disabled
204
+ qa.load_weights(savedir)
205
+
206
+
207
+ def save_weights(self, savedir: str):
208
+ for qa in self._qattention_agents:
209
+ qa.save_weights(savedir)
third_party/AnyBimanual/agents/peract_bimanual/skill_manager.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import transformers
4
+ from agents.peract_bimanual.trajectory_gpt2 import GPT2Model
5
+ import torch.nn.functional as F
6
+ class SkillManager(nn.Module):
7
+ def __init__(
8
+ self,
9
+ num_classes,
10
+ embedding_matrix=None,
11
+ voxel_dim=128,
12
+ lang_dim=128,
13
+ hidden_size=128,
14
+ output_dim=18,
15
+ max_voxels=8000,
16
+ max_lang_tokens=77,
17
+ **kwargs):
18
+ super().__init__()
19
+
20
+ self.hidden_size = hidden_size
21
+ self.output_dim = output_dim
22
+
23
+ # GPT-2 configuration
24
+ config = transformers.GPT2Config(
25
+ vocab_size=1, # not used
26
+ n_embd=hidden_size,
27
+ n_head=4,
28
+ n_ctx=1077,
29
+ )
30
+
31
+ self.max_voxels = max_voxels
32
+ self.max_lang_tokens = max_lang_tokens
33
+ self.embed_voxel = nn.Linear(voxel_dim, hidden_size)
34
+ self.embed_lang = nn.Linear(lang_dim, hidden_size)
35
+ self.transformer = GPT2Model(config)
36
+ self.embed_ln = nn.LayerNorm(hidden_size)
37
+ self.predict_logits = nn.Linear(hidden_size, output_dim)
38
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
39
+ self.num_class = num_classes
40
+ if embedding_matrix is not None:
41
+ self.embeddings_matrix = embedding_matrix.to(self.device)
42
+
43
+ def forward(self, voxel_embedding, language_embedding):
44
+ batch_size = voxel_embedding.shape[0]
45
+ voxel_embeddings = self.embed_voxel(voxel_embedding) # [b, 8000, hidden_size]
46
+ language_embeddings = self.embed_lang(language_embedding) # [b, 77, hidden_size]
47
+ voxel_embeddings = voxel_embeddings.permute(0, 2, 1) # [b, hidden_size, 8000]
48
+ voxel_embeddings = F.avg_pool1d(voxel_embeddings, kernel_size=16, stride=16) # [b, hidden_size, 500] (8000 / 16)
49
+ voxel_embeddings = voxel_embeddings.permute(0, 2, 1) # [b, 500, hidden_size]
50
+ inputs = torch.cat([language_embeddings, voxel_embeddings], dim=1) # [b, 577, hidden_size] = 77 lang + 500 voxel
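+ # NOTE: the transformer therefore sees a 577-token sequence (77 language tokens
+ # followed by 500 pooled voxel tokens), well inside the n_ctx=1077 budget set in
+ # the GPT2Config above.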
51
+ stacked_inputs = self.embed_ln(inputs)
52
+ attention_mask = torch.ones(
53
+ (batch_size, self.max_lang_tokens + self.max_voxels),
54
+ device=voxel_embedding.device,
55
+ dtype=torch.long # Ensure correct dtype
56
+ )
57
+ # these asserts are vacuous on a freshly created all-ones tensor; kept as sanity checks
+ assert torch.isfinite(attention_mask).all(), "attention_mask contains NaN or Inf"
58
+ assert torch.all(attention_mask == 1), "attention_mask contains values not equal to 1"
59
+ transformer_outputs = self.transformer(
60
+ inputs_embeds=stacked_inputs,
61
+ attention_mask=None,  # the all-ones mask built above is left unused; None gives full attention
62
+ )
63
+
64
+ hidden_state = transformer_outputs.last_hidden_state # [b, 577, hidden_size]
65
+ aggregated_hidden = hidden_state.mean(dim=1) # [b, hidden_size]
66
+ logits = self.predict_logits(aggregated_hidden) # [b, output_dim]
67
+ probs = F.softmax(logits, dim=1)
68
+ skill = torch.matmul(probs, self.embeddings_matrix.to(probs.device))
69
+ skill = skill.view(-1, 77, 512)
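+ # NOTE: the skill is a probability-weighted (convex) combination of the rows of
+ # the skill embedding matrix, reshaped into a CLIP-style [b, 77, 512] token
+ # sequence for downstream conditioning.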
70
+ return skill
third_party/AnyBimanual/agents/peract_bimanual/trajectory_gpt2.py ADDED
@@ -0,0 +1,775 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch OpenAI GPT-2 model."""
17
+
18
+ import os
19
+ from dataclasses import dataclass
20
+ from typing import List, Optional, Tuple
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+ from torch.nn import CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN
27
+ from transformers.file_utils import (
28
+ ModelOutput,
29
+ add_code_sample_docstrings,
30
+ add_start_docstrings,
31
+ add_start_docstrings_to_model_forward,
32
+ replace_return_docstrings,
33
+ )
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPastAndCrossAttentions,
36
+ )
37
+ from transformers.modeling_utils import (
38
+ Conv1D,
39
+ PreTrainedModel,
40
+ SequenceSummary,
41
+ find_pruneable_heads_and_indices,
42
+ prune_conv1d_layer,
43
+ )
44
+ from transformers.utils import logging
45
+ from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
46
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+ _CONFIG_FOR_DOC = "GPT2Config"
51
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
52
+
53
+ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
54
+ "gpt2",
55
+ "gpt2-medium",
56
+ "gpt2-large",
57
+ "gpt2-xl",
58
+ "distilgpt2",
59
+ # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
60
+ ]
61
+
62
+
63
+ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
64
+ """Load tf checkpoints in a pytorch model"""
65
+ try:
66
+ import re
67
+
68
+ import tensorflow as tf
69
+ except ImportError:
70
+ logger.error(
71
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
72
+ "https://www.tensorflow.org/install/ for installation instructions."
73
+ )
74
+ raise
75
+ tf_path = os.path.abspath(gpt2_checkpoint_path)
76
+ logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
77
+ # Load weights from TF model
78
+ init_vars = tf.train.list_variables(tf_path)
79
+ names = []
80
+ arrays = []
81
+ for name, shape in init_vars:
82
+ logger.info("Loading TF weight {} with shape {}".format(name, shape))
83
+ array = tf.train.load_variable(tf_path, name)
84
+ names.append(name)
85
+ arrays.append(array.squeeze())
86
+
87
+ for name, array in zip(names, arrays):
88
+ name = name[6:] # skip "model/"
89
+ name = name.split("/")
90
+ pointer = model
91
+ for m_name in name:
92
+ if re.fullmatch(r"[A-Za-z]+\d+", m_name):
93
+ scope_names = re.split(r"(\d+)", m_name)
94
+ else:
95
+ scope_names = [m_name]
96
+ if scope_names[0] == "w" or scope_names[0] == "g":
97
+ pointer = getattr(pointer, "weight")
98
+ elif scope_names[0] == "b":
99
+ pointer = getattr(pointer, "bias")
100
+ elif scope_names[0] == "wpe" or scope_names[0] == "wte":
101
+ pointer = getattr(pointer, scope_names[0])
102
+ pointer = getattr(pointer, "weight")
103
+ else:
104
+ pointer = getattr(pointer, scope_names[0])
105
+ if len(scope_names) >= 2:
106
+ num = int(scope_names[1])
107
+ pointer = pointer[num]
108
+ try:
109
+ assert (
110
+ pointer.shape == array.shape
111
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
112
+ except AssertionError as e:
113
+ e.args += (pointer.shape, array.shape)
114
+ raise
115
+ logger.info("Initialize PyTorch weight {}".format(name))
116
+ pointer.data = torch.from_numpy(array)
117
+ return model
118
+
119
+
120
+ class Attention(nn.Module):
121
+ def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
122
+ super().__init__()
123
+
124
+ n_state = nx # in Attention: n_state=768 (nx=n_embd)
125
+ # [switch nx => n_state from Block to Attention to keep identical to TF implem]
126
+ assert n_state % config.n_head == 0
127
+ self.register_buffer(
128
+ "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
129
+ )
130
+ self.register_buffer("masked_bias", torch.tensor(-1e4))
131
+ self.n_head = config.n_head
132
+ self.split_size = n_state
133
+ self.scale = scale
134
+ self.is_cross_attention = is_cross_attention
135
+ if self.is_cross_attention:
136
+ self.c_attn = Conv1D(2 * n_state, nx)
137
+ self.q_attn = Conv1D(n_state, nx)
138
+ else:
139
+ self.c_attn = Conv1D(3 * n_state, nx)
140
+ self.c_proj = Conv1D(n_state, nx)
141
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
142
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
143
+ self.pruned_heads = set()
144
+
145
+ def prune_heads(self, heads):
146
+ if len(heads) == 0:
147
+ return
148
+ heads, index = find_pruneable_heads_and_indices(
149
+ heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
150
+ )
151
+ index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
152
+
153
+ # Prune conv1d layers
154
+ self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
155
+ self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
156
+
157
+ # Update hyper params
158
+ self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
159
+ self.n_head = self.n_head - len(heads)
160
+ self.pruned_heads = self.pruned_heads.union(heads)
161
+
162
+ def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
163
+ w = torch.matmul(q, k)
164
+ if self.scale:
165
+ w = w / (float(v.size(-1)) ** 0.5)
166
+ nd, ns = w.size(-2), w.size(-1)
167
+
168
+ if not self.is_cross_attention:
169
+ # if only "normal" attention layer implements causal mask
170
+ mask = self.bias[:, :, ns - nd: ns, :ns]
171
+ w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))
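+ # NOTE: `bias` is a lower-triangular buffer, so each position may attend only to
+ # itself and earlier tokens; masked entries are filled with -1e4 rather than
+ # -inf, which keeps the softmax numerically safe in fp16.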
172
+
173
+ if attention_mask is not None:
174
+ # Apply the attention mask
175
+ w = w + attention_mask
176
+
177
+ w = nn.Softmax(dim=-1)(w)
178
+ w = self.attn_dropout(w)
179
+
180
+ # Mask heads if we want to
181
+ if head_mask is not None:
182
+ w = w * head_mask
183
+
184
+ outputs = [torch.matmul(w, v)]
185
+ if output_attentions:
186
+ outputs.append(w)
187
+ return outputs
188
+
189
+ def merge_heads(self, x):
190
+ x = x.permute(0, 2, 1, 3).contiguous()
191
+ new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
192
+ return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
193
+
194
+ def split_heads(self, x, k=False):
195
+ new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
196
+ x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
197
+ if k:
198
+ return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length)
199
+ else:
200
+ return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
201
+
202
+ def forward(
203
+ self,
204
+ hidden_states,
205
+ layer_past=None,
206
+ attention_mask=None,
207
+ head_mask=None,
208
+ encoder_hidden_states=None,
209
+ encoder_attention_mask=None,
210
+ use_cache=False,
211
+ output_attentions=False,
212
+ ):
213
+ if encoder_hidden_states is not None:
214
+ assert hasattr(
215
+ self, "q_attn"
216
+ ), "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `Attention(..., is_cross_attention=True)`."
217
+ query = self.q_attn(hidden_states)
218
+ key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
219
+ attention_mask = encoder_attention_mask
220
+ else:
221
+ query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
222
+
223
+ query = self.split_heads(query)
224
+ key = self.split_heads(key, k=True)
225
+ value = self.split_heads(value)
226
+ if layer_past is not None:
227
+ past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below
228
+ key = torch.cat((past_key, key), dim=-1)
229
+ value = torch.cat((past_value, value), dim=-2)
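+ # NOTE: cached keys are stored transposed (features x seq), so new keys append on
+ # the last dim while values append on the sequence dim; this is the standard
+ # GPT-2 incremental-decoding KV cache layout.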
230
+
231
+ if use_cache is True:
232
+ present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
233
+ else:
234
+ present = (None,)
235
+
236
+ attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
237
+ a = attn_outputs[0]
238
+
239
+ a = self.merge_heads(a)
240
+ a = self.c_proj(a)
241
+ a = self.resid_dropout(a)
242
+
243
+ outputs = [a, present] + attn_outputs[1:]
244
+ return outputs # a, present, (attentions)
245
+
246
+
247
+ class MLP(nn.Module):
248
+ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
249
+ super().__init__()
250
+ nx = config.n_embd
251
+ self.c_fc = Conv1D(n_state, nx)
252
+ self.c_proj = Conv1D(nx, n_state)
253
+ self.act = ACT2FN[config.activation_function]
254
+ self.dropout = nn.Dropout(config.resid_pdrop)
255
+
256
+ def forward(self, x):
257
+ h = self.act(self.c_fc(x))
258
+ h2 = self.c_proj(h)
259
+ return self.dropout(h2)
260
+
261
+
262
+ class AdapterMLP(nn.Module):
263
+ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
264
+ super().__init__()
265
+ nx = config.n_embd
266
+ self.c_fc = Conv1D(n_state, nx)
267
+ self.c_proj = Conv1D(nx, n_state)
268
+ self.act = ACT2FN[config.activation_function]
269
+ self.dropout = nn.Dropout(config.resid_pdrop)
270
+
271
+ def forward(self, x):
272
+ h = self.act(self.c_fc(x))
273
+ h2 = self.c_proj(h)
274
+ return self.dropout(h2)
275
+
276
+
277
+ class Block(nn.Module):
278
+ def __init__(self, n_ctx, config, scale=False):
279
+ super().__init__()
280
+ hidden_size = config.n_embd
281
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
282
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
283
+ self.attn = Attention(hidden_size, n_ctx, config, scale)
284
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
285
+ # self.adapter_ln = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
286
+ if config.add_cross_attention:
287
+ self.crossattention = Attention(hidden_size, n_ctx, config, scale, is_cross_attention=True)
288
+ self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
289
+ self.mlp = MLP(inner_dim, config)
290
+ # self.adapter_mlp = AdapterMLP(512, config) # ADAPTER
291
+
292
+ def forward(
293
+ self,
294
+ hidden_states,
295
+ layer_past=None,
296
+ attention_mask=None,
297
+ head_mask=None,
298
+ encoder_hidden_states=None,
299
+ encoder_attention_mask=None,
300
+ use_cache=False,
301
+ output_attentions=False,
302
+ ):
303
+ attn_outputs = self.attn(
304
+ self.ln_1(hidden_states),
305
+ layer_past=layer_past,
306
+ attention_mask=attention_mask,
307
+ head_mask=head_mask,
308
+ use_cache=use_cache,
309
+ output_attentions=output_attentions,
310
+ )
311
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
312
+ outputs = attn_outputs[1:]
313
+ # residual connection
314
+ hidden_states = attn_output + hidden_states
315
+
316
+ if encoder_hidden_states is not None:
317
+ # add one self-attention block for cross-attention
318
+ assert hasattr(
319
+ self, "crossattention"
320
+ ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
321
+ cross_attn_outputs = self.crossattention(
322
+ self.ln_cross_attn(hidden_states),
323
+ attention_mask=attention_mask,
324
+ head_mask=head_mask,
325
+ encoder_hidden_states=encoder_hidden_states,
326
+ encoder_attention_mask=encoder_attention_mask,
327
+ output_attentions=output_attentions,
328
+ )
329
+ attn_output = cross_attn_outputs[0]
330
+ # residual connection
331
+ hidden_states = hidden_states + attn_output
332
+ outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights
333
+
334
+ feed_forward_hidden_states = self.mlp(self.ln_2(hidden_states))
335
+ # residual connection
336
+ hidden_states = hidden_states + feed_forward_hidden_states
337
+ # hidden_states = hidden_states + self.adapter_ln(self.adapter_mlp(hidden_states))
338
+
339
+ outputs = [hidden_states] + outputs
340
+ return outputs # hidden_states, present, (attentions, cross_attentions)
341
+
342
+
343
+ class GPT2PreTrainedModel(PreTrainedModel):
344
+ """
345
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
346
+ models.
347
+ """
348
+
349
+ config_class = GPT2Config
350
+ load_tf_weights = load_tf_weights_in_gpt2
351
+ base_model_prefix = "transformer"
352
+
353
+ def __init__(self, *inputs, **kwargs):
354
+ super().__init__(*inputs, **kwargs)
355
+
356
+ def _init_weights(self, module):
357
+ """Initialize the weights."""
358
+ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
359
+ # Slightly different from the TF version which uses truncated_normal for initialization
360
+ # cf https://github.com/pytorch/pytorch/pull/5617
361
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
362
+ if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
363
+ module.bias.data.zero_()
364
+ elif isinstance(module, nn.LayerNorm):
365
+ module.bias.data.zero_()
366
+ module.weight.data.fill_(1.0)
367
+ # module.weight.data.fill_(.01) # KL: Adapter change
368
+
369
+
370
+ @dataclass
371
+ class GPT2DoubleHeadsModelOutput(ModelOutput):
372
+ """
373
+ Base class for outputs of models predicting if two sentences are consecutive or not.
374
+ Args:
375
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
376
+ Language modeling loss.
377
+ mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
378
+ Multiple choice classification loss.
379
+ logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
380
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
381
+ mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
382
+ Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
383
+ past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
384
+ List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
385
+ batch_size, num_heads, sequence_length, embed_size_per_head)`).
386
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
387
+ :obj:`past_key_values` input) to speed up sequential decoding.
388
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
389
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
390
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
391
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
392
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
393
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
394
+ sequence_length, sequence_length)`.
395
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
396
+ heads.
397
+ """
398
+
399
+ loss: Optional[torch.FloatTensor] = None
400
+ mc_loss: Optional[torch.FloatTensor] = None
401
+ logits: torch.FloatTensor = None
402
+ mc_logits: torch.FloatTensor = None
403
+ past_key_values: Optional[List[torch.FloatTensor]] = None
404
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
405
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
406
+
407
+
408
+ GPT2_START_DOCSTRING = r"""
409
+ This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
410
+ methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
411
+ pruning heads etc.)
412
+ This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
413
+ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
414
+ general usage and behavior.
415
+ Parameters:
416
+ config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
417
+ Initializing with a config file does not load the weights associated with the model, only the
418
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
419
+ weights.
420
+ """
421
+
422
+ GPT2_INPUTS_DOCSTRING = r"""
423
+ Args:
424
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
425
+ :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
426
+ ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
427
+ sequence tokens in the vocabulary.
428
+ If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
429
+ passed as ``input_ids``.
430
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
431
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
432
+ details.
433
+ `What are input IDs? <../glossary.html#input-ids>`__
434
+ past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
435
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
436
+ :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
437
+ have their past given to this model should not be passed as ``input_ids`` as they have already been
438
+ computed.
439
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
440
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
441
+ - 1 for tokens that are **not masked**,
442
+ - 0 for tokens that are **masked**.
443
+ `What are attention masks? <../glossary.html#attention-mask>`__
444
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
445
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
446
+ 1]``:
447
+ - 0 corresponds to a `sentence A` token,
448
+ - 1 corresponds to a `sentence B` token.
449
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
450
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
451
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
452
+ config.max_position_embeddings - 1]``.
453
+ `What are position IDs? <../glossary.html#position-ids>`_
454
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
455
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
456
+ - 1 indicates the head is **not masked**,
457
+ - 0 indicates the head is **masked**.
458
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
459
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
460
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
461
+ vectors than the model's internal embedding lookup matrix.
462
+ If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
463
+ :obj:`past_key_values`).
464
+ use_cache (:obj:`bool`, `optional`):
465
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
466
+ decoding (see :obj:`past_key_values`).
467
+ output_attentions (:obj:`bool`, `optional`):
468
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
469
+ tensors for more detail.
470
+ output_hidden_states (:obj:`bool`, `optional`):
471
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
472
+ more detail.
473
+ return_dict (:obj:`bool`, `optional`):
474
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
475
+ """
476
+ PARALLELIZE_DOCSTRING = r"""
477
+ Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
478
+ it will evenly distribute blocks across all devices.
479
+ Args:
480
+ device_map (:obj:`Dict[int, list]`, optional, defaults to None):
481
+ A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
482
+ automatically mapped to the first device (for esoteric reasons). That means that the first device should
483
+ have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
484
+ following number of attention modules:
485
+ - gpt2: 12
486
+ - gpt2-medium: 24
487
+ - gpt2-large: 36
488
+ - gpt2-xl: 48
489
+ Example::
490
+ # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
491
+ model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
492
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
493
+ 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
494
+ 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
495
+ 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}
496
+ model.parallelize(device_map)
497
+ """
498
+ DEPARALLELIZE_DOCSTRING = r"""
499
+ Moves the model to cpu from a model parallel state.
500
+ Example::
501
+ # On a 4 GPU machine with gpt2-large:
502
+ model = GPT2LMHeadModel.from_pretrained('gpt2-large')
503
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
504
+ 1: [8, 9, 10, 11, 12, 13, 14, 15],
505
+ 2: [16, 17, 18, 19, 20, 21, 22, 23],
506
+ 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
507
+ model.parallelize(device_map) # Splits the model across several devices
508
+ model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
509
+ """
510
+
511
+
512
+ @add_start_docstrings(
513
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
514
+ GPT2_START_DOCSTRING,
515
+ )
516
+ class GPT2Model(GPT2PreTrainedModel):
517
+ def __init__(self, config):
518
+ super().__init__(config)
519
+
520
+ self.wte = nn.Embedding(config.vocab_size, config.n_embd)
521
+ # self.wpe = nn.Embedding(config.n_positions, config.n_embd)
522
+ self.drop = nn.Dropout(config.embd_pdrop)
523
+ self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
524
+ self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
525
+
526
+ self.init_weights()
527
+ # Model parallel
528
+ self.model_parallel = False
529
+ self.device_map = None
530
+
531
+ self.use_layers = None
532
+
533
+     def set_layers(self, num_layers):
+         # Guard against None before comparing; otherwise the assert raises a TypeError.
+         if num_layers is not None:
+             assert 1 <= num_layers <= len(self.h)
+             num_layers -= 1
+         self.use_layers = num_layers
538
+
539
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
540
+ def parallelize(self, device_map=None):
541
+ # Check validity of device_map
542
+ self.device_map = (
543
+ get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
544
+ )
545
+ assert_device_map(self.device_map, len(self.h))
546
+ self.model_parallel = True
547
+ self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
548
+ self.last_device = "cuda:" + str(max(self.device_map.keys()))
549
+ self.wte = self.wte.to(self.first_device)
550
+ # self.wpe = self.wpe.to(self.first_device)  # wpe is disabled in __init__, nothing to move
551
+ # Load onto devices
552
+ for k, v in self.device_map.items():
553
+ for block in v:
554
+ cuda_device = "cuda:" + str(k)
555
+ self.h[block] = self.h[block].to(cuda_device)
556
+ # ln_f to last
557
+ self.ln_f = self.ln_f.to(self.last_device)
558
+
559
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
560
+ def deparallelize(self):
561
+ self.model_parallel = False
562
+ self.device_map = None
563
+ self.first_device = "cpu"
564
+ self.last_device = "cpu"
565
+ self.wte = self.wte.to("cpu")
566
+ self.wpe = self.wpe.to("cpu")
567
+ for index in range(len(self.h)):
568
+ self.h[index] = self.h[index].to("cpu")
569
+ self.ln_f = self.ln_f.to("cpu")
570
+ torch.cuda.empty_cache()
571
+
572
+ def get_input_embeddings(self):
573
+ return self.wte
574
+
575
+ def set_input_embeddings(self, new_embeddings):
576
+ self.wte = new_embeddings
577
+
578
+ def _prune_heads(self, heads_to_prune):
579
+ """
580
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
581
+ """
582
+ for layer, heads in heads_to_prune.items():
583
+ self.h[layer].attn.prune_heads(heads)
584
+
585
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
586
+ @add_code_sample_docstrings(
587
+ processor_class=_TOKENIZER_FOR_DOC,
588
+ checkpoint="gpt2",
589
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
590
+ config_class=_CONFIG_FOR_DOC,
591
+ )
592
+ def forward(
593
+ self,
594
+ input_ids=None,
595
+ past_key_values=None,
596
+ attention_mask=None,
597
+ token_type_ids=None,
598
+ position_ids=None,
599
+ head_mask=None,
600
+ inputs_embeds=None,
601
+ encoder_hidden_states=None,
602
+ encoder_attention_mask=None,
603
+ use_cache=None,
604
+ output_attentions=None,
605
+ output_hidden_states=None,
606
+ return_dict=None,
607
+ ):
608
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
609
+ output_hidden_states = (
610
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
611
+ )
612
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
613
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
614
+
615
+ if input_ids is not None and inputs_embeds is not None:
616
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
617
+ elif input_ids is not None:
618
+ input_shape = input_ids.size()
619
+ input_ids = input_ids.view(-1, input_shape[-1])
620
+ batch_size = input_ids.shape[0]
621
+ elif inputs_embeds is not None:
622
+ input_shape = inputs_embeds.size()[:-1]
623
+ batch_size = inputs_embeds.shape[0]
624
+ else:
625
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
626
+
627
+ if token_type_ids is not None:
628
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
629
+ if position_ids is not None:
630
+ position_ids = position_ids.view(-1, input_shape[-1])
631
+
632
+ if past_key_values is None:
633
+ past_length = 0
634
+ past_key_values = [None] * len(self.h)
635
+ else:
636
+ past_length = past_key_values[0][0].size(-2)
637
+     # Resolve the device up front; the cross-attention branch below also needs it.
+     device = input_ids.device if input_ids is not None else inputs_embeds.device
+     if position_ids is None:
+         position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+         position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
641
+
642
+ # Attention mask.
643
+ if attention_mask is not None:
644
+ assert batch_size > 0, "batch_size has to be defined and > 0"
645
+ attention_mask = attention_mask.view(batch_size, -1)
646
+ # We create a 3D attention mask from a 2D tensor mask.
647
+ # Sizes are [batch_size, 1, 1, to_seq_length]
648
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
649
+ # this attention mask is more simple than the triangular masking of causal attention
650
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
651
+ attention_mask = attention_mask[:, None, None, :]
652
+
653
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
654
+ # masked positions, this operation will create a tensor which is 0.0 for
655
+ # positions we want to attend and -10000.0 for masked positions.
656
+ # Since we are adding it to the raw scores before the softmax, this is
657
+ # effectively the same as removing these entirely.
658
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
659
+ attention_mask = (1.0 - attention_mask) * -10000.0
660
+
661
+ # If a 2D or 3D attention mask is provided for the cross-attention
662
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
663
+ if self.config.add_cross_attention and encoder_hidden_states is not None:
664
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
665
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
666
+ if encoder_attention_mask is None:
667
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
668
+ encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
669
+ else:
670
+ encoder_attention_mask = None
671
+
672
+ # Prepare head mask if needed
673
+ # 1.0 in head_mask indicate we keep the head
674
+ # attention_probs has shape bsz x n_heads x N x N
675
+ # head_mask has shape n_layer x batch x n_heads x N x N
676
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
677
+
678
+ if inputs_embeds is None:
679
+ inputs_embeds = self.wte(input_ids)
680
+ # position_embeds = self.wpe(position_ids)
681
+ hidden_states = inputs_embeds # + position_embeds
682
+
683
+ if token_type_ids is not None:
684
+ token_type_embeds = self.wte(token_type_ids)
685
+ hidden_states = hidden_states + token_type_embeds
686
+
687
+ hidden_states = self.drop(hidden_states)
688
+
689
+ output_shape = input_shape + (hidden_states.size(-1),)
690
+
691
+ presents = () if use_cache else None
692
+ all_self_attentions = () if output_attentions else None
693
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
694
+ all_hidden_states = () if output_hidden_states else None
695
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
696
+
697
+ if self.use_layers is not None and i >= self.use_layers:
698
+ break
699
+
700
+ # Model parallel
701
+ if self.model_parallel:
702
+ torch.cuda.set_device(hidden_states.device)
703
+ # Ensure layer_past is on same device as hidden_states (might not be correct)
704
+ if layer_past is not None:
705
+ layer_past = layer_past.to(hidden_states.device)
706
+ # Ensure that attention_mask is always on the same device as hidden_states
707
+ if attention_mask is not None:
708
+ attention_mask = attention_mask.to(hidden_states.device)
709
+ if isinstance(head_mask, torch.Tensor):
710
+ head_mask = head_mask.to(hidden_states.device)
711
+ if output_hidden_states:
712
+ all_hidden_states = all_hidden_states + (hidden_states,)
713
+
714
+ if getattr(self.config, "gradient_checkpointing", False):
715
+
716
+ def create_custom_forward(module):
717
+ def custom_forward(*inputs):
718
+ # checkpointing only works with tuple returns, not with lists
719
+ return tuple(output for output in module(*inputs, use_cache, output_attentions))
720
+
721
+ return custom_forward
722
+
723
+ outputs = torch.utils.checkpoint.checkpoint(
724
+ create_custom_forward(block),
725
+ hidden_states,
726
+ layer_past,
727
+ attention_mask,
728
+ head_mask[i],
729
+ encoder_hidden_states,
730
+ encoder_attention_mask,
731
+ )
732
+ else:
733
+ outputs = block(
734
+ hidden_states,
735
+ layer_past=layer_past,
736
+ attention_mask=attention_mask,
737
+ head_mask=head_mask[i],
738
+ encoder_hidden_states=encoder_hidden_states,
739
+ encoder_attention_mask=encoder_attention_mask,
740
+ use_cache=use_cache,
741
+ output_attentions=output_attentions,
742
+ )
743
+
744
+ hidden_states, present = outputs[:2]
745
+ if use_cache is True:
746
+ presents = presents + (present,)
747
+
748
+ if output_attentions:
749
+ all_self_attentions = all_self_attentions + (outputs[2],)
750
+ if self.config.add_cross_attention:
751
+ all_cross_attentions = all_cross_attentions + (outputs[3],)
752
+
753
+ # Model Parallel: If it's the last layer for that device, put things on the next device
754
+ if self.model_parallel:
755
+ for k, v in self.device_map.items():
756
+ if i == v[-1] and "cuda:" + str(k) != self.last_device:
757
+ hidden_states = hidden_states.to("cuda:" + str(k + 1))
758
+
759
+ hidden_states = self.ln_f(hidden_states)
760
+
761
+ hidden_states = hidden_states.view(*output_shape)
762
+ # Add last hidden state
763
+ if output_hidden_states:
764
+ all_hidden_states = all_hidden_states + (hidden_states,)
765
+
766
+ if not return_dict:
767
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
768
+
769
+ return BaseModelOutputWithPastAndCrossAttentions(
770
+ last_hidden_state=hidden_states,
771
+ past_key_values=presents,
772
+ hidden_states=all_hidden_states,
773
+ attentions=all_self_attentions,
774
+ cross_attentions=all_cross_attentions,
775
+ )
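
A minimal smoke-test sketch for the position-embedding-free `GPT2Model` defined above. It assumes the vendored file's `transformers` imports resolve as in upstream; the tiny config values are placeholders.

import torch
from transformers import GPT2Config  # assumed available, as in upstream transformers

# Trajectory models feed pre-computed embeddings, so position information
# must be baked into inputs_embeds by the caller (wpe is disabled above).
config = GPT2Config(vocab_size=1, n_embd=128, n_layer=4, n_head=4)
model = GPT2Model(config)

embeds = torch.randn(2, 10, 128)  # (batch, seq_len, n_embd)
out = model(inputs_embeds=embeds, return_dict=True)
print(out.last_hidden_state.shape)  # -> torch.Size([2, 10, 128])
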
third_party/AnyBimanual/agents/peract_bimanual/visual_aligner.py ADDED
@@ -0,0 +1,39 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ class VisualAligner(nn.Module):
6
+ def __init__(self, input_dim=128, hidden_dim=256, mask_dim=128):
7
+ super(VisualAligner, self).__init__()
8
+
9
+ self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
10
+
11
+ self.conv_res1 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
12
+ self.conv_res2 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
13
+
14
+ self.conv2_right = nn.Conv1d(in_channels=hidden_dim, out_channels=mask_dim, kernel_size=3, padding=1)
15
+ self.conv2_left = nn.Conv1d(in_channels=hidden_dim, out_channels=mask_dim, kernel_size=3, padding=1)
16
+
17
+ self.activation = nn.ReLU()
18
+
19
+ def forward(self, ins):
20
+ ins = ins.transpose(1, 2)
21
+
22
+ features = self.activation(self.conv1(ins))
23
+
24
+ residual = features
25
+ features = self.activation(self.conv_res1(features))
26
+ features = self.conv_res2(features)
27
+ features = features + residual
28
+
29
+ mask_right = self.activation(self.conv2_right(features))
30
+ mask_left = self.activation(self.conv2_left(features))
31
+
32
+ mask_right = mask_right.transpose(1, 2)
33
+ mask_left = mask_left.transpose(1, 2)
34
+ ins = ins.transpose(1, 2)
35
+
36
+ masked_ins1 = ins * mask_right
37
+ masked_ins2 = ins * mask_left
38
+
39
+ return masked_ins1, masked_ins2
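
A quick shape-check sketch for `VisualAligner`; the batch and token counts are placeholders, and the last-axis feature width must equal `input_dim`, as the transposes in `forward` imply.

import torch

aligner = VisualAligner(input_dim=128, hidden_dim=256, mask_dim=128)
feats = torch.randn(4, 77, 128)   # (batch, num_tokens, input_dim)
right, left = aligner(feats)      # per-arm soft masks applied to the input
print(right.shape, left.shape)    # -> torch.Size([4, 77, 128]) twice
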
third_party/AnyBimanual/agents/replay_utils.py ADDED
@@ -0,0 +1,667 @@
1
+
2
+ import logging
3
+ from typing import List
4
+ import os
5
+ import numpy as np
6
+ from rlbench.backend.observation import Observation
7
+ from rlbench.observation_config import ObservationConfig
8
+ import rlbench.utils as rlbench_utils
9
+ from rlbench.demo import Demo
10
+ from yarr.replay_buffer.replay_buffer import ReplayBuffer
11
+
12
+ from helpers import demo_loading_utils, utils
13
+ from helpers import observation_utils
14
+ from helpers.clip.core.clip import tokenize
15
+
16
+
17
+ from yarr.replay_buffer.prioritized_replay_buffer import ObservationElement
18
+ from yarr.replay_buffer.replay_buffer import ReplayElement
19
+ from yarr.replay_buffer.task_uniform_replay_buffer import TaskUniformReplayBuffer
20
+
21
+
22
+ import torch
23
+ from torch.multiprocessing import Process, Value, Manager
24
+ from helpers.clip.core.clip import build_model, load_clip
25
+ from omegaconf import DictConfig
26
+
27
+
28
+ REWARD_SCALE = 100.0
29
+ LOW_DIM_SIZE = 4
30
+
31
+
32
+ def create_replay(cfg, replay_path):
33
+
34
+ if cfg.method.robot_name == "bimanual":
35
+ return create_bimanual_replay(
36
+ cfg.replay.batch_size,
37
+ cfg.replay.timesteps,
38
+ cfg.replay.prioritisation,
39
+ cfg.replay.task_uniform,
40
+ replay_path if cfg.replay.use_disk else None,
41
+ cfg.rlbench.cameras,
42
+ cfg.method.voxel_sizes,
43
+ cfg.rlbench.camera_resolution,
44
+ )
45
+ else:
46
+ return create_unimanual_replay(
47
+ cfg.replay.batch_size,
48
+ cfg.replay.timesteps,
49
+ cfg.replay.prioritisation,
50
+ cfg.replay.task_uniform,
51
+ replay_path if cfg.replay.use_disk else None,
52
+ cfg.rlbench.cameras,
53
+ cfg.method.voxel_sizes,
54
+ cfg.rlbench.camera_resolution,
55
+ )
56
+
57
+
58
+
59
+ def create_bimanual_replay(
60
+ batch_size: int,
61
+ timesteps: int,
62
+ prioritisation: bool,
63
+ task_uniform: bool,
64
+ save_dir: str,
65
+ cameras: list,
66
+ voxel_sizes,
67
+ image_size=[128, 128],
68
+ replay_size=3e5,
69
+ ):
70
+ trans_indicies_size = 3 * len(voxel_sizes)
71
+ rot_and_grip_indicies_size = 3 + 1
72
+ gripper_pose_size = 7
73
+ ignore_collisions_size = 1
74
+ max_token_seq_len = 77
75
+ lang_feat_dim = 1024
76
+ lang_emb_dim = 512
77
+
78
+ # low_dim_state
79
+ observation_elements = []
80
+ observation_elements.append(
81
+ ObservationElement("right_low_dim_state", (LOW_DIM_SIZE,), np.float32)
82
+ )
83
+ observation_elements.append(
84
+ ObservationElement("left_low_dim_state", (LOW_DIM_SIZE,), np.float32)
85
+ )
86
+
87
+ # rgb, depth, point cloud, intrinsics, extrinsics
88
+ for cname in cameras:
89
+ observation_elements.append(
90
+ # color, height, width
91
+ ObservationElement(
92
+ "%s_rgb" % cname,
93
+ (
94
+ 3,
95
+ image_size[1],
96
+ image_size[0],
97
+ ),
98
+ np.float32,
99
+ )
100
+ )
101
+ observation_elements.append(
102
+ ObservationElement("%s_point_cloud" % cname, (3, image_size[1], image_size[0]), np.float16)
103
+ ) # see pyrep/objects/vision_sensor.py on how pointclouds are extracted from depth frames
104
+ observation_elements.append(
105
+ ObservationElement(
106
+ "%s_camera_extrinsics" % cname,
107
+ (
108
+ 4,
109
+ 4,
110
+ ),
111
+ np.float32,
112
+ )
113
+ )
114
+ observation_elements.append(
115
+ ObservationElement(
116
+ "%s_camera_intrinsics" % cname,
117
+ (
118
+ 3,
119
+ 3,
120
+ ),
121
+ np.float32,
122
+ )
123
+ )
124
+
125
+ # discretized translation, discretized rotation, discrete ignore collision, 6-DoF gripper pose, and pre-trained language embeddings
126
+ for robot_name in ["right", "left"]:
127
+ observation_elements.extend(
128
+ [
129
+ ReplayElement(
130
+ f"{robot_name}_trans_action_indicies",
131
+ (trans_indicies_size,),
132
+ np.int32,
133
+ ),
134
+ ReplayElement(
135
+ f"{robot_name}_rot_grip_action_indicies",
136
+ (rot_and_grip_indicies_size,),
137
+ np.int32,
138
+ ),
139
+ ReplayElement(
140
+ f"{robot_name}_ignore_collisions",
141
+ (ignore_collisions_size,),
142
+ np.int32,
143
+ ),
144
+ ReplayElement(
145
+ f"{robot_name}_gripper_pose", (gripper_pose_size,), np.float32
146
+ ),
147
+ ]
148
+ )
149
+
150
+ observation_elements.extend(
151
+ [
152
+ ReplayElement("lang_goal_emb", (lang_feat_dim,), np.float32),
153
+ ReplayElement(
154
+ "lang_token_embs",
155
+ (
156
+ max_token_seq_len,
157
+ lang_emb_dim,
158
+ ),
159
+ np.float32,
160
+ ), # extracted from CLIP's language encoder
161
+ ReplayElement("task", (), str),
162
+ ReplayElement(
163
+ "lang_goal", (1,), object
164
+ ), # language goal string for debugging and visualization
165
+ ]
166
+ )
167
+
168
+ extra_replay_elements = [
169
+ ReplayElement("demo", (), bool),
170
+ ]
171
+
172
+ replay_buffer = TaskUniformReplayBuffer(
173
+ save_dir=save_dir,
174
+ batch_size=batch_size,
175
+ timesteps=timesteps,
176
+ replay_capacity=int(replay_size),
177
+ action_shape=(8 * 2,),
178
+ action_dtype=np.float32,
179
+ reward_shape=(),
180
+ reward_dtype=np.float32,
181
+ update_horizon=1,
182
+ observation_elements=observation_elements,
183
+ extra_replay_elements=extra_replay_elements,
184
+ )
185
+ return replay_buffer
186
+
187
+ def create_unimanual_replay(
188
+ batch_size: int,
189
+ timesteps: int,
190
+ prioritisation: bool,
191
+ task_uniform: bool,
192
+ save_dir: str,
193
+ cameras: list,
194
+ voxel_sizes,
195
+ image_size=[128, 128],
196
+ replay_size=3e5,
197
+ ):
198
+ trans_indicies_size = 3 * len(voxel_sizes)
199
+ rot_and_grip_indicies_size = 3 + 1
200
+ gripper_pose_size = 7
201
+ ignore_collisions_size = 1
202
+ max_token_seq_len = 77
203
+ lang_feat_dim = 1024
204
+ lang_emb_dim = 512
205
+
206
+ # low_dim_state
207
+ observation_elements = []
208
+ observation_elements.append(
209
+ ObservationElement("low_dim_state", (LOW_DIM_SIZE,), np.float32)
210
+ )
211
+
212
+ # rgb, depth, point cloud, intrinsics, extrinsics
213
+ for cname in cameras:
214
+ observation_elements.append(
215
+ ObservationElement(
216
+ "%s_rgb" % cname,
217
+ (
218
+ 3,
219
+ *image_size,
220
+ ),
221
+ np.float32,
222
+ )
223
+ )
224
+ observation_elements.append(
225
+ ObservationElement("%s_point_cloud" % cname, (3, *image_size), np.float32)
226
+ ) # see pyrep/objects/vision_sensor.py on how pointclouds are extracted from depth frames
227
+ observation_elements.append(
228
+ ObservationElement(
229
+ "%s_camera_extrinsics" % cname,
230
+ (
231
+ 4,
232
+ 4,
233
+ ),
234
+ np.float32,
235
+ )
236
+ )
237
+ observation_elements.append(
238
+ ObservationElement(
239
+ "%s_camera_intrinsics" % cname,
240
+ (
241
+ 3,
242
+ 3,
243
+ ),
244
+ np.float32,
245
+ )
246
+ )
247
+
248
+ # discretized translation, discretized rotation, discrete ignore collision, 6-DoF gripper pose, and pre-trained language embeddings
249
+ observation_elements.extend(
250
+ [
251
+ ReplayElement("trans_action_indicies", (trans_indicies_size,), np.int32),
252
+ ReplayElement(
253
+ "rot_grip_action_indicies", (rot_and_grip_indicies_size,), np.int32
254
+ ),
255
+ ReplayElement("ignore_collisions", (ignore_collisions_size,), np.int32),
256
+ ReplayElement("gripper_pose", (gripper_pose_size,), np.float32),
257
+ ReplayElement("lang_goal_emb", (lang_feat_dim,), np.float32),
258
+ ReplayElement(
259
+ "lang_token_embs",
260
+ (
261
+ max_token_seq_len,
262
+ lang_emb_dim,
263
+ ),
264
+ np.float32,
265
+ ), # extracted from CLIP's language encoder
266
+ ReplayElement("task", (), str),
267
+ ReplayElement(
268
+ "lang_goal", (1,), object
269
+ ), # language goal string for debugging and visualization
270
+ ]
271
+ )
272
+
273
+ extra_replay_elements = [
274
+ ReplayElement("demo", (), bool),
275
+ ]
276
+
277
+ replay_buffer = TaskUniformReplayBuffer(
278
+ save_dir=save_dir,
279
+ batch_size=batch_size,
280
+ timesteps=timesteps,
281
+ replay_capacity=int(replay_size),
282
+ action_shape=(8,),
283
+ action_dtype=np.float32,
284
+ reward_shape=(),
285
+ reward_dtype=np.float32,
286
+ update_horizon=1,
287
+ observation_elements=observation_elements,
288
+ extra_replay_elements=extra_replay_elements,
289
+ )
290
+ return replay_buffer
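
A hedged construction sketch for `create_replay`: the field names mirror the attribute accesses above, while the concrete values and camera names are placeholders, not the project's defaults.

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "method": {"robot_name": "bimanual", "voxel_sizes": [100]},
    "replay": {"batch_size": 8, "timesteps": 1, "prioritisation": False,
               "task_uniform": True, "use_disk": False},
    "rlbench": {"cameras": ["front", "overhead"],       # placeholder camera set
                "camera_resolution": [128, 128]},
})
replay = create_replay(cfg, replay_path="/tmp/replay")  # save_dir is None when use_disk=False
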
291
+
292
+
293
+
294
+ def _get_action(
295
+ obs_tp1: Observation,
296
+ obs_tm1: Observation,
297
+ rlbench_scene_bounds: List[float], # metric 3D bounds of the scene
298
+ voxel_sizes: List[int],
299
+ bounds_offset: List[float],
300
+ rotation_resolution: int,
301
+ crop_augmentation: bool,
302
+ ):
303
+ quat = utils.normalize_quaternion(obs_tp1.gripper_pose[3:])
304
+ if quat[-1] < 0:
305
+ quat = -quat
306
+ disc_rot = utils.quaternion_to_discrete_euler(quat, rotation_resolution)
307
+ disc_rot = utils.correct_rotation_instability(disc_rot, rotation_resolution)
308
+
309
+ attention_coordinate = obs_tp1.gripper_pose[:3]
310
+ trans_indicies, attention_coordinates = [], []
311
+ bounds = np.array(rlbench_scene_bounds)
312
+ ignore_collisions = int(obs_tm1.ignore_collisions)
313
+ for depth, vox_size in enumerate(
314
+ voxel_sizes
315
+ ): # only single voxelization-level is used in PerAct
316
+ if depth > 0:
317
+ if crop_augmentation:
318
+ shift = bounds_offset[depth - 1] * 0.75
319
+ attention_coordinate += np.random.uniform(-shift, shift, size=(3,))
320
+ bounds = np.concatenate(
321
+ [
322
+ attention_coordinate - bounds_offset[depth - 1],
323
+ attention_coordinate + bounds_offset[depth - 1],
324
+ ]
325
+ )
326
+ index = utils.point_to_voxel_index(obs_tp1.gripper_pose[:3], vox_size, bounds)
327
+ trans_indicies.extend(index.tolist())
328
+ res = (bounds[3:] - bounds[:3]) / vox_size
329
+ attention_coordinate = bounds[:3] + res * index
330
+ attention_coordinates.append(attention_coordinate)
331
+
332
+ rot_and_grip_indicies = disc_rot.tolist()
333
+ grip = float(obs_tp1.gripper_open)
334
+ rot_and_grip_indicies.extend([int(obs_tp1.gripper_open)])
335
+ return (
336
+ trans_indicies,
337
+ rot_and_grip_indicies,
338
+ ignore_collisions,
339
+ np.concatenate([obs_tp1.gripper_pose, np.array([grip])]),
340
+ attention_coordinates,
341
+ )
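
An illustrative reimplementation of the discretisation `_get_action` delegates to `utils.point_to_voxel_index` (the real helper lives in `helpers.utils`; this is only a sketch of the arithmetic).

import numpy as np

def point_to_voxel_index_sketch(point, voxel_size, bounds):
    """Map a metric 3D point to integer voxel indices inside `bounds`."""
    bounds = np.asarray(bounds, dtype=np.float32)
    bb_min, bb_max = bounds[:3], bounds[3:]
    res = (bb_max - bb_min) / voxel_size          # metres per voxel along each axis
    idx = np.floor((np.asarray(point) - bb_min) / res).astype(np.int32)
    return np.clip(idx, 0, voxel_size - 1)        # clamp points on the far boundary

# Scene bounds in the [x_min, y_min, z_min, x_max, y_max, z_max] layout used above.
print(point_to_voxel_index_sketch([0.2, 0.0, 1.105], 100,
                                  [-0.3, -0.5, 0.6, 0.7, 0.5, 1.6]))  # -> [50 50 50]
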
342
+
343
+
344
+ def _add_keypoints_to_replay(
345
+ cfg: DictConfig,
346
+ task: str,
347
+ replay: ReplayBuffer,
348
+ inital_obs: Observation,
349
+ demo: Demo,
350
+ episode_keypoints: List[int],
351
+
352
+ description: str = "",
353
+ clip_model=None,
354
+ device="cpu",
355
+ ):
356
+
357
+ cameras = cfg.rlbench.cameras
358
+ rlbench_scene_bounds = cfg.rlbench.scene_bounds
359
+ voxel_sizes = cfg.method.voxel_sizes
360
+ bounds_offset = cfg.method.bounds_offset
361
+ rotation_resolution = cfg.method.rotation_resolution
362
+ crop_augmentation = cfg.method.crop_augmentation
363
+ robot_name = cfg.method.robot_name
364
+
365
+ prev_action = None
366
+ obs = inital_obs
367
+
368
+ for k, keypoint in enumerate(episode_keypoints):
369
+ obs_tp1 = demo[keypoint]
370
+ obs_tm1 = demo[max(0, keypoint - 1)]
371
+
372
+ if obs_tp1.is_bimanual and robot_name == "bimanual":
373
+ #assert isinstance(obs_tp1, BimanualObservation)
374
+ (
375
+ right_trans_indicies,
376
+ right_rot_grip_indicies,
377
+ right_ignore_collisions,
378
+ right_action,
379
+ right_attention_coordinates,
380
+ ) = _get_action(
381
+ obs_tp1.right,
382
+ obs_tm1.right,
383
+ rlbench_scene_bounds,
384
+ voxel_sizes,
385
+ bounds_offset,
386
+ rotation_resolution,
387
+ crop_augmentation,
388
+ )
389
+
390
+ (
391
+ left_trans_indicies,
392
+ left_rot_grip_indicies,
393
+ left_ignore_collisions,
394
+ left_action,
395
+ left_attention_coordinates,
396
+ ) = _get_action(
397
+ obs_tp1.left,
398
+ obs_tm1.left,
399
+ rlbench_scene_bounds,
400
+ voxel_sizes,
401
+ bounds_offset,
402
+ rotation_resolution,
403
+ crop_augmentation,
404
+ )
405
+
406
+ action = np.append(right_action, left_action)
407
+
408
+ right_ignore_collisions = np.array([right_ignore_collisions])
409
+ left_ignore_collisions = np.array([left_ignore_collisions])
410
+
411
+ elif robot_name == "unimanual":
412
+ (
413
+ trans_indicies,
414
+ rot_grip_indicies,
415
+ ignore_collisions,
416
+ action,
417
+ attention_coordinates,
418
+ ) = _get_action(
419
+ obs_tp1,
420
+ obs_tm1,
421
+ rlbench_scene_bounds,
422
+ voxel_sizes,
423
+ bounds_offset,
424
+ rotation_resolution,
425
+ crop_augmentation,
426
+ )
427
+ gripper_pose = obs_tp1.gripper_pose
428
+ elif obs_tp1.is_bimanual and robot_name == "right":
429
+ (
430
+ trans_indicies,
431
+ rot_grip_indicies,
432
+ ignore_collisions,
433
+ action,
434
+ attention_coordinates,
435
+ ) = _get_action(
436
+ obs_tp1.right,
437
+ obs_tm1.right,
438
+ rlbench_scene_bounds,
439
+ voxel_sizes,
440
+ bounds_offset,
441
+ rotation_resolution,
442
+ crop_augmentation,
443
+ )
444
+ gripper_pose = obs_tp1.right.gripper_pose
445
+ elif obs_tp1.is_bimanual and robot_name == "left":
446
+ (
447
+ trans_indicies,
448
+ rot_grip_indicies,
449
+ ignore_collisions,
450
+ action,
451
+ attention_coordinates,
452
+ ) = _get_action(
453
+ obs_tp1.left,
454
+ obs_tm1.left,
455
+ rlbench_scene_bounds,
456
+ voxel_sizes,
457
+ bounds_offset,
458
+ rotation_resolution,
459
+ crop_augmentation,
460
+ )
461
+ gripper_pose = obs_tp1.left.gripper_pose
462
+ else:
463
+ logging.error("Invalid robot name %s", cfg.method.robot_name)
464
+ raise Exception("Invalid robot name.")
465
+
466
+ terminal = k == len(episode_keypoints) - 1
467
+ reward = float(terminal) * REWARD_SCALE if terminal else 0
468
+
469
+ obs_dict = observation_utils.extract_obs(
470
+ obs,
471
+ t=k,
472
+ prev_action=prev_action,
473
+ cameras=cameras,
474
+ episode_length=cfg.rlbench.episode_length,
475
+ robot_name=robot_name
476
+ )
477
+ tokens = tokenize([description]).numpy()
478
+ token_tensor = torch.from_numpy(tokens).to(device)
479
+ sentence_emb, token_embs = clip_model.encode_text_with_embeddings(token_tensor)
480
+ obs_dict["lang_goal_emb"] = sentence_emb[0].float().detach().cpu().numpy()
481
+ obs_dict["lang_token_embs"] = token_embs[0].float().detach().cpu().numpy()
482
+
483
+ prev_action = np.copy(action)
484
+
485
+ others = {"demo": True}
486
+ if robot_name == "bimanual":
487
+ final_obs = {
488
+ "right_trans_action_indicies": right_trans_indicies,
489
+ "right_rot_grip_action_indicies": right_rot_grip_indicies,
490
+ "right_gripper_pose": obs_tp1.right.gripper_pose,
491
+ "left_trans_action_indicies": left_trans_indicies,
492
+ "left_rot_grip_action_indicies": left_rot_grip_indicies,
493
+ "left_gripper_pose": obs_tp1.left.gripper_pose,
494
+ "task": task,
495
+ "lang_goal": np.array([description], dtype=object),
496
+ }
497
+ else:
498
+ final_obs = {
499
+ "trans_action_indicies": trans_indicies,
500
+ "rot_grip_action_indicies": rot_grip_indicies,
501
+ "gripper_pose": gripper_pose,
502
+ "task": task,
503
+ "lang_goal": np.array([description], dtype=object),
504
+ }
505
+
506
+ others.update(final_obs)
507
+ others.update(obs_dict)
508
+
509
+ timeout = False
510
+ replay.add(action, reward, terminal, timeout, **others)
511
+ obs = obs_tp1
512
+
513
+ # final step
514
+ obs_dict_tp1 = observation_utils.extract_obs(
515
+ obs_tp1,
516
+ t=k + 1,
517
+ prev_action=prev_action,
518
+ cameras=cameras,
519
+ episode_length=cfg.rlbench.episode_length,
520
+ robot_name=cfg.method.robot_name
521
+ )
522
+ obs_dict_tp1["lang_goal_emb"] = sentence_emb[0].float().detach().cpu().numpy()
523
+ obs_dict_tp1["lang_token_embs"] = token_embs[0].float().detach().cpu().numpy()
524
+
525
+ obs_dict_tp1.pop("wrist_world_to_cam", None)
526
+ obs_dict_tp1.update(final_obs)
527
+ replay.add_final(**obs_dict_tp1)
528
+
529
+ def check_if_replay_exists(task: str, d_idx: int, replay_path: str):
530
+ replay_file = os.path.join(replay_path, f"{task}_replay_{d_idx}.pkl")
531
+ return os.path.exists(replay_file)
532
+
533
+ def fill_replay(
534
+ cfg: DictConfig,
535
+ obs_config: ObservationConfig,
536
+ rank: int,
537
+ replay: ReplayBuffer,
538
+ task: str,
539
+ clip_model=None,
540
+ device="cpu",
541
+ ):
542
+
543
+ num_demos=cfg.rlbench.demos
544
+ demo_augmentation=cfg.method.demo_augmentation
545
+ demo_augmentation_every_n=cfg.method.demo_augmentation_every_n
546
+ keypoint_method=cfg.method.keypoint_method
547
+
548
+
549
+ if clip_model is None:
550
+ model, _ = load_clip("RN50", jit=False, device=device)
551
+ clip_model = build_model(model.state_dict())
552
+ clip_model.to(device)
553
+ del model
554
+
555
+ task_folder = cfg.replay.task_folder
556
+ replay_path = os.path.join(
557
+ cfg.replay.path, task_folder
558
+ )
559
+ logging.debug("Filling %s replay ..." % task)
560
+ for d_idx in range(num_demos):
561
+ # load demo from disk
562
+ if check_if_replay_exists(task, d_idx, replay_path):
563
+ logging.info(f"Replay for demo {d_idx} already exists, skipping...")
564
+ continue
565
+ demo = rlbench_utils.get_stored_demos(
566
+ amount=1,
567
+ image_paths=False,
568
+ dataset_root=cfg.rlbench.demo_path,
569
+ variation_number=-1,
570
+ task_name=task,
571
+ obs_config=obs_config,
572
+ random_selection=False,
573
+ from_episode_number=d_idx,
574
+ )[0]
575
+
576
+ descs = demo._observations[0].misc["descriptions"]
577
+
578
+ # extract keypoints (a.k.a keyframes)
579
+ episode_keypoints = demo_loading_utils.keypoint_discovery(
580
+ demo, method=keypoint_method
581
+ )
582
+
583
+ if rank == 0:
584
+ logging.info(
585
+ f"Loading Demo({d_idx}) - found {len(episode_keypoints)} keypoints - {task}"
586
+ )
587
+
588
+ for i in range(len(demo) - 1):
589
+ if not demo_augmentation and i > 0:
590
+ break
591
+ if i % demo_augmentation_every_n != 0:
592
+ continue
593
+
594
+ obs = demo[i]
595
+ desc = descs[0]
596
+ # if our starting point is past one of the keypoints, then remove it
597
+ while len(episode_keypoints) > 0 and i >= episode_keypoints[0]:
598
+ episode_keypoints = episode_keypoints[1:]
599
+ if len(episode_keypoints) == 0:
600
+ break
601
+ _add_keypoints_to_replay(
602
+ cfg,
603
+ task,
604
+ replay,
605
+ obs,
606
+ demo,
607
+ episode_keypoints,
608
+ description=desc,
609
+ clip_model=clip_model,
610
+ device=device,
611
+ )
612
+ logging.debug("Replay %s filled with demos." % task)
613
+
614
+
615
+ def fill_multi_task_replay(
616
+ cfg: DictConfig,
617
+ obs_config: ObservationConfig,
618
+ rank: int,
619
+ replay: ReplayBuffer,
620
+ tasks: List[str],
621
+ clip_model=None,
622
+ ):
623
+
624
+ tasks = cfg.rlbench.tasks
625
+
626
+ manager = Manager()
627
+ store = manager.dict()
628
+
629
+ # create a MP dict for storing indicies
630
+ # TODO(mohit): this shouldn't be initialized here
631
+ del replay._task_idxs
632
+ task_idxs = manager.dict()
633
+ replay._task_idxs = task_idxs
634
+ replay._create_storage(store)
635
+ replay.add_count = Value("i", 0)
636
+
637
+ # fill replay buffer in parallel across tasks
638
+ max_parallel_processes = cfg.replay.max_parallel_processes
639
+ processes = []
640
+ n = np.arange(len(tasks))
641
+ split_n = utils.split_list(n, max_parallel_processes)
642
+ for split in split_n:
643
+ for e_idx, task_idx in enumerate(split):
644
+ task = tasks[int(task_idx)]
645
+ model_device = torch.device(
646
+ "cuda:%s" % (e_idx % torch.cuda.device_count())
647
+ if torch.cuda.is_available()
648
+ else "cpu"
649
+ )
650
+ p = Process(
651
+ target=fill_replay,
652
+ args=(
653
+ cfg,
654
+ obs_config,
655
+ rank,
656
+ replay,
657
+ task,
658
+ clip_model,
659
+ model_device
660
+ ),
661
+ )
662
+
663
+ p.start()
664
+ processes.append(p)
665
+
666
+ for p in processes:
667
+ p.join()
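
The parallel fill relies on a standard shared-state pattern from `torch.multiprocessing`; a minimal, self-contained sketch of that pattern (the worker and keys here are placeholders).

from torch.multiprocessing import Manager, Process

def _worker(store, task):
    store[task] = f"filled {task}"  # each process writes into the shared dict

if __name__ == "__main__":
    store = Manager().dict()        # proxy object visible to all workers
    procs = [Process(target=_worker, args=(store, t)) for t in ("lift", "push")]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(dict(store))              # {'lift': 'filled lift', 'push': 'filled push'}
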
third_party/AnyBimanual/agents/rvt/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """RVT package marker.
2
+
3
+ Keep package import side-effect free so downstream code can import the
4
+ visual stack without pulling in the full training launcher and its
5
+ optional dependencies.
6
+ """
third_party/AnyBimanual/agents/rvt/launch_utils.py ADDED
@@ -0,0 +1,221 @@
1
+ import os
2
+ from typing import List
3
+ import torch
4
+ import numpy as np
5
+
6
+ from omegaconf import DictConfig
7
+
8
+ from yarr.agents.agent import Agent
9
+ from yarr.agents.agent import ActResult
10
+ from yarr.agents.agent import Summary
11
+ from yarr.agents.agent import ScalarSummary
12
+ import wandb
13
+ from torch.nn.parallel import DistributedDataParallel as DDP
14
+ import pickle
15
+ from helpers.preprocess_agent import PreprocessAgent
16
+ from agents.rvt.rvt.models.skill_manager import SkillManager
17
+ from agents.rvt.rvt.models.visual_aligner import VisualAligner
18
+
19
+ from agents.rvt.rvt.mvt.mvt import MVT
20
+ from agents.rvt.rvt.models import rvt_agent
21
+ from agents.rvt.rvt.utils.peract_utils import (
22
+ CAMERAS,
23
+ SCENE_BOUNDS,
24
+ IMAGE_SIZE,
25
+ DATA_FOLDER,
26
+ )
27
+
28
+
29
+ import agents.rvt.rvt.config as exp_cfg_mod
+ import agents.rvt.rvt.mvt.config as mvt_cfg_mod
33
+
34
+ def create_agent(cfg: DictConfig):
35
+
36
+ exp_cfg = exp_cfg_mod.get_cfg_defaults()
37
+ exp_cfg.bs = cfg.replay.batch_size
38
+ exp_cfg.tasks = ','.join(cfg.rlbench.tasks)
39
+
40
+ exp_cfg.freeze()
41
+
42
+ mvt_cfg = mvt_cfg_mod.get_cfg_defaults()
43
+ mvt_cfg.proprio_dim = cfg.method.low_dim_size
44
+ mvt_cfg.freeze()
45
+
46
+ current_dir = os.path.dirname(os.path.abspath(__file__))
47
+ pkl_path = os.path.join(current_dir, "../../lang_token.pkl")
48
+ pkl_path = os.path.abspath(pkl_path)
49
+ with open(pkl_path, "rb") as f:
50
+ embeddings_dict = pickle.load(f)
51
+ flattened_embeddings = []
52
+ for key in embeddings_dict.keys():
53
+ embedding = torch.tensor(embeddings_dict[key])
54
+ flattened_embedding = embedding.view(-1)
55
+ flattened_embeddings.append(flattened_embedding)
56
+ embeddings_matrix = torch.stack(flattened_embeddings)
57
+ skill_manager = SkillManager(num_classes=18, embedding_matrix=embeddings_matrix)
58
+ visual_aligner = VisualAligner()
59
+ agent = RVTAgentWrapper(cfg.framework.checkpoint_name_prefix, cfg.rlbench, mvt_cfg, exp_cfg, skill_manager, visual_aligner)
60
+
61
+
62
+ preprocess_agent = PreprocessAgent(pose_agent=agent)
63
+ return preprocess_agent
64
+
65
+
66
+
67
+ class RVTAgentWrapper(Agent):
68
+
69
+ def __init__(self, checkpoint_name_prefix, rlbench_cfg, mvt_cfg, exp_cfg, skill_manager, visual_aligner):
70
+ self._checkpoint_filename = f"{checkpoint_name_prefix}.pt"
71
+ self.rvt_agent = None
72
+ self.rlbench_cfg = rlbench_cfg
73
+ self.mvt_cfg = mvt_cfg
74
+ self.exp_cfg = exp_cfg
75
+ self._summaries = {}
76
+ self.skill_manager = skill_manager
77
+ self.visual_aligner = visual_aligner
78
+
79
+ def build(self, training: bool, device=None) -> None:
80
+
81
+ import torch
82
+ torch.cuda.set_device(device)
83
+ torch.cuda.empty_cache()
84
+ self._device = device
85
+ if isinstance(device, int):
86
+ device = f"cuda:{device}"
87
+
88
+ rvt = MVT(
89
+ renderer_device=device,
90
+ **self.mvt_cfg,
91
+ )
92
+ rvt = rvt.to(device)
93
+
94
+ if training:
95
+ rvt = DDP(rvt, device_ids=[device])
96
+
97
+ self.rvt_agent = rvt_agent.RVTAgent(
98
+ network=rvt,
99
+ #image_resolution=self.rlbench_cfg.camera_resolution,
100
+ skill_manager=self.skill_manager,
101
+ visual_aligner=self.visual_aligner,
102
+ stage_two=False,
103
+ add_lang=self.mvt_cfg.add_lang,
104
+ scene_bounds=self.rlbench_cfg.scene_bounds,
105
+ cameras=self.rlbench_cfg.cameras,
106
+ log_dir="/tmp/eval_run",
107
+ **self.exp_cfg.peract,
108
+ **self.exp_cfg.rvt,
109
+
110
+ )
111
+
112
+ self.rvt_agent.build(training, device)
113
+
114
+ def update(self, step: int, replay_sample: dict) -> dict:
115
+ for k, v in replay_sample.items():
116
+ replay_sample[k] = v.unsqueeze(1)
117
+ # RVT is based on the PerAct's Colab version.
118
+ replay_sample["lang_goal_embs"] = replay_sample["lang_token_embs"]
119
+ replay_sample["tasks"] = self.exp_cfg.tasks.split(',')
120
+
121
+ update_dict = self.rvt_agent.update(step, replay_sample)
122
+
123
+
124
+ for key, val in self.rvt_agent.loss_log.items():
125
+ self._summaries[key] = np.mean(np.array(val))
126
+ device = self._device
127
+ rank = device
128
+ if step % 10 == 0 and rank == 0:
129
+ wandb.log({
130
+ 'train/grip_loss': update_dict["grip_loss"],
131
+ 'train/trans_loss': update_dict["trans_loss"],
132
+ 'train/rot_loss': (update_dict["rot_loss_x"]+update_dict["rot_loss_y"]+update_dict["rot_loss_z"]),
133
+ 'train/collision_loss': update_dict["collision_loss"],
134
+ 'train/total_loss': update_dict["total_loss"],
135
+ }, step=step)
136
+ self._wandb_summaries = {
137
+ 'losses/grip_loss': update_dict["grip_loss"],
138
+ 'losses/trans_loss': update_dict["trans_loss"],
139
+ 'losses/rot_loss': (update_dict["rot_loss_x"]+update_dict["rot_loss_y"]+update_dict["rot_loss_z"]),
140
+ 'losses/collision_loss': update_dict["collision_loss"],
141
+ 'losses/total_loss': update_dict["total_loss"],
142
+ }
143
+         return {
+             "total_losses": update_dict["total_loss"],
+         }
148
+
149
+ def act(self, step: int, observation: dict, deterministic: bool) -> ActResult:
150
+ return self.rvt_agent.act(step, observation, deterministic)
151
+
152
+ def reset(self) -> None:
153
+ self.rvt_agent.reset()
154
+
155
+ def update_summaries(self) -> List[Summary]:
156
+ summaries = []
157
+ for k, v in self._summaries.items():
158
+ summaries.append(ScalarSummary(f"RVT/{k}", v))
159
+ return summaries
160
+
161
+ def update_wandb_summaries(self):
162
+ summaries = dict()
163
+
164
+ for k, v in self._wandb_summaries.items():
165
+ summaries[k] = v
166
+ return summaries
167
+
168
+ def act_summaries(self) -> List[Summary]:
169
+ return []
170
+
171
+ def load_weights(self, savedir: str) -> None:
172
+ """
173
+ copied from RVT
174
+ """
175
+ device = torch.device("cuda:0")
176
+ weight_file = os.path.join(savedir, self._checkpoint_filename)
177
+ state_dict = torch.load(weight_file, map_location=device)
178
+
179
+ skill = self.rvt_agent.skill_manager
180
+ visual_aligner = self.rvt_agent.visual_aligner
181
+ model = self.rvt_agent._network
182
+ optimizer = self.rvt_agent._optimizer
183
+ lr_sched = self.rvt_agent._lr_sched
184
+
185
+ if isinstance(model, DDP):
186
+ model = model.module
187
+ model.load_state_dict(state_dict["model_state"])
188
+ optimizer.load_state_dict(state_dict["optimizer_state"])
189
+ lr_sched.load_state_dict(state_dict["lr_sched_state"])
190
+
191
+ return self.rvt_agent.load_clip()
192
+
193
+
194
+ def save_weights(self, savedir: str) -> None:
195
+
196
+ os.makedirs(savedir, exist_ok=True)
197
+ weight_file = os.path.join(savedir, self._checkpoint_filename)
198
+ skill = self.rvt_agent.skill_manager
199
+ visual_aligner = self.rvt_agent.visual_aligner
200
+ model = self.rvt_agent._network
201
+ optimizer = self.rvt_agent._optimizer
202
+ lr_sched = self.rvt_agent._lr_sched
203
+
204
+ if isinstance(model, DDP):
205
+ model = model.module
206
+
207
+ skill_state = skill.state_dict()
208
+ visual_aligner_state = visual_aligner.state_dict()
209
+ model_state = model.state_dict()
210
+
211
+ torch.save(
212
+ {
213
+ "skill_state": skill_state,
214
+ "visual_aligner_state": visual_aligner_state,
215
+ "model_state": model_state,
216
+ "optimizer_state": optimizer.state_dict(),
217
+ "lr_sched_state": lr_sched.state_dict(),
218
+ },
219
+ weight_file,
220
+ )
221
+
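
Given `save_weights` above, a saved checkpoint is a plain dict; a hedged inspection sketch (the path and the `rvt_agent.pt` prefix are placeholders).

import torch

ckpt = torch.load("logs/seed0/weights/10000/rvt_agent.pt", map_location="cpu")
print(sorted(ckpt.keys()))
# ['lr_sched_state', 'model_state', 'optimizer_state',
#  'skill_state', 'visual_aligner_state']
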
third_party/AnyBimanual/agents/rvt/rvt/config.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the NVIDIA Source Code License [see LICENSE for details].
4
+
5
+ from yacs.config import CfgNode as CN
6
+
7
+ _C = CN()
8
+
9
+ _C.agent = "our"
10
+ _C.tasks = "insert_onto_square_peg,open_drawer,place_wine_at_rack_location,light_bulb_in"
11
+ _C.exp_id = "def"
12
+ _C.resume = ""
13
+ # bs per device, effective bs is scaled by num device
14
+ _C.bs = 4
15
+ _C.epochs = 20
16
+ # number of dataloader workers, >= 0
17
+ _C.num_workers = 0
18
+ # 'transition_uniform' or 'task_uniform'
19
+ _C.sample_distribution_mode = 'transition_uniform'
20
+ _C.train_iter = 16 * 10000
21
+
22
+ # arguments present in both peract and rvt
23
+ # some of them donot support every possible combination in peract
24
+ _C.peract = CN()
25
+ _C.peract.lambda_weight_l2 = 1e-6
26
+ # lr should be thought on per sample basis
27
+ # effective lr is multiplied by bs * num_devices
28
+ _C.peract.lr = 2.5e-5
29
+ _C.peract.optimizer_type = "lamb"
30
+ _C.peract.warmup_steps = 0
31
+ _C.peract.lr_cos_dec = False
32
+ _C.peract.add_rgc_loss = True
33
+ _C.peract.num_rotation_classes = 72
34
+ _C.peract.amp = False
35
+ _C.peract.bnb = False
36
+ _C.peract.transform_augmentation = True
37
+ _C.peract.transform_augmentation_xyz = [0.1, 0.1, 0.1]
38
+ _C.peract.transform_augmentation_rpy = [0.0, 0.0, 20.0]
39
+
40
+ # arguments present in only rvt and not peract
41
+ _C.rvt = CN()
42
+ _C.rvt.gt_hm_sigma = 1.5
43
+ _C.rvt.img_aug = 0.1
44
+ _C.rvt.place_with_mean = True
45
+ _C.rvt.move_pc_in_bound = True
46
+
47
+ # arguments present in peract official
48
+ _C.peract_official = CN()
49
+ _C.peract_official.cfg_path = "configs/peract_official_config.yaml"
50
+
51
+
52
+ def get_cfg_defaults():
53
+ """Get a yacs CfgNode object with default values for my_project."""
54
+ return _C.clone()
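
Typical usage of the defaults above; `merge_from_list` and `freeze` are standard yacs `CfgNode` methods, and the override keys come from `_C`.

cfg = get_cfg_defaults()
cfg.merge_from_list(["bs", 8, "peract.lr", 1e-4])  # flat key/value overrides
cfg.freeze()                                       # make the node read-only
print(cfg.bs, cfg.peract.lr, cfg.rvt.img_aug)      # -> 8 0.0001 0.1
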
third_party/AnyBimanual/agents/rvt/rvt/configs/peract_official_config.yaml ADDED
@@ -0,0 +1,127 @@
1
+ # copied from: https://github.com/peract/peract/releases/download/v1.0.0/peract_600k.zip
2
+ method:
3
+ name: PERACT_BC
4
+ lr: 0.0005
5
+ lr_scheduler: false
6
+ num_warmup_steps: 3000
7
+ optimizer: lamb
8
+ activation: lrelu
9
+ norm: None
10
+ lambda_weight_l2: 1.0e-06
11
+ trans_loss_weight: 1.0
12
+ rot_loss_weight: 1.0
13
+ grip_loss_weight: 1.0
14
+ collision_loss_weight: 1.0
15
+ rotation_resolution: 5
16
+ image_crop_size: 64
17
+ bounds_offset:
18
+ - 0.15
19
+ voxel_sizes:
20
+ - 100
21
+ num_latents: 2048
22
+ latent_dim: 512
23
+ transformer_depth: 6
24
+ transformer_iterations: 1
25
+ cross_heads: 1
26
+ cross_dim_head: 64
27
+ latent_heads: 8
28
+ latent_dim_head: 64
29
+ pos_encoding_with_lang: false
30
+ lang_fusion_type: seq
31
+ voxel_patch_size: 5
32
+ voxel_patch_stride: 5
33
+ input_dropout: 0.1
34
+ attn_dropout: 0.1
35
+ decoder_dropout: 0.0
36
+ crop_augmentation: true
37
+ final_dim: 64
38
+ transform_augmentation:
39
+ apply_se3: true
40
+ aug_xyz:
41
+ - 0.125
42
+ - 0.125
43
+ - 0.125
44
+ aug_rpy:
45
+ - 0.0
46
+ - 0.0
47
+ - 0.0
48
+ aug_rot_resolution: 5
49
+ demo_augmentation: true
50
+ demo_augmentation_every_n: 10
51
+ no_skip_connection: false
52
+ no_perceiver: false
53
+ no_language: false
54
+ keypoint_method: heuristic
55
+ ddp:
56
+ master_addr: "localhost"
57
+ master_port: "29500"
58
+ num_devices: 1
59
+ rlbench:
60
+ task_name: multi
61
+ tasks:
62
+ - change_channel
63
+ - close_jar
64
+ - insert_onto_square_peg
65
+ - light_bulb_in
66
+ - meat_off_grill
67
+ - open_drawer
68
+ - place_cups
69
+ - place_shape_in_shape_sorter
70
+ - push_buttons
71
+ - put_groceries_in_cupboard
72
+ - put_item_in_drawer
73
+ - put_money_in_safe
74
+ - reach_and_drag
75
+ - stack_blocks
76
+ - stack_cups
77
+ - turn_tap
78
+ - set_clock_to_time
79
+ - place_wine_at_rack_location
80
+ - put_rubbish_in_color_bin
81
+ - slide_block_to_color_target
82
+ - sweep_to_dustpan_of_size
83
+ demos: 100
84
+ demo_path: /raid/dataset/
85
+ episode_length: 25
86
+ cameras:
87
+ - front
88
+ - left_shoulder
89
+ - right_shoulder
90
+ - wrist
91
+ camera_resolution:
92
+ - 128
93
+ - 128
94
+ scene_bounds:
95
+ - -0.3
96
+ - -0.5
97
+ - 0.6
98
+ - 0.7
99
+ - 0.5
100
+ - 1.6
101
+ include_lang_goal_in_obs: True
102
+ replay:
103
+ batch_size: 16
104
+ timesteps: 1
105
+ prioritisation: false
106
+ task_uniform: true
107
+ use_disk: true
108
+ path: /raid/arm/replay
109
+ max_parallel_processes: 32
110
+ framework:
111
+ log_freq: 100
112
+ save_freq: 10000
113
+ train_envs: 1
114
+ replay_ratio: 16
115
+ transitions_before_train: 200
116
+ tensorboard_logging: true
117
+ csv_logging: true
118
+ training_iterations: 600001
119
+ gpu: 0
120
+ env_gpu: 0
121
+ logdir: /home/user/workspace/logs_may16_n100
122
+ seeds: 1
123
+ start_seed: 0
124
+ load_existing_weights: true
125
+ num_weights_to_keep: 60
126
+ record_every_n: 5
127
+
third_party/AnyBimanual/agents/rvt/rvt/configs/rvt.yaml ADDED
@@ -0,0 +1,15 @@
1
+ exp_id: rvt
2
+ tasks: all
3
+ bs: 3
4
+ num_workers: 3
5
+ epochs: 15
6
+ sample_distribution_mode: task_uniform
7
+ peract:
8
+ lr: 1e-4
9
+ warmup_steps: 2000
10
+ optimizer_type: lamb
11
+ lr_cos_dec: True
12
+ transform_augmentation_xyz: [0.125, 0.125, 0.125]
13
+ transform_augmentation_rpy: [0.0, 0.0, 45.0]
14
+ rvt:
15
+ place_with_mean: False
third_party/AnyBimanual/agents/rvt/rvt/configs/rvt2.yaml ADDED
@@ -0,0 +1,19 @@
1
+ exp_id: rvt2
2
+ tasks: all
3
+ bs: 24
4
+ num_workers: 3
5
+ epochs: 15
6
+ sample_distribution_mode: task_uniform
7
+ peract:
8
+ lr: 1.25e-5
9
+ warmup_steps: 2000
10
+ optimizer_type: lamb
11
+ lr_cos_dec: True
12
+ transform_augmentation_xyz: [0.125, 0.125, 0.125]
13
+ transform_augmentation_rpy: [0.0, 0.0, 45.0]
14
+ amp: True
15
+ bnb: True
16
+ lambda_weight_l2: 1e-4
17
+ rvt:
18
+ place_with_mean: False
19
+ img_aug: 0.0
third_party/AnyBimanual/agents/rvt/rvt/eval.py ADDED
@@ -0,0 +1,556 @@
1
+ # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the NVIDIA Source Code License [see LICENSE for details].
4
+
5
+ import os
6
+ import yaml
7
+ import csv
8
+ import torch
9
+ import cv2
10
+ import shutil
11
+
12
+ import numpy as np
13
+
14
+ from omegaconf import OmegaConf
15
+ from multiprocessing import Value
16
+ from tensorflow.python.summary.summary_iterator import summary_iterator
17
+ from copy import deepcopy
18
+
19
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
20
+ os.environ["BITSANDBYTES_NOWELCOME"] = "1"
21
+
22
+ from rlbench.backend import task as rlbench_task
23
+ from rlbench.backend.utils import task_file_to_task_class
24
+ from rlbench.action_modes.gripper_action_modes import Discrete
25
+ from rlbench.action_modes.action_mode import MoveArmThenGripper
26
+ from yarr.utils.rollout_generator import RolloutGenerator
27
+ from yarr.utils.stat_accumulator import SimpleAccumulator
28
+ from yarr.utils.log_writer import LogWriter
29
+ from yarr.agents.agent import VideoSummary
30
+
31
+ import rvt.mvt.config as default_mvt_cfg
32
+ import rvt.models.rvt_agent as rvt_agent
33
+ import rvt.config as default_exp_cfg
34
+
35
+ from rvt.mvt.mvt import MVT
36
+ from rvt.libs.peract.helpers import utils
37
+ from rvt.utils.custom_rlbench_env import (
38
+ CustomMultiTaskRLBenchEnv2 as CustomMultiTaskRLBenchEnv,
39
+ )
40
+ from rvt.utils.peract_utils import (
41
+ CAMERAS,
42
+ SCENE_BOUNDS,
43
+ IMAGE_SIZE,
44
+ get_official_peract,
45
+ )
46
+ from rvt.utils.rlbench_planning import (
47
+ EndEffectorPoseViaPlanning2 as EndEffectorPoseViaPlanning,
48
+ )
49
+ from rvt.utils.rvt_utils import (
50
+ TensorboardManager,
51
+ get_eval_parser,
52
+ RLBENCH_TASKS,
53
+ )
54
+ from rvt.utils.rvt_utils import load_agent as load_agent_state
55
+
56
+
57
+ def load_agent(
58
+ model_path=None,
59
+ peract_official=False,
60
+ peract_model_dir=None,
61
+ exp_cfg_path=None,
62
+ mvt_cfg_path=None,
63
+ eval_log_dir="",
64
+ device=0,
65
+ use_input_place_with_mean=False,
66
+ ):
67
+ device = f"cuda:{device}"
68
+
69
+ if not (peract_official):
70
+ assert model_path is not None
71
+
72
+ # load exp_cfg
73
+ model_folder = os.path.join(os.path.dirname(model_path))
74
+
75
+ exp_cfg = default_exp_cfg.get_cfg_defaults()
76
+ if exp_cfg_path is not None:
77
+ exp_cfg.merge_from_file(exp_cfg_path)
78
+ else:
79
+ exp_cfg.merge_from_file(os.path.join(model_folder, "exp_cfg.yaml"))
80
+
81
+ # NOTE: to not use place_with_mean in evaluation
82
+ # needed for rvt-1 but not rvt-2
83
+     # for backward compatibility; keep the original value around so the
+     # stage-two (rvt-2) branch below can restore it without a NameError
+     old_place_with_mean = exp_cfg.rvt.place_with_mean
+     if not use_input_place_with_mean:
+         exp_cfg.rvt.place_with_mean = True
87
+
88
+ exp_cfg.freeze()
89
+
90
+ # create agent
91
+ if exp_cfg.agent == "original":
92
+ # initialize PerceiverIO Transformer
93
+ VOXEL_SIZES = [100] # 100x100x100 voxels
94
+ NUM_LATENTS = 512 # PerceiverIO latents
95
+ BATCH_SIZE_TRAIN = 1
96
+ perceiver_encoder = PerceiverIO(
97
+ depth=6,
98
+ iterations=1,
99
+ voxel_size=VOXEL_SIZES[0],
100
+ initial_dim=3 + 3 + 1 + 3,
101
+ low_dim_size=4,
102
+ layer=0,
103
+ num_rotation_classes=72,
104
+ num_grip_classes=2,
105
+ num_collision_classes=2,
106
+ num_latents=NUM_LATENTS,
107
+ latent_dim=512,
108
+ cross_heads=1,
109
+ latent_heads=8,
110
+ cross_dim_head=64,
111
+ latent_dim_head=64,
112
+ weight_tie_layers=False,
113
+ activation="lrelu",
114
+ input_dropout=0.1,
115
+ attn_dropout=0.1,
116
+ decoder_dropout=0.0,
117
+ voxel_patch_size=5,
118
+ voxel_patch_stride=5,
119
+ final_dim=64,
120
+ )
121
+
122
+ # initialize PerceiverActor
123
+ agent = PerceiverActorAgent(
124
+ coordinate_bounds=SCENE_BOUNDS,
125
+ perceiver_encoder=perceiver_encoder,
126
+ camera_names=CAMERAS,
127
+ batch_size=BATCH_SIZE_TRAIN,
128
+ voxel_size=VOXEL_SIZES[0],
129
+ voxel_feature_size=3,
130
+ num_rotation_classes=72,
131
+ rotation_resolution=5,
132
+ image_resolution=[IMAGE_SIZE, IMAGE_SIZE],
133
+ transform_augmentation=False,
134
+ **exp_cfg.peract,
135
+ )
136
+ elif exp_cfg.agent == "our":
137
+ mvt_cfg = default_mvt_cfg.get_cfg_defaults()
138
+ if mvt_cfg_path is not None:
139
+ mvt_cfg.merge_from_file(mvt_cfg_path)
140
+ else:
141
+ mvt_cfg.merge_from_file(os.path.join(model_folder, "mvt_cfg.yaml"))
142
+
143
+ mvt_cfg.freeze()
144
+
145
+ # for rvt-2 we do not change place_with_mean regardless of the arg
146
+ # done this way to ensure backward compatibility and allow the
147
+ # flexibility for rvt-1
148
+ if mvt_cfg.stage_two:
149
+ exp_cfg.defrost()
150
+ exp_cfg.rvt.place_with_mean = old_place_with_mean
151
+ exp_cfg.freeze()
152
+
153
+ rvt = MVT(
154
+ renderer_device=device,
155
+ **mvt_cfg,
156
+ )
157
+
158
+ agent = rvt_agent.RVTAgent(
159
+ network=rvt.to(device),
160
+ image_resolution=[IMAGE_SIZE, IMAGE_SIZE],
161
+ add_lang=mvt_cfg.add_lang,
162
+ stage_two=mvt_cfg.stage_two,
163
+ rot_ver=mvt_cfg.rot_ver,
164
+ scene_bounds=SCENE_BOUNDS,
165
+ cameras=CAMERAS,
166
+ log_dir=f"{eval_log_dir}/eval_run",
167
+ **exp_cfg.peract,
168
+ **exp_cfg.rvt,
169
+ )
170
+ else:
171
+ raise NotImplementedError
172
+
173
+ agent.build(training=False, device=device)
174
+ load_agent_state(model_path, agent)
175
+ agent.eval()
176
+
177
+ elif peract_official: # load official peract model, using the provided code
178
+ try:
179
+ model_folder = os.path.join(os.path.abspath(peract_model_dir), "..", "..")
180
+ train_cfg_path = os.path.join(model_folder, "config.yaml")
181
+ agent = get_official_peract(train_cfg_path, False, device, bs=1)
182
+ except FileNotFoundError:
183
+ print("Config file not found, trying to load again in our format")
184
+ train_cfg_path = "configs/peract_official_config.yaml"
185
+ agent = get_official_peract(train_cfg_path, False, device, bs=1)
186
+ agent.load_weights(peract_model_dir)
187
+ agent.eval()
188
+
189
+ print("Agent Information")
190
+ print(agent)
191
+ return agent
192
+
193
+
194
+ @torch.no_grad()
195
+ def eval(
196
+ agent,
197
+ tasks,
198
+ eval_datafolder,
199
+ start_episode=0,
200
+ eval_episodes=25,
201
+ episode_length=25,
202
+ replay_ground_truth=False,
203
+ device=0,
204
+ headless=True,
205
+ logging=False,
206
+ log_dir=None,
207
+ verbose=True,
208
+ save_video=False,
209
+ ):
210
+ agent.eval()
211
+ if isinstance(agent, rvt_agent.RVTAgent):
212
+ agent.load_clip()
213
+
214
+ camera_resolution = [IMAGE_SIZE, IMAGE_SIZE]
215
+ obs_config = utils.create_obs_config(CAMERAS, camera_resolution, method_name="")
216
+
217
+ gripper_mode = Discrete()
218
+ arm_action_mode = EndEffectorPoseViaPlanning()
219
+ action_mode = MoveArmThenGripper(arm_action_mode, gripper_mode)
220
+
221
+ task_files = [
222
+ t.replace(".py", "")
223
+ for t in os.listdir(rlbench_task.TASKS_PATH)
224
+ if t != "__init__.py" and t.endswith(".py")
225
+ ]
226
+
227
+ task_classes = []
228
+ if tasks[0] == "all":
229
+ tasks = RLBENCH_TASKS
230
+ if verbose:
231
+ print(f"evaluate on {len(tasks)} tasks: ", tasks)
232
+
233
+ for task in tasks:
234
+ if task not in task_files:
235
+ raise ValueError("Task %s not recognised!." % task)
236
+ task_classes.append(task_file_to_task_class(task))
237
+
238
+ eval_env = CustomMultiTaskRLBenchEnv(
239
+ task_classes=task_classes,
240
+ observation_config=obs_config,
241
+ action_mode=action_mode,
242
+ dataset_root=eval_datafolder,
243
+ episode_length=episode_length,
244
+ headless=headless,
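+ # run all eval episodes of the current task before swapping to the next one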
245
+ swap_task_every=eval_episodes,
246
+ include_lang_goal_in_obs=True,
247
+ time_in_state=True,
248
+ record_every_n=1 if save_video else -1,
249
+ )
250
+
251
+ eval_env.eval = True
252
+
253
+ device = f"cuda:{device}"
254
+
255
+ if logging:
256
+ assert log_dir is not None
257
+
258
+ # create metric saving writer
259
+ csv_file = "eval_results.csv"
260
+ if not os.path.exists(os.path.join(log_dir, csv_file)):
261
+ with open(os.path.join(log_dir, csv_file), "w") as csv_fp:
262
+ fieldnames = ["task", "success rate", "length", "total_transitions"]
263
+ csv_writer = csv.DictWriter(csv_fp, fieldnames=fieldnames)
264
+ csv_writer.writeheader()
265
+
266
+ # evaluate agent
267
+ rollout_generator = RolloutGenerator(device)
268
+ stats_accumulator = SimpleAccumulator(eval_video_fps=30)
269
+
270
+ eval_env.launch()
271
+
272
+ current_task_id = -1
273
+
274
+ num_tasks = len(tasks)
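+ # shared integer step counter expected by the rollout generator (stays at -1 during eval)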
275
+ step_signal = Value("i", -1)
276
+
277
+ scores = []
278
+ for task_id in range(num_tasks):
279
+ task_rewards = []
280
+ for ep in range(start_episode, start_episode + eval_episodes):
281
+ episode_rollout = []
282
+ generator = rollout_generator.generator(
283
+ step_signal=step_signal,
284
+ env=eval_env,
285
+ agent=agent,
286
+ episode_length=episode_length,
287
+ timesteps=1,
288
+ eval=True,
289
+ eval_demo_seed=ep,
290
+ record_enabled=False,
291
+ replay_ground_truth=replay_ground_truth,
292
+ )
293
+ try:
294
+ for replay_transition in generator:
295
+ episode_rollout.append(replay_transition)
296
+ except StopIteration:
297
+ continue
298
+ except Exception as e:
299
+ eval_env.shutdown()
300
+ raise e
301
+
302
+ for transition in episode_rollout:
303
+ stats_accumulator.step(transition, True)
304
+ current_task_id = transition.info["active_task_id"]
305
+ assert current_task_id == task_id
306
+
307
+ task_name = tasks[task_id]
308
+ reward = episode_rollout[-1].reward
309
+ task_rewards.append(reward)
310
+ lang_goal = eval_env._lang_goal
311
+ if verbose:
312
+ print(
313
+ f"Evaluating {task_name} | Episode {ep} | Score: {reward} | Episode Length: {len(episode_rollout)} | Lang Goal: {lang_goal}"
314
+ )
315
+
316
+ # report summaries
317
+ summaries = []
318
+ summaries.extend(stats_accumulator.pop())
319
+ task_name = tasks[task_id]
320
+ if logging:
321
+ # write csv results first
322
+ with open(os.path.join(log_dir, csv_file), "a") as csv_fp:
323
+ fieldnames = ["task", "success rate", "length", "total_transitions"]
324
+ csv_writer = csv.DictWriter(csv_fp, fieldnames=fieldnames)
325
+ csv_results = {"task": task_name}
326
+ for s in summaries:
327
+ if s.name == "eval_envs/return":
328
+ csv_results["success rate"] = s.value
329
+ elif s.name == "eval_envs/length":
330
+ csv_results["length"] = s.value
331
+ elif s.name == "eval_envs/total_transitions":
332
+ csv_results["total_transitions"] = s.value
333
+ if "eval" in s.name:
334
+ s.name = "%s/%s" % (s.name, task_name)
335
+ csv_writer.writerow(csv_results)
336
+ else:
337
+ for s in summaries:
338
+ if "eval" in s.name:
339
+ s.name = "%s/%s" % (s.name, task_name)
340
+
341
+ if len(summaries) > 0:
342
+ task_score = [
343
+ s.value for s in summaries if f"eval_envs/return/{task_name}" in s.name
344
+ ][0]
345
+ else:
346
+ task_score = "unknown"
347
+
348
+ print(f"[Evaluation] Finished {task_name} | Final Score: {task_score}\n")
349
+
350
+ scores.append(task_score)
351
+
352
+ if save_video:
353
+ video_image_folder = "./tmp"
354
+ record_fps = 25
355
+ record_folder = os.path.join(log_dir, "videos")
356
+ os.makedirs(record_folder, exist_ok=True)
357
+ video_success_cnt = 0
358
+ video_fail_cnt = 0
359
+ video_cnt = 0
360
+ for summary in summaries:
361
+ if isinstance(summary, VideoSummary):
362
+ video = deepcopy(summary.value)
363
+ video = np.transpose(video, (0, 2, 3, 1))
364
+ video = video[:, :, :, ::-1]
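+ # channels were reversed above (RGB -> BGR) for cv2.imwrite below; a final
+ # reward above 99 (success reward is 100) marks a successful episode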
365
+ if task_rewards[video_cnt] > 99:
366
+ video_path = os.path.join(
367
+ record_folder,
368
+ f"{task_name}_success_{video_success_cnt}.mp4",
369
+ )
370
+ video_success_cnt += 1
371
+ else:
372
+ video_path = os.path.join(
373
+ record_folder, f"{task_name}_fail_{video_fail_cnt}.mp4"
374
+ )
375
+ video_fail_cnt += 1
376
+ video_cnt += 1
377
+ os.makedirs(video_image_folder, exist_ok=True)
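+ # dump all but the last 10 frames as numbered pngs for ffmpeg to encode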
378
+ for idx in range(len(video) - 10):
379
+ cv2.imwrite(
380
+ os.path.join(video_image_folder, f"{idx}.png"), video[idx]
381
+ )
382
+ images_path = os.path.join(video_image_folder, r"%d.png")
383
+ os.system(
384
+ "ffmpeg -i {} -vf palettegen palette.png -hide_banner -loglevel error".format(
385
+ images_path
386
+ )
387
+ )
388
+ os.system(
389
+ "ffmpeg -framerate {} -i {} -i palette.png -lavfi paletteuse {} -hide_banner -loglevel error".format(
390
+ record_fps, images_path, video_path
391
+ )
392
+ )
393
+ os.remove("palette.png")
394
+ shutil.rmtree(video_image_folder)
395
+
396
+ eval_env.shutdown()
397
+
398
+ # the per-task csv file is opened and closed via `with` above,
399
+ # so no explicit close is needed here
400
+
401
+ # set agent back to train mode
402
+ agent.train()
403
+
404
+ # unload CLIP to free memory
405
+ if isinstance(agent, rvt_agent.RVTAgent):
406
+ agent.unload_clip()
407
+ agent._network.free_mem()
408
+
409
+ return scores
410
+
411
+
412
+ def get_model_index(filename):
413
+ """
414
+ :param filename: path of a file of the form /.../model_<idx>.pth
415
+ :return: idx or None
416
+ """
417
+ if len(filename) >= 9 and filename[-4:] == ".pth":
418
+ try:
419
+ index = int(filename[:-4].split("_")[-1])
420
+ except ValueError:
421
+ index = None
422
+ else:
423
+ index = None
424
+ return index
425
+
426
+
427
+ def _eval(args):
428
+
429
+ model_paths = []
430
+ if not (args.peract_official):
431
+ assert args.model_name is not None
432
+ model_paths.append(os.path.join(args.model_folder, args.model_name))
433
+ else:
434
+ model_paths.append(None)
435
+
436
+ # skipping evaluated models
437
+ if args.skip:
438
+ """
439
+ to_skip: {
440
+ 0: {'light_bulb_in': False, .....}
441
+ 1: {'light_bulb_in': False, .....}
442
+ .
443
+ .
444
+ }
445
+ """
446
+ to_skip = {
447
+ get_model_index(x): {y: False for y in args.tasks} for x in model_paths
448
+ }
449
+
450
+ filenames = os.listdir(args.eval_log_dir)
451
+ for filename in filenames:
452
+ if not filename.startswith("events.out.tfevents."):
453
+ continue
454
+ summ = summary_iterator(f"{args.eval_log_dir}/{filename}")
455
+ # skipping the time log of the summary
456
+ try:
457
+ next(summ)
458
+ except Exception:
459
+ # moving to the next file
460
+ continue
461
+ for cur_summ in summ:
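+ # tags are assumed to be of the form "eval/<task_name>"; [5:] strips the "eval/" prefix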
462
+ cur_task = cur_summ.summary.value[0].tag[5:]
463
+ cur_step = cur_summ.step
464
+ if cur_step in to_skip:
465
+ to_skip[cur_step][cur_task] = True
466
+
467
+ tb = TensorboardManager(args.eval_log_dir)
468
+ for model_path in model_paths:
469
+ tasks_to_eval = deepcopy(args.tasks)
470
+
471
+ if args.peract_official:
472
+ model_idx = 0
473
+ else:
474
+ model_idx = get_model_index(model_path)
475
+ if model_idx is None:
476
+ model_idx = 0
477
+
478
+ if args.skip:
479
+ for _task in args.tasks:
480
+ if to_skip[model_idx][_task]:
481
+ tasks_to_eval.remove(_task)
482
+
483
+ if len(tasks_to_eval) == 0:
484
+ print(f"Skipping model_idx={model_idx} for args.tasks={args.tasks}")
485
+ continue
486
+
487
+ if not (args.peract_official):
488
+ agent = load_agent(
489
+ model_path=model_path,
490
+ exp_cfg_path=args.exp_cfg_path,
491
+ mvt_cfg_path=args.mvt_cfg_path,
492
+ eval_log_dir=args.eval_log_dir,
493
+ device=args.device,
494
+ use_input_place_with_mean=args.use_input_place_with_mean,
495
+ )
496
+
497
+ agent_eval_log_dir = os.path.join(
498
+ args.eval_log_dir, os.path.basename(model_path).split(".")[0]
499
+ )
500
+ else:
501
+ agent = load_agent(
502
+ peract_official=args.peract_official,
503
+ peract_model_dir=args.peract_model_dir,
504
+ device=args.device,
505
+ use_input_place_with_mean=args.use_input_place_with_mean,
506
+ )
507
+ agent_eval_log_dir = os.path.join(args.eval_log_dir, "final")
508
+
509
+ os.makedirs(agent_eval_log_dir, exist_ok=True)
510
+ scores = eval(
511
+ agent=agent,
512
+ tasks=tasks_to_eval,
513
+ eval_datafolder=args.eval_datafolder,
514
+ start_episode=args.start_episode,
515
+ eval_episodes=args.eval_episodes,
516
+ episode_length=args.episode_length,
517
+ replay_ground_truth=args.ground_truth,
518
+ device=args.device,
519
+ headless=args.headless,
520
+ logging=True,
521
+ log_dir=agent_eval_log_dir,
522
+ verbose=True,
523
+ save_video=args.save_video,
524
+ )
525
+ print(f"model {model_path}, scores {scores}")
526
+ task_scores = {}
527
+ for i in range(len(tasks_to_eval)):
528
+ task_scores[tasks_to_eval[i]] = scores[i]
529
+
530
+ print("save ", task_scores)
531
+ tb.update("eval", model_idx, task_scores)
532
+ tb.writer.flush()
533
+
534
+ tb.close()
535
+
536
+
537
+ if __name__ == "__main__":
538
+ parser = get_eval_parser()
539
+
540
+ args = parser.parse_args()
541
+
542
+ if args.log_name is None:
543
+ args.log_name = "none"
544
+
545
+ if not (args.peract_official):
546
+ args.eval_log_dir = os.path.join(args.model_folder, "eval", args.log_name)
547
+ else:
548
+ args.eval_log_dir = os.path.join(args.peract_model_dir, "eval", args.log_name)
549
+
550
+ os.makedirs(args.eval_log_dir, exist_ok=True)
551
+
552
+ # save the arguments for future reference
553
+ with open(os.path.join(args.eval_log_dir, "eval_config.yaml"), "w") as fp:
554
+ yaml.dump(args.__dict__, fp)
555
+
556
+ _eval(args)
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/.gitattributes ADDED
@@ -0,0 +1 @@
1
+ pcd_data.tar.gz filter=lfs diff=lfs merge=lfs -text
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ build/*
2
+ *.egg-info/*
3
+ *.so
4
+ */__pycache__/*
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/LICENSE ADDED
@@ -0,0 +1,97 @@
1
+ Copyright (c) 2022-2023, NVIDIA Corporation & affiliates. All rights reserved.
2
+
3
+
4
+ NVIDIA Source Code License for instant neural graphics primitives
5
+
6
+
7
+ =======================================================================
8
+
9
+ 1. Definitions
10
+
11
+ "Licensor" means any person or entity that distributes its Work.
12
+
13
+ "Software" means the original work of authorship made available under
14
+ this License.
15
+
16
+ "Work" means the Software and any additions to or derivative works of
17
+ the Software that are made available under this License.
18
+
19
+ The terms "reproduce," "reproduction," "derivative works," and
20
+ "distribution" have the meaning as provided under U.S. copyright law;
21
+ provided, however, that for the purposes of this License, derivative
22
+ works shall not include works that remain separable from, or merely
23
+ link (or bind by name) to the interfaces of, the Work.
24
+
25
+ Works, including the Software, are "made available" under this License
26
+ by including in or with the Work either (a) a copyright notice
27
+ referencing the applicability of this License to the Work, or (b) a
28
+ copy of this License.
29
+
30
+ 2. License Grants
31
+
32
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
33
+ License, each Licensor grants to you a perpetual, worldwide,
34
+ non-exclusive, royalty-free, copyright license to reproduce,
35
+ prepare derivative works of, publicly display, publicly perform,
36
+ sublicense and distribute its Work and any resulting derivative
37
+ works in any form.
38
+
39
+ 3. Limitations
40
+
41
+ 3.1 Redistribution. You may reproduce or distribute the Work only
42
+ if (a) you do so under this License, (b) you include a complete
43
+ copy of this License with your distribution, and (c) you retain
44
+ without modification any copyright, patent, trademark, or
45
+ attribution notices that are present in the Work.
46
+
47
+ 3.2 Derivative Works. You may specify that additional or different
48
+ terms apply to the use, reproduction, and distribution of your
49
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
50
+ provide that the use limitation in Section 3.3 applies to your
51
+ derivative works, and (b) you identify the specific derivative
52
+ works that are subject to Your Terms. Notwithstanding Your Terms,
53
+ this License (including the redistribution requirements in Section
54
+ 3.1) will continue to apply to the Work itself.
55
+
56
+ 3.3 Use Limitation. The Work and any derivative works thereof only
57
+ may be used or intended for use non-commercially. Notwithstanding
58
+ the foregoing, NVIDIA and its affiliates may use the Work and any
59
+ derivative works commercially. As used herein, "non-commercially"
60
+ means for research or evaluation purposes only.
61
+
62
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
+ against any Licensor (including any claim, cross-claim or
64
+ counterclaim in a lawsuit) to enforce any patents that you allege
65
+ are infringed by any Work, then your rights under this License from
66
+ such Licensor (including the grant in Section 2.1) will terminate
67
+ immediately.
68
+
69
+ 3.5 Trademarks. This License does not grant any rights to use any
70
+ Licensor’s or its affiliates’ names, logos, or trademarks, except
71
+ as necessary to reproduce the notices described in this License.
72
+
73
+ 3.6 Termination. If you violate any term of this License, then your
74
+ rights under this License (including the grant in Section 2.1) will
75
+ terminate immediately.
76
+
77
+ 4. Disclaimer of Warranty.
78
+
79
+ THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
+ THIS LICENSE.
84
+
85
+ 5. Limitation of Liability.
86
+
87
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
+ COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
+ THE POSSIBILITY OF SUCH DAMAGES.
96
+
97
+ =======================================================================
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Point Renderer
2
+ A minimal, lightweight CUDA-accelerated renderer of pointclouds.
3
+
4
+ <div align="center"><img src="demo.png"/></div>
5
+
6
+ ### Install
7
+
8
+ ```
9
+ pip install -r requirements.txt
10
+ pip install -e .
11
+ ```
12
+
13
+ ### Run
14
+
15
+ **Load Data**
16
+ Extract included pcd_data.tar.gz
17
+
18
+ ```
19
+ import numpy as np
20
+
21
+ data = np.load("pcd_data/w1280_h720/3.npy", allow_pickle=True)
22
+ data = data[None][0]
23
+ pc = data["pc"]
24
+ rgb = data["img_feat"]
25
+ ```
26
+
27
+ **Render the image**
28
+
29
+ ```
30
+ # Make the renderer
31
+ from point_renderer.renderer import PointRenderer
32
+ renderer = PointRenderer(device="cuda", perf_timer=False)
33
+
34
+ # Define a batch of cameras
35
+ img_size = (512, 512)
36
+ K = renderer.get_camera_intrinsics(hfov=70, img_size=img_size)
37
+ camera_poses = renderer.get_batch_of_camera_poses(
38
+ cam_positions=[[1.5, 1.5, 1.5],[-1.5, -1.5, -1.5]],
39
+ cam_lookats=[[0.0, 0.0, 0.0],[0.0, 0.0, 0.0]])
40
+
41
+ # Render the pointcloud from the given cameras
42
+ images, depths = renderer.render_batch(pc, rgb, camera_poses, K, img_size,
43
+ default_color=1.0,
44
+ splat_radius=0.005,
45
+ aa_factor=2
46
+ )
47
+
48
+ # Show the results
49
+ plt.imshow(images[0].detach().cpu().numpy()); plt.show()
50
+ plt.imshow(depths[0].detach().cpu().numpy()); plt.show()
51
+ plt.imshow(images[1].detach().cpu().numpy()); plt.show()
52
+ plt.imshow(depths[1].detach().cpu().numpy()); plt.show()
53
+ ```
54
+
55
+ .. Or run the jupyter notebook that has this same code above, and also all the benchmarks.
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/demo.png ADDED

Git LFS Details

  • SHA256: 67abe7c267d53a8a62e8a383412032dcedbc6373162309ba5652031cfc780a7d
  • Pointer size: 131 Bytes
  • Size of remote file: 253 kB
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/image_0_splat_2xaa.png ADDED
third_party/AnyBimanual/agents/rvt/rvt/libs/point-renderer/point_renderer/cameras.py ADDED
@@ -0,0 +1,119 @@
1
+ import torch
2
+
3
+ from point_renderer import ops
4
+ from functools import lru_cache
5
+
6
+ @lru_cache(maxsize=32)
7
+ def linalg_inv(poses):
8
+ return torch.linalg.inv(poses)
9
+
10
+ class Cameras:
11
+ def __init__(self, poses, intrinsics, img_size, inv_poses=None):
12
+ self.poses = poses
13
+ self.img_size = img_size
14
+ if inv_poses is None:
15
+ self.inv_poses = linalg_inv(poses)
16
+ else:
17
+ self.inv_poses = inv_poses
18
+ self.intrinsics = intrinsics
19
+
20
+ def __len__(self):
21
+ return len(self.poses)
22
+
23
+ def scale(self, constant):
24
+ self.intrinsics = self.intrinsics.clone()
25
+ self.intrinsics[:, :2, :3] *= constant
26
+
27
+ def is_orthographic(self):
28
+ raise ValueError("is_orthographic should be called on child classes only")
29
+
30
+ def is_perspective(self):
31
+ raise ValueError("is_perspective should be called on child classes only")
32
+
33
+
34
+ class PerspectiveCameras(Cameras):
35
+ def __init__(self, poses, intrinsics, img_size, inv_poses=None):
36
+ super().__init__(poses, intrinsics, img_size, inv_poses)
37
+
38
+ @classmethod
39
+ def from_lookat(cls, eyes, ats, ups, hfov, img_size, device="cpu"):
40
+ cam_poses = []
41
+ for eye, at, up in zip(eyes, ats, ups):
42
+ T = ops.lookat_to_cam_pose(eye, at, up, device=device)
43
+ cam_poses.append(T)
44
+ cam_poses = torch.stack(cam_poses, dim=0)
45
+ intrinsics = ops.fov_and_size_to_intrinsics(hfov, img_size, device=device)
46
+ intrinsics = intrinsics[None, :, :].repeat((cam_poses.shape[0], 1, 1)).contiguous()
47
+ return PerspectiveCameras(cam_poses, intrinsics, img_size)
48
+
49
+ @classmethod
50
+ def from_rotation_and_translation(cls, R, T, S, hfov, img_size):
51
+ device = R.device
52
+ assert T.device == device
53
+ cam_poses = torch.zeros((R.shape[0], 4, 4), device=device, dtype=torch.float)
54
+ cam_poses[:, :3, :3] = R * S[None, :]
55
+ cam_poses[:, :3, 3] = T
56
+ cam_poses[:, 3, 3] = 1.0
57
+ intrinsics = ops.fov_and_size_to_intrinsics(hfov, img_size, device=device)
58
+ intrinsics = intrinsics[None, :, :].repeat((cam_poses.shape[0], 1, 1)).contiguous()
59
+ return PerspectiveCameras(cam_poses, intrinsics, img_size)
60
+
61
+ def to(self, device):
62
+ return PerspectiveCameras(self.poses.to(device), self.intrinsics.to(device), self.inv_poses.to(device))
63
+
64
+ def is_orthographic(self):
65
+ return False
66
+
67
+ def is_perspective(self):
68
+ return True
69
+
70
+ class OrthographicCameras(Cameras):
71
+ def __init__(self, poses, intrinsics, img_size, inv_poses=None):
72
+ super().__init__(poses, intrinsics, img_size, inv_poses)
73
+
74
+ @classmethod
75
+ def from_lookat(cls, eyes, ats, ups, img_sizes_w, img_size_px, device="cpu"):
76
+ """
77
+ Args:
78
+ eyes: Nx3 tensor of camera coordinates
79
+ ats: Nx3 tensor of look-at directions
80
+ ups: Nx3 tensor of up-vectors
81
+ scale: Nx2 tensor defining image sizes in world coordinates
82
+ img_size: 2-dim tuple defining image size in pixels
83
+ Returns:
84
+ OrthographicCamera
85
+ """
86
+ if isinstance(img_sizes_w, list):
87
+ img_sizes_w = torch.tensor(img_sizes_w, device=device)[None, :].repeat((len(eyes), 1))
88
+
89
+ cam_poses = []
90
+ for eye, at, up in zip(eyes, ats, ups):
91
+ T = ops.lookat_to_cam_pose(eye, at, up, device=device)
92
+ cam_poses.append(T)
93
+ cam_poses = torch.stack(cam_poses, dim=0)
94
+ intrinsics = ops.orthographic_intrinsics_from_scales(img_sizes_w, img_size_px, device=device)
95
+ return OrthographicCameras(cam_poses, intrinsics, img_size_px)
96
+
97
+ @classmethod
98
+ def from_rotation_and_translation(cls, R, T, img_sizes_w, img_size_px, device="cpu"):
99
+ if isinstance(img_sizes_w, list):
100
+ img_sizes_w = torch.tensor(img_sizes_w, device=device)[None, :].repeat((len(R), 1))
101
+
102
+ device = R.device
103
+ assert T.device == device
104
+ cam_poses = torch.zeros((R.shape[0], 4, 4), device=device, dtype=torch.float)
105
+ cam_poses[:, :3, :3] = R
106
+ cam_poses[:, :3, 3] = T
107
+ cam_poses[:, 3, 3] = 1.0
108
+ intrinsics = ops.orthographic_intrinsics_from_scales(img_sizes_w, img_size_px, device=device)
109
+ intrinsics = intrinsics[None, :, :].repeat((cam_poses.shape[0], 1, 1)).contiguous()
110
+ return OrthographicCameras(cam_poses, intrinsics, img_size_px)
111
+
112
+ def to(self, device):
113
+ return OrthographicCameras(self.poses.to(device), self.intrinsics.to(device), self.inv_poses.to(device))
114
+
115
+ def is_orthographic(self):
116
+ return True
117
+
118
+ def is_perspective(self):
119
+ return False