diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000000000000000000000000000000000000..f5b44d40df57bd3a4bb9fef9ae896fa82b6f078d
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,9 @@
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+AWS_REGION=
+GITHUB_OWNER=
+GITHUB_REPO=
+GITHUB_TOKEN=
+PROJECT_NAME=
+# optional
+OPENAI_API_KEY=
diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..0ff800acd34ba0e9345a2efb55e1fff8be94bfc1 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/method2_xyz.png filter=lfs diff=lfs merge=lfs -text
+assets/som_gpt4v_demo.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+ops/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1dc75c44d8a3668b2dde1b08ebf2d5e05dbc9dc8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+*.sw[m-p]
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9ba8cf65f3e3104dd061c178066ec8247811f33
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,9 @@
+# Microsoft Open Source Code of Conduct
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+
+Resources:
+
+- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..827f579ebdc6d179e56da951e95efb93203517a8
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
+
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y \
+      python3-pip python3-dev git ninja-build wget \
+      ffmpeg libsm6 libxext6 \
+      openmpi-bin libopenmpi-dev && \
+    ln -sf /usr/bin/python3 /usr/bin/python && \
+    ln -sf /usr/bin/pip3 /usr/bin/pip
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+ENV FORCE_CUDA=1
+
+# Upgrade pip
+RUN python -m pip install --upgrade pip
+
+# Install Python dependencies
+RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu123 \
+    && pip install git+https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git@33f2c898fdc8d7c95dda014a4b9ebe4e413dbb2b \
+    && pip install git+https://github.com/facebookresearch/segment-anything.git \
+    && pip install git+https://github.com/UX-Decoder/Semantic-SAM.git@package \
+    && cd ops && bash make.sh && cd .. \
+    && pip install mpi4py \
+    && pip install openai \
+    && pip install gradio==4.17.0
+
+# Download pretrained models
+RUN sh download_ckpt.sh
+
+# Make port 6092 available to the world outside this container
+EXPOSE 6092
+
+# Make Gradio server accessible outside 127.0.0.1
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+RUN chmod +x /usr/src/app/entrypoint.sh
+CMD ["/usr/src/app/entrypoint.sh"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9e841e7a26e4eb057b24511e7b92d42b257a80e5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+    MIT License
+
+    Copyright (c) Microsoft Corporation.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
diff --git a/README.md b/README.md
index 621dc17ffb3c189abde71b44d5344203b20707d0..e49a8271315f49ff63e535b0d31413d70d2a6d19 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,170 @@
 ---
 title: SoM
-emoji: 🏃
-colorFrom: green
-colorTo: pink
+app_file: demo_som.py
 sdk: gradio
-sdk_version: 4.27.0
-app_file: app.py
-pinned: false
+sdk_version: 4.17.0
 ---
+# <img src="assets/som_logo.png" alt="Logo" width="40" height="40" align="left"> Set-of-Mark Visual Prompting for GPT-4V
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+:grapes: \[[Read our arXiv Paper](https://arxiv.org/pdf/2310.11441.pdf)\] &nbsp; :apple: \[[Project Page](https://som-gpt4v.github.io/)\] 
+
+[Jianwei Yang](https://jwyang.github.io/)\*⚑, [Hao Zhang](https://haozhang534.github.io/)\*, [Feng Li](https://fengli-ust.github.io/)\*, [Xueyan Zou](https://maureenzou.github.io/)\*, [Chunyuan Li](https://chunyuan.li/), [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/)
+
+\* Core Contributors &nbsp;&nbsp;&nbsp;&nbsp; ⚑ Project Lead
+
+### Introduction
+
+We present **S**et-**o**f-**M**ark (SoM) prompting, simply overlaying a number of spatial and speakable marks on the images, to unleash the visual grounding abilities in the strongest LMM -- GPT-4V. **Let's use visual prompting for vision**!
+
+![method2_xyz](https://github.com/microsoft/SoM/assets/34880758/32a269c4-8465-4eaf-aa90-48e9534649d9)
+
+
+### GPT-4V + SoM Demo
+
+https://github.com/microsoft/SoM/assets/3894247/8f827871-7ebd-4a5e-bef5-861516c4427b
+
+### 🔥 News
+
+* [11/21] Thanks to Roboflow and @SkalskiP, a [huggingface demo](https://huggingface.co/spaces/Roboflow/SoM) for SoM + GPT-4V is online! Try it out!
+* [11/07] We released the vision benchmark we used to evaluate GPT-4V with SoM prompting! Check out the [benchmark page](https://github.com/microsoft/SoM/tree/main/benchmark)!
+
+* [11/07] Now that GPT-4V API has been released, we are releasing a demo integrating SoM into GPT-4V! 
+```bash
+export OPENAI_API_KEY=YOUR_API_KEY
+python demo_gpt4v_som.py
+```
+
+* [10/23] We released the SoM toolbox code for generating set-of-mark prompts for GPT-4V. Try it out!
+
+### 🔗 Fascinating Applications
+
+Fascinating applications of SoM in GPT-4V:
+* [11/13/2023] [Smartphone GUI Navigation boosted by Set-of-Mark Prompting](https://github.com/zzxslp/MM-Navigator)
+* [11/05/2023] [Zero-shot Anomaly Detection with GPT-4V and SoM prompting](https://github.com/zhangzjn/GPT-4V-AD)
+* [10/21/2023] [Web UI Navigation Agent inspired by Set-of-Mark Prompting](https://github.com/ddupont808/GPT-4V-Act)
+* [10/20/2023] [Set-of-Mark Prompting Reimplementation by @SkalskiP from Roboflow](https://github.com/SkalskiP/SoM.git)
+
+### 🔗 Related Works
+
+Our method compiles the following models to generate the set of marks:
+
+- [Mask DINO](https://github.com/IDEA-Research/MaskDINO): State-of-the-art closed-set image segmentation model
+- [OpenSeeD](https://github.com/IDEA-Research/OpenSeeD): State-of-the-art open-vocabulary image segmentation model
+- [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO): State-of-the-art open-vocabulary object detection model
+- [SEEM](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once): Versatile, promptable, interactive and semantic-aware segmentation model
+- [Semantic-SAM](https://github.com/UX-Decoder/Semantic-SAM): Segment and recognize anything at any granularity
+- [Segment Anything](https://github.com/facebookresearch/segment-anything): Segment anything
+
+We are standing on the shoulders of the giant GPT-4V ([playground](https://chat.openai.com/))!
+
+### :rocket: Quick Start
+
+* Install segmentation packages
+
+```bash
+# install SEEM
+pip install git+https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git@package
+# install SAM
+pip install git+https://github.com/facebookresearch/segment-anything.git
+# install Semantic-SAM
+pip install git+https://github.com/UX-Decoder/Semantic-SAM.git@package
+# install Deformable Convolution for Semantic-SAM
+cd ops && sh make.sh && cd ..
+
+# common error fix:
+python -m pip install 'git+https://github.com/MaureenZOU/detectron2-xyz.git'
+```
+
+* Download the pretrained models
+
+```bash
+sh download_ckpt.sh
+```
+
+* Run the demo
+
+```bash
+python demo_som.py
+```
+
+And you will see this interface:
+
+![som_toolbox](assets/som_toolbox_interface.jpg)
+
+## Deploy to AWS
+
+To deploy SoM to EC2 on AWS via GitHub Actions:
+
+1. Fork this repository and clone your fork to your local machine.
+2. Follow the instructions at the top of `deploy.py`.
+
+### :point_right: Comparing standard GPT-4V and its combination with SoM Prompting
+![teaser_github](https://github.com/microsoft/SoM/assets/11957155/e4720105-b4b2-40c0-9303-2d8f1cb27d91)
+
+### :round_pushpin: SoM Toolbox for image partition
+![method3_xyz](https://github.com/microsoft/SoM/assets/34880758/2443572b-995a-4f29-95df-3e3fc0f510d6)
+Users can select which granularity of masks to generate, and which mode to use between automatic (top) and interactive (bottom). A higher alpha blending value (0.4) is used for better visualization.
+### :unicorn: Interleaved Prompt
+SoM enables interleaved prompts which include textual and visual content. The visual content can be represented using the region indices.
+<img width="975" alt="Screenshot 2023-10-18 at 10 06 18" src="https://github.com/microsoft/SoM/assets/34880758/859edfda-ab04-450c-bd28-93762460ac1d">
+
+### :medal_military: Mark types used in SoM
+![method4_xyz](https://github.com/microsoft/SoM/assets/34880758/a9cddc47-f975-4991-b35a-72c50813c092)
+### :volcano: Evaluation tasks examples
+<img width="946" alt="Screenshot 2023-10-18 at 10 12 18" src="https://github.com/microsoft/SoM/assets/34880758/f5e0c0b0-58de-4b60-bf01-4906dbcb229e">
+
+## Use case
+### :tulip: Grounded Reasoning and Cross-Image Reference
+
+<img width="972" alt="Screenshot 2023-10-18 at 10 10 41" src="https://github.com/microsoft/SoM/assets/34880758/033cd16c-876c-4c03-961e-590a4189bc9e">
+
+In comparison to GPT-4V without SoM, adding marks enables GPT-4V to ground the
+reasoning on detailed contents of the image (Left). Clear object cross-image references are observed
+on the right.
+
+### :camping: Problem Solving
+<img width="972" alt="Screenshot 2023-10-18 at 10 18 03" src="https://github.com/microsoft/SoM/assets/34880758/8b112126-d164-47d7-b18c-b4b51b903d57">
+
+Case study on solving CAPTCHA. GPT-4V gives the wrong answer with a wrong number
+of squares while finding the correct squares with corresponding marks after SoM prompting.
+### :mountain_snow: Knowledge Sharing
+<img width="733" alt="Screenshot 2023-10-18 at 10 18 44" src="https://github.com/microsoft/SoM/assets/34880758/dc753c3f-ada8-47a4-83f1-1576bcfb146a">
+
+Case study on an image of dish for GPT-4V. GPT-4V does not produce a grounded answer
+with the original image. Based on SoM prompting, GPT-4V not only speaks out the ingredients but
+also corresponds them to the regions.
+### :mosque: Personalized Suggestion
+<img width="733" alt="Screenshot 2023-10-18 at 10 19 12" src="https://github.com/microsoft/SoM/assets/34880758/88188c90-84f2-49c6-812e-44770b0c2ca5">
+
+SoM-prompted GPT-4V gives very precise suggestions while the original one fails, even
+with hallucinated foods, e.g., soft drinks.
+### :blossom: Tool Usage Instruction
+<img width="734" alt="Screenshot 2023-10-18 at 10 19 39" src="https://github.com/microsoft/SoM/assets/34880758/9b35b143-96af-41bd-ad83-9c1f1e0f322f">
+Likewise, GPT-4V with SoM can help to provide thorough tool usage instruction,
+teaching
+users the function of each button on a controller. Note that this image is not fully labeled, while
+GPT-4V can also provide information about the non-labeled buttons.
+
+### :sunflower: 2D Game Planning
+<img width="730" alt="Screenshot 2023-10-18 at 10 20 03" src="https://github.com/microsoft/SoM/assets/34880758/0bc86109-5512-4dee-aac9-bab0ef96ed4c">
+
+GPT-4V with SoM gives a reasonable suggestion on how to achieve a goal in a gaming
+scenario.
+### :mosque: Simulated Navigation
+<img width="729" alt="Screenshot 2023-10-18 at 10 21 24" src="https://github.com/microsoft/SoM/assets/34880758/7f139250-5350-4790-a35c-444ec2ec883b">
+
+### :deciduous_tree: Results
+We conduct experiments on various vision tasks to verify the effectiveness of our SoM. Results show that GPT4V+SoM outperforms specialists on most vision tasks and is comparable to MaskDINO on COCO panoptic segmentation.
+![main_results](https://github.com/microsoft/SoM/assets/34880758/722ac979-6c7f-4740-9625-cac38060e0ad)
+
+## :black_nib: Citation
+
+If you find our work helpful for your research, please consider citing the following BibTeX entry.   
+```bibtex
+@article{yang2023setofmark,
+      title={Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V}, 
+      author={Jianwei Yang and Hao Zhang and Feng Li and Xueyan Zou and Chunyuan Li and Jianfeng Gao},
+      journal={arXiv preprint arXiv:2310.11441},
+      year={2023},
+}
+```
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3c89efc852e22f71eabf5dfbc6ac62493425eb6
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
+
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
+
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+
+## Preferred Languages
+
+We prefer all communications to be in English.
+
+## Policy
+
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+
+<!-- END MICROSOFT SECURITY.MD BLOCK -->
diff --git a/SUPPORT.md b/SUPPORT.md
new file mode 100644
index 0000000000000000000000000000000000000000..eaf439aecca04e3aa5a022e0bc0b8b088efef7f1
--- /dev/null
+++ b/SUPPORT.md
@@ -0,0 +1,25 @@
+# TODO: The maintainer of this repo has not yet edited this file
+
+**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
+
+- **No CSS support:** Fill out this template with information about how to file issues and get help.
+- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
+- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
+
+*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
+
+# Support
+
+## How to file issues and get help  
+
+This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
+issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
+feature request as a new Issue.
+
+For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
+FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
+CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
+
+## Microsoft Support Policy  
+
+Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
diff --git a/assets/method2_xyz.png b/assets/method2_xyz.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e59befe1ac1b151f8b521a7091365c51fb05e88
--- /dev/null
+++ b/assets/method2_xyz.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8130e404c78da0dbe1cfd33ab1a50d59b3ec40d72cfd12718ec6568fbc3a757
+size 2181784
diff --git a/assets/som_bench_bottom.jpg b/assets/som_bench_bottom.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eeb112f1e21184c34c8beff4c8daea20b4f925b1
Binary files /dev/null and b/assets/som_bench_bottom.jpg differ
diff --git a/assets/som_bench_upper.jpg b/assets/som_bench_upper.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..89c52890bae0e0b5399af02cab44958d820693e5
Binary files /dev/null and b/assets/som_bench_upper.jpg differ
diff --git a/assets/som_gpt4v_demo.mp4 b/assets/som_gpt4v_demo.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..d859307ce085a36313336b924d6ee93b9ea8aa56
--- /dev/null
+++ b/assets/som_gpt4v_demo.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6a8f8b077dcbe8f7b693b51045a0ded80a1681565c793c2dba3c90d3836b5c4
+size 50609514
diff --git a/assets/som_logo.png b/assets/som_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7f62820ebef45af6d6b2716c8f217c233e103f0
Binary files /dev/null and b/assets/som_logo.png differ
diff --git a/assets/som_toolbox_interface.jpg b/assets/som_toolbox_interface.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..85cf3e7dfe75a646f5bff655d28d425b9c61521f
Binary files /dev/null and b/assets/som_toolbox_interface.jpg differ
diff --git a/assets/teaser.png b/assets/teaser.png
new file mode 100644
index 0000000000000000000000000000000000000000..603cc9f531752a626f91979738db697b70b64a40
--- /dev/null
+++ b/assets/teaser.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1deafef2f8fafb699896857e43cbb169d73bb6f564253233664d083ebb4cfa0c
+size 6328622
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..018214c750ea1ea37fb4a98b3e2fa4413df2e9f5
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,96 @@
+# SoM-Bench: Evaluating Visual Grounding with Visual Prompting
+
+We build a new benchmark called SoM-Bench to evaluate the visual grounding capability of LLMs with visual prompting.
+
+## Dataset
+
+| Vision Task |  Source |  #Images | #Instances | Marks | Metric | Data |
+| -------- | -------- | -------- | -------- | -------- | -------- | -------- |
+| Open-Vocab Segmentation | [COCO](https://cocodataset.org/#home) | 100 | 567 | Numeric IDs and Masks | Precision | [Download](https://github.com/microsoft/SoM/releases/download/v1.0/coco_ovseg.zip)
+| Open-Vocab Segmentation | [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/) | 100 | 488 | Numeric IDs and Masks | Precision | [Download](https://github.com/microsoft/SoM/releases/download/v1.0/ade20k_ovseg.zip)
+| Phrase Grounding | [Flickr30K](https://shannon.cs.illinois.edu/DenotationGraph/) | 100 | 274 | Numeric IDs and Masks and Boxes | Recall @ 1 | [Download](https://github.com/microsoft/SoM/releases/download/v1.0/flickr30k_grounding.zip)
+| Referring Comprehension | [RefCOCO](https://github.com/lichengunc/refer) | 100 | 177 | Numeric IDs and Masks | ACC @ 0.5 | [Download](https://github.com/microsoft/SoM/releases/download/v1.0/refcocog_refseg.zip)
+| Referring Segmentation | [RefCOCO](https://github.com/lichengunc/refer) | 100 | 177 | Numeric IDs and Masks | mIoU | [Download](https://github.com/microsoft/SoM/releases/download/v1.0/refcocog_refseg.zip)
+
+## Dataset Structure
+
+### Open-Vocab Segmentation on COCO
+
+We provide COCO in the following structure:
+
+```
+coco_ovseg
+├── som_images
+    ├── 000000000285_0.jpg
+    ├── 000000000872_0.jpg
+    |── 000000000872_5.jpg
+    ├── ...
+    ├── 000000002153_5.jpg
+    └── 000000002261_0.jpg
+```
+
+For some of the samples, the regions are very dense, so we split the regions into multiple groups of size 5. For example, `000000000872_0.jpg` has 5 regions, and `000000000872_5.jpg` has the other 5 regions. Note that you can use the image_id to track the original image.
+
+We used the following language prompt for the task:
+```
+I have labeled a bright numeric ID at the center for each visual object in the image. Please enumerate their names. You must answer by selecting from the following names: [COCO Vocabulary]
+```
+
+### Open-Vocab Segmentation on ADE20K
+
+```
+ade20k_ovseg
+├── som_images
+    ├── ADE_val_00000001_0.jpg
+    ├── ADE_val_00000001_5.jpg
+    |── ADE_val_00000011_5.jpg
+    ├── ...
+    ├── ADE_val_00000039_5.jpg
+    └── ADE_val_00000040_0.jpg
+```
+Similar to COCO, the regions in ADE20K are also very dense, so we split the regions into multiple groups of size 5. For example, `ADE_val_00000001_0.jpg` has 5 regions, and `ADE_val_00000001_5.jpg` has the other 5 regions. Note that you can use the image_id to track the original image.
+
+We used the following language prompt for the task:
+```
+I have labeled a bright numeric ID at the center for each visual object in the image. Please enumerate their names. You must answer by selecting from the following names: [ADE20K Vocabulary]
+```
+
+### Phrase Grounding on Flickr30K
+
+```
+flickr30k_grounding
+├── som_images
+    ├── 14868339.jpg
+    ├── 14868339_wbox.jpg
+    |── 14868339.json
+    ├── ...
+    ├── 302740416.jpg
+    |── 319185571_wbox.jpg
+    └── 302740416.json
+```
+
+For Flickr30K, we provide the image with numeric IDs and masks, and also the image with additional bounding boxes. The JSON file contains the ground truth bounding boxes and the corresponding phrases. Note that the bounding boxes are in the format of [x1, y1, x2, y2].
+
+We used the following language prompt for the task:
+```
+I have labeled a bright numeric ID at the center for each visual object in the image. Given the image showing a man in glasses holding a piece of paper, find the corresponding regions for a man in glasses, a piece of paper.
+```
+
+### Referring Expression Comprehension and Segmentation on RefCOCOg
+
+```
+refcocog_refseg
+├── som_images
+    ├── 000000000795.jpg
+    |── 000000000795.json
+    ├── ...
+    |── 000000007852.jpg
+    └── 000000007852.json
+```
+
+For RefCOCOg, we provide the image with numeric IDs and masks, and also the json file containing the referring expressions and the corresponding referring ids. 
+
+We used the following language prompt for the task:
+```
+I have labeled a bright numeric ID at the center for each visual object in the image. Please tell me the IDs for: The laptop behind the beer bottle; Laptop turned on.
+```
diff --git a/client.py b/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb844351d224f4cc6e9aceaecb9108bbba16e52a
--- /dev/null
+++ b/client.py
@@ -0,0 +1,36 @@
+"""
+This module provides a command-line interface to interact with the SoM server.
+
+The server URL is printed during deployment via `python deploy.py run`.
+
+Usage:
+    python client.py "http://<server_ip>:6092"
+"""
+
+import fire
+from gradio_client import Client
+from loguru import logger
+
+def predict(server_url: str):
+    """
+    Makes a prediction using the Gradio client against the provided server URL.
+
+    Args:
+        server_url (str): The URL of the SoM Gradio server.
+    """
+    client = Client(server_url)
+    result = client.predict(
+        {
+            "background": "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
+        },           # filepath in 'parameter_1' Image component
+        2.5,         # float (numeric value between 1 and 3) in 'Granularity' Slider component
+        "Automatic", # Literal['Automatic', 'Interactive'] in 'Segmentation Mode' Radio component
+        0.5,         # float (numeric value between 0 and 1) in 'Mask Alpha' Slider component
+        "Number",    # Literal['Number', 'Alphabet'] in 'Mark Mode' Radio component
+        ["Mark"],    # List[Literal['Mask', 'Box', 'Mark']] in 'Annotation Mode' Checkboxgroup component
+        api_name="/inference"
+    )
+    logger.info(result)
+
+if __name__ == "__main__":
+    fire.Fire(predict)
diff --git a/configs/seem_focall_unicl_lang_v1.yaml b/configs/seem_focall_unicl_lang_v1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..23efe54cc1175136a1ccdf7f79dd135be8bb7747
--- /dev/null
+++ b/configs/seem_focall_unicl_lang_v1.yaml
@@ -0,0 +1,401 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xueyan Zou (xueyan@cs.wisc.edu)
+# --------------------------------------------------------
+
+# Define Test/Trainer/Saving
+PIPELINE: XDecoderPipeline
+TRAINER: xdecoder
+SAVE_DIR: '../../data/output/test'
+base_path: "./"
+
+# Resume Logistic
+RESUME: false
+WEIGHT: false
+RESUME_FROM: ''
+EVAL_AT_START: False
+
+# Logging and Debug
+WANDB: False
+LOG_EVERY: 100
+FIND_UNUSED_PARAMETERS: false
+
+# Speed up training
+FP16: false
+PORT: '36873'
+
+# misc
+LOADER:
+  JOINT: False
+  KEY_DATASET: 'coco'
+
+##################
+# Task settings
+##################
+VERBOSE: true
+MODEL:
+  NAME: seem_model_v1
+  HEAD: xdecoder_head
+  MASK_ON: false
+  KEYPOINT_ON: false
+  LOAD_PROPOSALS: false
+  DIM_PROJ: 512
+  TEXT:
+    ARCH: vlpencoder
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 77 # 77
+    WIDTH: 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: focal
+    PRETRAINED: ''
+    LOAD_PRETRAINED: false
+    FOCAL:
+      PRETRAIN_IMG_SIZE: 224
+      PATCH_SIZE: 4
+      EMBED_DIM: 192
+      DEPTHS: [2, 2, 18, 2]
+      FOCAL_LEVELS: [4, 4, 4, 4]
+      FOCAL_WINDOWS: [3, 3, 3, 3]
+      DROP_PATH_RATE: 0.3
+      MLP_RATIO: 4.0
+      DROP_RATE: 0.0
+      PATCH_NORM: True
+      USE_CONV_EMBED: True
+      SCALING_MODULATOR: True
+      USE_CHECKPOINT: False
+      USE_POSTLN: true
+      USE_POSTLN_IN_MODULATION: false
+      USE_LAYERSCALE: True
+      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+      OUT_INDICES: [0, 1, 2, 3]
+  ENCODER:
+    NAME: transformer_encoder_fpn
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 133
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 512
+    MASK_DIM: 512
+    NORM: "GN"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  DECODER:
+    NAME: seem_v1
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    MASK:
+      ENABLED: True
+    DETECTION: False
+    SPATIAL:
+      ENABLED: True
+      MAX_ITER: 1
+    GROUNDING:
+      ENABLED: True
+      MAX_LEN: 5
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    RETRIEVAL:
+      ENABLED: False
+    LVIS:
+      ENABLED: True
+      THRES: 0.7
+    OPENIMAGE:
+      ENABLED: False
+      NEGATIVE_SAMPLES: 5
+      GROUNDING:
+        ENABLED: False
+        MAX_LEN: 5
+    CAPTION:
+      ENABLED: False
+      PHRASE_PROB: 0.5
+      SIM_THRES: 0.95
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    GCLASS_WEIGHT: 0.4
+    GMASK_WEIGHT: 1.0
+    GDICE_WEIGHT: 1.0
+    SCLASS_WEIGHT: 0.4
+    SMASK_WEIGHT: 1.0
+    SDICE_WEIGHT: 1.0
+    OCLASS_WEIGHT: 0.4
+    OMASK_WEIGHT: 1.0
+    ODICE_WEIGHT: 1.0
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    BBOX_WEIGHT: 5.0
+    GIOU_WEIGHT: 2.0
+    CAPTION_WEIGHT: 2.0
+    COST_SPATIAL:
+      CLASS_WEIGHT: 5.0
+      MASK_WEIGHT: 2.0
+      DICE_WEIGHT: 2.0
+    HIDDEN_DIM: 512
+    NUM_OBJECT_QUERIES: 101
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    MAX_SPATIAL_LEN: [512, 512, 512, 512]
+    # ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+    TOP_GROUNDING_LAYERS: 10
+    TOP_CAPTION_LAYERS: 10
+    TOP_SPATIAL_LAYERS: 10
+    TOP_OPENIMAGE_LAYERS: 10
+    TEST:
+      SEMANTIC_ON: True
+      INSTANCE_ON: True
+      PANOPTIC_ON: True
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.8
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
+
+# Spatial sampler
+STROKE_SAMPLER:
+  MAX_CANDIDATE: 1
+  CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only
+  CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
+  DILATION: 3
+  CIRCLE:
+    NUM_STROKES: 5
+    STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
+    STROKE_PROB: [0.33, 0.33, 0.33]
+  SCRIBBLE:
+    NUM_STROKES: 5
+    STROKE_PRESET: ['rand_curve', 'rand_curve_small']
+    STROKE_PROB: [0.5, 0.5]
+  POINT:
+    NUM_POINTS: 20
+  POLYGON:
+    MAX_POINTS: 9
+  EVAL:
+    MODE: 'best' # best/random/best_random
+    NEGATIVE: False
+    MAX_ITER: 20
+    IOU_ITER: 1
+    GROUNDING: False
+
+# Multi-modal Architecture, order matters
+ATTENTION_ARCH:
+  VARIABLE:
+    queries: ['object', 'grounding', 'spatial']
+    tokens: ['grounding', 'spatial']
+    memories: ['spatial']
+  SELF_ATTENTION:
+    queries:
+      object: ['queries_object']
+      grounding: ['queries_grounding', 'tokens_grounding']
+      spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
+    tokens:
+      grounding: ['queries_grounding', 'tokens_grounding']
+      spatial: ['tokens_spatial']
+    memories:
+      spatial: ['memories_spatial']
+  CROSS_ATTENTION:
+    queries:
+      object: True
+      grounding: True
+      spatial: True
+    memories:
+      spatial: True
+    tokens:
+      grounding: False
+      spatial: False
+  MASKING: ['tokens_spatial', 'tokens_grounding']
+  DUPLICATION:
+    queries:
+      grounding: 'queries_object'
+      spatial: 'queries_object'
+  SPATIAL_MEMORIES: 32
+  QUERY_NUMBER: 3
+
+DATASETS:
+  TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",]
+  # TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",]
+  TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"]  # to evaluate instance and semantic performance as well
+  # TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
+  # TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
+  # TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
+  # TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
+  CLASS_CONCAT: false
+  SIZE_DIVISIBILITY: 32
+  PROPOSAL_FILES_TRAIN: []
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+
+TRAIN:
+  ASPECT_RATIO_GROUPING: true
+  BATCH_SIZE_TOTAL: 4
+  BATCH_SIZE_PER_GPU: 4
+  SHUFFLE: true
+
+TEST:
+  DETECTIONS_PER_IMAGE: 100
+  NAME: coco_eval
+  IOU_TYPE: ['bbox', 'segm']
+  USE_MULTISCALE: false
+  BATCH_SIZE_TOTAL: 8
+  MODEL_FILE: ''
+  AUG:
+    ENABLED: False
+
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: False
+  NUM_WORKERS: 8
+  LOAD_PROPOSALS: False
+  SAMPLER_TRAIN: "TrainingSampler"
+  ASPECT_RATIO_GROUPING: True
+
+COCO:
+  INPUT:
+    MIN_SIZE_TRAIN: 800
+    MAX_SIZE_TRAIN: 1333
+    MIN_SIZE_TRAIN_SAMPLING: 'choice'
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+    IMAGE_SIZE: 1024
+    MIN_SCALE: 0.1
+    MAX_SCALE: 2.0
+    DATASET_MAPPER_NAME: "coco_interactive"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    RANDOM_FLIP: "horizontal"
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+  DATASET:
+    DATASET: 'coco'
+
+# Validation dataset
+ADE20K:
+  INPUT:
+    MIN_SIZE_TRAIN: 640
+    MIN_SIZE_TRAIN_SAMPLING: "choice"
+    MIN_SIZE_TEST: 640
+    MAX_SIZE_TRAIN: 2560
+    MAX_SIZE_TEST: 2560
+    MASK_FORMAT: "polygon"
+    CROP:
+      ENABLED: True
+      TYPE: "absolute"
+      SIZE: (640, 640)
+      SINGLE_CATEGORY_MAX_AREA: 1.0
+    COLOR_AUG_SSD: True
+    SIZE_DIVISIBILITY: 640  # used in dataset mapper
+    DATASET_MAPPER_NAME: "mask_former_panoptic"
+    FORMAT: "RGB"
+  DATASET:
+    DATASET: 'ade'
+
+SBD:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 1
+
+VOC:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+DAVIS:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+VOS:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 1
+
+REF:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+    FORMAT: "RGB"
+    SPATIAL: False
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 4
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+# Detectron2 training config for optimizer and lr scheduler
+SOLVER:
+  BASE_LR: 0.0001
+  STEPS: [0.88889, 0.96296]
+  MAX_ITER: 1
+  GAMMA: 0.1
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 10
+  WARMUP_METHOD: "linear"
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
+  LR_MULTIPLIER:
+    backbone: 0.1
+    lang_encoder: 0.1
+  FIX_PARAM:
+    backbone: True
+    lang_encoder: True
+    pixel_decoder: True
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_EMBED: 0.0
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 5.0 # 0.01
+    NORM_TYPE: 2.0
+  MAX_NUM_EPOCHS: 50
\ No newline at end of file
diff --git a/configs/semantic_sam_only_sa-1b_swinL.yaml b/configs/semantic_sam_only_sa-1b_swinL.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..93abac652fb6ddc425338d0827459dfbe884aa68
--- /dev/null
+++ b/configs/semantic_sam_only_sa-1b_swinL.yaml
@@ -0,0 +1,524 @@
+# ------------------------------------------------------------------------
+# Semantic SAM
+# Copyright (c) MicroSoft, Inc. and its affiliates.
+# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li.
+# ------------------------------------------------------------------------
+
+##################
+# Task settings
+##################
+WEIGHT: ''
+PORT: 53711
+VERBOSE: true
+
+OUTPUT_DIR: '../../data/output/test'
+# misc
+LOADER:
+  JOINT: True
+  KEY_DATASET: 'coco'
+# model
+MODEL:
+  NAME: interactive_mask_dino
+  HEAD: general_head
+  MASK_ON: false
+  KEYPOINT_ON: false
+  LOAD_PROPOSALS: false
+  DIM_PROJ: 512
+  BACKBONE_DIM: 768
+  BACKGROUND: False
+  WEIGHTS: ''
+  TEXT:
+    ARCH: noencoder  # no language encoder for training only sa-1b data
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 18 # 77
+    WIDTH: 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: swin
+    PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
+    LOAD_PRETRAINED: true
+    SWIN:
+      PRETRAIN_IMG_SIZE: 384
+      PATCH_SIZE: 4
+      EMBED_DIM: 192
+      DEPTHS: [ 2, 2, 18, 2 ]
+      NUM_HEADS: [ 6, 12, 24, 48 ]
+      WINDOW_SIZE: 12
+      MLP_RATIO: 4.0
+      QKV_BIAS: true
+      QK_SCALE: ~
+      DROP_RATE: 0.0
+      ATTN_DROP_RATE: 0.0
+      DROP_PATH_RATE: 0.3
+      APE: false
+      PATCH_NORM: true
+      USE_CHECKPOINT: false
+      OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
+  ENCODER:
+    NAME: encoder_deform
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 1
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 256
+    MASK_DIM: 256
+    NORM: "GN"
+    IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+    TOTAL_NUM_FEATURE_LEVELS: 4
+    NUM_FEATURE_LEVELS: 3
+    FEATURE_ORDER: "low2high"
+  DECODER:
+    NAME: interactive_mask_dino
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    MASK: True
+    BOX: True
+    PART: True
+    GROUNDING:
+      ENABLED: False
+      MAX_LEN: 5
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    CAPTION:
+      ENABLED: False
+      PHRASE_PROB: 0.0
+      SIM_THRES: 0.95
+    CAPTIONING:
+      ENABLED: False
+      STEP: 50
+    RETRIEVAL:
+      ENABLED: False
+      DIM_IMG: 768
+      ENSEMBLE: True
+    OPENIMAGE:
+      ENABLED: False
+      NEGATIVE_SAMPLES: 5
+      GROUNDING:
+        ENABLED: False
+        MAX_LEN: 5
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    CLASS_WEIGHT: 4.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    BOX_WEIGHT: 5.0
+    GIOU_WEIGHT: 2.0
+    IOU_WEIGHT: 1.0
+    COST_CLASS_WEIGHT: 4.0
+    COST_DICE_WEIGHT: 5.0
+    COST_MASK_WEIGHT: 5.0
+    COST_BOX_WEIGHT: 5.0
+    COST_GIOU_WEIGHT: 2.0
+    HIDDEN_DIM: 256
+    NUM_OBJECT_QUERIES: 0
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    DEC_LAYERS: 9  # 9 decoder layers, add one for the loss on learnable query
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    TWO_STAGE: False
+    INITIALIZE_BOX_TYPE: 'no'
+    DN: seg
+    DN_NOISE_SCALE: 0.4
+    DN_NUM: 100
+    INITIAL_PRED: False
+    LEARN_TGT: False
+    TOTAL_NUM_FEATURE_LEVELS: 4
+    SEMANTIC_CE_LOSS: False
+    PANO_BOX_LOSS: False
+    COCO: False
+    O365: False
+    SAM: True
+    PASCAL: False
+    RE_POINT: True
+    NUM_INTERACTIVE_TOKENS: 6
+    MAX_NUM_INSTANCE: 60
+    TEST:
+      SEMANTIC_ON: True
+      INSTANCE_ON: True
+      PANOPTIC_ON: True
+      BOX_INTERACTIVE: False
+      CLASSIFICATION_ON: False
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.25
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
+      TEST_FOUCUS_ON_BOX: False
+      PANO_TRANSFORM_EVAL: True
+      PANO_TEMPERATURE: 0.06
+
+TEST:
+  EVAL_PERIOD: 500000
+  PRECISE_BN:
+    NUM_ITER: 1
+    ENABLED: False
+  AUG:
+    ENABLED: False
+
+SAM:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+    IMAGE_SIZE: 1024
+    MIN_SCALE: 0.99
+    MAX_SCALE: 1.01
+    DATASET_MAPPER_NAME: "sam"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    RANDOM_FLIP: "horizontal"
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+  DATASET:
+    DATASET: 'sam'
+  TEST:
+    DETECTIONS_PER_IMAGE: 100
+    NAME: coco_eval
+    IOU_TYPE: ['bbox', 'segm']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 8
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  TRAIN:
+    BATCH_SIZE_TOTAL: 1
+    BATCH_SIZE_PER_GPU: 1
+    SHUFFLE: true
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 4
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+COCO:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+    IMAGE_SIZE: 1024
+    MIN_SCALE: 0.1
+    MAX_SCALE: 2.0
+    DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    RANDOM_FLIP: "horizontal"
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+  DATASET:
+    DATASET: 'coco'
+  TEST:
+    DETECTIONS_PER_IMAGE: 100
+    NAME: coco_eval
+    IOU_TYPE: ['bbox', 'segm']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 1
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  TRAIN:
+    BATCH_SIZE_TOTAL: 1
+    BATCH_SIZE_PER_GPU: 1
+    SHUFFLE: true
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 2
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+VLP:
+  INPUT:
+    IMAGE_SIZE: 224
+    DATASET_MAPPER_NAME: "vlpretrain"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+  TRAIN:
+    BATCH_SIZE_TOTAL: 2
+    BATCH_SIZE_PER_GPU: 2
+  TEST:
+    BATCH_SIZE_TOTAL: 256
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 16
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+
+DATASETS:
+  TRAIN: ["sam_train"]
+  # interactive segmentation evaluation.
+  TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"]
+#  TEST: ["sam_minival"]
+
+  CLASS_CONCAT: false
+  SIZE_DIVISIBILITY: 32
+  PROPOSAL_FILES_TRAIN: []
+
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: False
+  NUM_WORKERS: 16
+  LOAD_PROPOSALS: False
+  SAMPLER_TRAIN: "TrainingSampler"
+  ASPECT_RATIO_GROUPING: True
+
+# Detectron2 training config for optimizer and lr scheduler
+SOLVER:
+  BASE_LR_END: 0.0
+  MOMENTUM: 0.9
+  NESTEROV: False
+  CHECKPOINT_PERIOD: 5000
+  IMS_PER_BATCH: 1
+  REFERENCE_WORLD_SIZE: 0
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY_BIAS: None
+  # original
+  BASE_LR: 0.0001
+  STEPS: [327778, 355092]
+  MAX_ITER: 368750
+  GAMMA: 0.1
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 10
+  WARMUP_METHOD: "linear"
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
+  LR_MULTIPLIER:
+    backbone: 0.1
+    lang_encoder: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_EMBED: 0.0
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 0.01
+    NORM_TYPE: 2.0
+  AMP:
+    ENABLED: True
+
+# Evaluation Dataset
+ADE20K:
+  INPUT:
+    MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
+    MIN_SIZE_TRAIN_SAMPLING: "choice"
+    MIN_SIZE_TEST: 640
+    MAX_SIZE_TRAIN: 2560
+    MAX_SIZE_TEST: 2560
+    MASK_FORMAT: "polygon"
+    CROP:
+      ENABLED: True
+      TYPE: "absolute"
+      SIZE: [640, 640]
+      SINGLE_CATEGORY_MAX_AREA: 1.0
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: True
+    SIZE_DIVISIBILITY: 640  # used in dataset mapper
+    DATASET_MAPPER_NAME: "mask_former_panoptic"
+    FORMAT: "RGB"
+  DATASET:
+    DATASET: 'ade'
+  TRAIN:
+    ASPECT_RATIO_GROUPING: true
+    BATCH_SIZE_TOTAL: 16
+    BATCH_SIZE_PER_GPU: 2
+    SHUFFLE: true
+  TEST:
+    DETECTIONS_PER_IMAGE: 100
+    NAME: coco_eval
+    IOU_TYPE: ['bbox', 'segm']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 8
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 8
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+#ADE20K:
+#  INPUT:
+#    MIN_SIZE_TRAIN: 640
+#    MIN_SIZE_TRAIN_SAMPLING: "choice"
+#    MIN_SIZE_TEST: 640
+#    MAX_SIZE_TRAIN: 2560
+#    MAX_SIZE_TEST: 2560
+#    MASK_FORMAT: "polygon"
+#    CROP:
+#      ENABLED: True
+#      TYPE: "absolute"
+#      SIZE: (640, 640)
+#      SINGLE_CATEGORY_MAX_AREA: 1.0
+#    COLOR_AUG_SSD: True
+#    SIZE_DIVISIBILITY: 640  # used in dataset mapper
+#    DATASET_MAPPER_NAME: "mask_former_panoptic"
+#    FORMAT: "RGB"
+#  DATASET:
+#    DATASET: 'ade'
+#  TEST:
+#    BATCH_SIZE_TOTAL: 8
+
+
+REF:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+    FORMAT: "RGB"
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+SUN:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+SCAN:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+BDD:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+CITY:
+  INPUT:
+    MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
+    MIN_SIZE_TRAIN_SAMPLING: "choice"
+    MIN_SIZE_TEST: 1024
+    MAX_SIZE_TRAIN: 4096
+    MAX_SIZE_TEST: 2048
+    CROP:
+      ENABLED: True
+      TYPE: "absolute"
+      SIZE: [ 512, 1024 ]
+      SINGLE_CATEGORY_MAX_AREA: 1.0
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: True
+    SIZE_DIVISIBILITY: -1
+    FORMAT: "RGB"
+    DATASET_MAPPER_NAME: "mask_former_panoptic"
+    MASK_FORMAT: "polygon"
+    TEST:
+      EVAL_PERIOD: 5000
+      BATCH_SIZE_TOTAL: 1
+      AUG:
+        ENABLED: False
+        MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
+        MAX_SIZE: 4096
+        FLIP: True
+    DATALOADER:
+      FILTER_EMPTY_ANNOTATIONS: True
+      NUM_WORKERS: 2
+      LOAD_PROPOSALS: False
+      SAMPLER_TRAIN: "TrainingSampler"
+      ASPECT_RATIO_GROUPING: True
+    TRAIN:
+      ASPECT_RATIO_GROUPING: true
+      BATCH_SIZE_TOTAL: 2
+      BATCH_SIZE_PER_GPU: 2
+      SHUFFLE: true
+
+PSACAL_PART:
+  INPUT:
+      MIN_SIZE_TEST: 800
+      MAX_SIZE_TEST: 1333
+      IMAGE_SIZE: 1024
+      MIN_SCALE: 0.1
+      MAX_SCALE: 2.0
+      DATASET_MAPPER_NAME: "pascal_part_lsj"
+      IGNORE_VALUE: 255
+      COLOR_AUG_SSD: False
+      SIZE_DIVISIBILITY: 32
+      RANDOM_FLIP: "horizontal"
+      MASK_FORMAT: "polygon"
+      FORMAT: "RGB"
+      CROP:
+        ENABLED: True
+  MODEL:
+    MASK_ON: True
+    KEYPOINT_ON: False
+    LOAD_PROPOSALS: False
+  # DATASET:
+  #   DATASET: 'coco'
+  TEST:
+    DETECTIONS_PER_IMAGE: 100
+    NAME: coco_eval
+    IOU_TYPE: ['bbox', 'segm']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 8
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  TRAIN:
+    BATCH_SIZE_TOTAL: 1
+    BATCH_SIZE_PER_GPU: 1
+    SHUFFLE: true
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 2
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
diff --git a/demo_gpt4v_som.py b/demo_gpt4v_som.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd78feb056ed8420e0e14d64df5b045894254e87
--- /dev/null
+++ b/demo_gpt4v_som.py
@@ -0,0 +1,226 @@
+# --------------------------------------------------------
+# Set-of-Mark (SoM) Prompting for Visual Grounding in GPT-4V
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by:
+#   Jianwei Yang (jianwyan@microsoft.com)
+#   Xueyan Zou (xueyan@cs.wisc.edu)
+#   Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+import io
+import gradio as gr
+import torch
+import argparse
+from PIL import Image
+# seem
+from seem.modeling.BaseModel import BaseModel as BaseModel_Seem
+from seem.utils.distributed import init_distributed as init_distributed_seem
+from seem.modeling import build_model as build_model_seem
+from task_adapter.seem.tasks import interactive_seem_m2m_auto, inference_seem_pano, inference_seem_interactive
+
+# semantic sam
+from semantic_sam.BaseModel import BaseModel
+from semantic_sam import build_model
+from semantic_sam.utils.dist import init_distributed_mode
+from semantic_sam.utils.arguments import load_opt_from_config_file
+from semantic_sam.utils.constants import COCO_PANOPTIC_CLASSES
+from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_switch
+
+# sam
+from segment_anything import sam_model_registry
+from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto
+from task_adapter.sam.tasks.inference_sam_m2m_interactive import inference_sam_m2m_interactive
+
+
+from task_adapter.utils.visualizer import Visualizer
+from detectron2.data import MetadataCatalog
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+from scipy.ndimage import label
+import numpy as np
+
+from gpt4v import request_gpt4v
+from openai import OpenAI
+from pydub import AudioSegment
+from pydub.playback import play
+
+import matplotlib.colors as mcolors
+css4_colors = mcolors.CSS4_COLORS
+color_proposals = [list(mcolors.hex2color(color)) for color in css4_colors.values()]
+
+client = OpenAI()
+
+'''
+build args
+'''
+semsam_cfg = "configs/semantic_sam_only_sa-1b_swinL.yaml"
+seem_cfg = "configs/seem_focall_unicl_lang_v1.yaml"
+
+semsam_ckpt = "./swinl_only_sam_many2many.pth"
+sam_ckpt = "./sam_vit_h_4b8939.pth"
+seem_ckpt = "./seem_focall_v1.pt"
+
+opt_semsam = load_opt_from_config_file(semsam_cfg)
+opt_seem = load_opt_from_config_file(seem_cfg)
+opt_seem = init_distributed_seem(opt_seem)
+
+
+'''
+build model
+'''
+model_semsam = BaseModel(opt_semsam, build_model(opt_semsam)).from_pretrained(semsam_ckpt).eval().cuda()
+model_sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt).eval().cuda()
+model_seem = BaseModel_Seem(opt_seem, build_model_seem(opt_seem)).from_pretrained(seem_ckpt).eval().cuda()
+
+with torch.no_grad():
+    with torch.autocast(device_type='cuda', dtype=torch.float16):
+        model_seem.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
+
+history_images = []  # annotated image of the latest `inference` run; read by gpt4v_response/highlight
+history_masks = []  # masks returned by the latest `inference` run; indexed by `highlight`
+history_texts = []  # latest GPT-4V reply; parsed by `highlight`
+@torch.no_grad()
+def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs):
+    """Segment the uploaded image and draw Set-of-Mark annotations on it.
+
+    Args:
+        image: Gradio image-editor value; `image['background']` is the picture and
+            `image['layers'][0]`, when present, is used as the brush-stroke mask.
+        slider: Granularity; < 1.5 selects SEEM, > 2.5 selects SAM, and values in between
+            select Semantic-SAM in 'Automatic' mode and SAM otherwise.
+        mode: 'Automatic', or 'Interactive' to pass one mask per brush stroke to the model.
+        alpha, anno_mode: Forwarded unchanged to the task adapter.
+        label_mode: 'Alphabet' maps to 'a'; anything else maps to '1'.
+
+    Returns:
+        `(output, [])`: the annotated image plus an empty section list.
+
+    Raises:
+        gr.Error: In 'Interactive' mode when nothing has been drawn on the image.
+    """
+    # Each run replaces the state that `gpt4v_response` and `highlight` read later.
+    global history_images; history_images = []
+    global history_masks; history_masks = []
+
+    _image = image['background'].convert('RGB')
+    _mask = image['layers'][0].convert('L') if image['layers'] else None
+
+    if slider < 1.5:
+        model_name = 'seem'
+    elif slider > 2.5 or mode != 'Automatic':
+        model_name = 'sam'
+    else:
+        model_name = 'semantic-sam'
+        # [1.5, 2.5] is cut into 0.14-wide bands for levels 1-6; the remainder requests all six.
+        bands = enumerate((0.14, 0.28, 0.42, 0.56, 0.70, 0.84), start=1)
+        level = next(([lvl] for lvl, off in bands if slider < 1.5 + off), [6, 1, 2, 3, 4, 5])
+
+    label_mode = 'a' if label_mode == 'Alphabet' else '1'
+
+    text_size, hole_scale, island_scale = 640, 100, 100
+    text, text_part, text_thresh = '', '', '0.0'
+    with torch.autocast(device_type='cuda', dtype=torch.float16):
+        semantic = False
+
+        if mode == "Interactive":
+            # scipy's `label` numbers each connected stroke; build one boolean mask per stroke.
+            labeled_array, num_features = label(np.asarray(_mask)) if _mask is not None else (None, 0)
+            if num_features == 0:
+                raise gr.Error("Interactive mode needs at least one brush stroke on the image.")
+            spatial_masks = torch.stack([torch.from_numpy(labeled_array == i+1) for i in range(num_features)])
+
+        if model_name == 'semantic-sam':
+            output, mask = inference_semsam_m2m_auto(model_semsam, _image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
+
+        elif model_name == 'sam':
+            if mode == "Automatic":
+                output, mask = inference_sam_m2m_auto(model_sam, _image, text_size, label_mode, alpha, anno_mode)
+            elif mode == "Interactive":
+                output, mask = inference_sam_m2m_interactive(model_sam, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)
+
+        elif model_name == 'seem':
+            if mode == "Automatic":
+                output, mask = inference_seem_pano(model_seem, _image, text_size, label_mode, alpha, anno_mode)
+            elif mode == "Interactive":
+                output, mask = inference_seem_interactive(model_seem, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)
+
+        # Remember the result so the chat and highlight handlers can reuse it.
+        history_masks.append(mask)
+        history_images.append(Image.fromarray(output))
+        return (output, [])
+
+
+def gpt4v_response(message, history):
+    """Send `message` with the latest marked image to GPT-4V and cache the reply for `highlight`."""
+    global history_texts; history_texts = []  # only the most recent reply is kept
+    try:
+        res = request_gpt4v(message, history_images[0])
+        history_texts.append(res)
+        return res
+    except Exception as e:  # UI boundary: report the failure in the chat instead of a blank reply
+        return f"GPT-4V request failed ({type(e).__name__}): {e}"
+
+def highlight(mode, alpha, label_mode, anno_mode, *args, **kwargs):
+    """Highlight the masks whose numeric marks GPT-4V cited in brackets (e.g. "[3]") in its last reply.
+
+    Arguments are accepted only to match the Gradio click wiring and are ignored; the marked image,
+    masks and reply are read from the module-level `history_*` lists. Returns `(image, sections)`
+    where each section is a `(mask, mark)` pair; raises `gr.Error` when nothing can be highlighted yet.
+    """
+    import re  # function-scope import so the block does not depend on the file's import header
+    if not (history_images and history_masks and history_texts):
+        raise gr.Error("Click 'Run' and ask GPT-4V a question before highlighting.")
+    masks = history_masks[0]
+    bracketed = ' '.join(re.findall(r'\[([^\]]*)\]', history_texts[0]))
+    # dict.fromkeys drops repeated marks but keeps citation order; the range check skips marks that
+    # have no mask, so "[0]" cannot wrap around to the last mask and large marks cannot raise.
+    ids = [r for r in dict.fromkeys(re.findall(r'\d+', bracketed)) if 1 <= int(r) <= len(masks)]
+    return (history_images[0], [(masks[int(r) - 1]['segmentation'], r) for r in ids])
+
+'''
+launch app
+'''
+
+demo = gr.Blocks()
+image = gr.ImageMask(label="Input", type="pil", sources=["upload"], interactive=True, brush=gr.Brush(colors=["#FFFFFF"]))
+slider = gr.Slider(1, 3, value=1.8, label="Granularity") # info="Choose in [1, 1.5), [1.5, 2.5), [2.5, 3] for [seem, semantic-sam (multi-level), sam]"
+mode = gr.Radio(['Automatic', 'Interactive', ], value='Automatic', label="Segmentation Mode")
+anno_mode = gr.CheckboxGroup(choices=["Mark", "Mask", "Box"], value=['Mark'], label="Annotation Mode")
+image_out = gr.AnnotatedImage(label="SoM Visual Prompt", height=512)
+runBtn = gr.Button("Run")
+highlightBtn = gr.Button("Highlight")
+bot = gr.Chatbot(label="GPT-4V + SoM", height=256)
+slider_alpha = gr.Slider(0, 1, value=0.05, label="Mask Alpha") #info="Choose in [0, 1]"
+label_mode = gr.Radio(['Number', 'Alphabet'], value='Number', label="Mark Mode")
+
+title = "Set-of-Mark (SoM) Visual Prompting for Extraordinary Visual Grounding in GPT-4V"
+description = "This is a demo for SoM Prompting to unleash extraordinary visual grounding in GPT-4V. Please upload an image and then click the 'Run' button to get the image with marks. Then chat with GPT-4V below!"
+
+with demo:
+    gr.Markdown("<h1 style='text-align: center'><img src='https://som-gpt4v.github.io/website/img/som_logo.png' style='height:50px;display:inline-block'/>  Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V</h1>")
+    # gr.Markdown("<h2 style='text-align: center; margin-bottom: 1rem'>Project: <a href='https://som-gpt4v.github.io/'>link</a>     arXiv: <a href='https://arxiv.org/abs/2310.11441'>link</a>     Code: <a href='https://github.com/microsoft/SoM'>link</a></h2>")
+    with gr.Row():
+        with gr.Column():
+            image.render()
+            slider.render()
+            with gr.Accordion("Detailed prompt settings (e.g., mark type)", open=False):
+                with gr.Row():
+                    mode.render()
+                    anno_mode.render()
+                with gr.Row():
+                    slider_alpha.render()
+                    label_mode.render()
+        with gr.Column():
+            image_out.render()
+            runBtn.render()
+            highlightBtn.render()
+    with gr.Row():    
+        gr.ChatInterface(chatbot=bot, fn=gpt4v_response)
+
+    runBtn.click(inference, inputs=[image, slider, mode, slider_alpha, label_mode, anno_mode],
+              outputs = image_out)
+    highlightBtn.click(highlight, inputs=[mode, slider_alpha, label_mode, anno_mode],  # matches highlight's parameters
+              outputs = image_out)
+
+demo.queue().launch(share=True,server_port=6092)
+
diff --git a/demo_som.py b/demo_som.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5fb699cbd893e881191bf798b75ea97ae4892f7
--- /dev/null
+++ b/demo_som.py
@@ -0,0 +1,181 @@
+# --------------------------------------------------------
+# Set-of-Mark (SoM) Prompting for Visual Grounding in GPT-4V
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by:
+#   Jianwei Yang (jianwyan@microsoft.com)
+#   Xueyan Zou (xueyan@cs.wisc.edu)
+#   Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import gradio as gr
+import torch
+import argparse
+
+# seem
+from seem.modeling.BaseModel import BaseModel as BaseModel_Seem
+from seem.utils.distributed import init_distributed as init_distributed_seem
+from seem.modeling import build_model as build_model_seem
+from task_adapter.seem.tasks import interactive_seem_m2m_auto, inference_seem_pano, inference_seem_interactive
+
+# semantic sam
+from semantic_sam.BaseModel import BaseModel
+from semantic_sam import build_model
+from semantic_sam.utils.dist import init_distributed_mode
+from semantic_sam.utils.arguments import load_opt_from_config_file
+from semantic_sam.utils.constants import COCO_PANOPTIC_CLASSES
+from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_switch
+
+# sam
+from segment_anything import sam_model_registry
+from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto
+from task_adapter.sam.tasks.inference_sam_m2m_interactive import inference_sam_m2m_interactive
+
+from scipy.ndimage import label
+import numpy as np
+
+'''
+build args
+'''
+# Config files for the two config-driven models (Semantic-SAM and SEEM).
+semsam_cfg = "configs/semantic_sam_only_sa-1b_swinL.yaml"
+seem_cfg = "configs/seem_focall_unicl_lang_v1.yaml"
+
+# Checkpoints are expected in the current working directory.
+semsam_ckpt = "./swinl_only_sam_many2many.pth"
+sam_ckpt = "./sam_vit_h_4b8939.pth"
+seem_ckpt = "./seem_focall_v1.pt"
+
+# Both configs go through the Semantic-SAM config loader; only the SEEM options
+# are additionally passed through SEEM's distributed initialisation.
+opt_semsam = load_opt_from_config_file(semsam_cfg)
+opt_seem = load_opt_from_config_file(seem_cfg)
+opt_seem = init_distributed_seem(opt_seem)
+
+
+'''
+build model
+'''
+# All three models are loaded once at import time, switched to eval mode and
+# moved to the GPU, so a CUDA device is required to import this module.
+model_semsam = BaseModel(opt_semsam, build_model(opt_semsam)).from_pretrained(semsam_ckpt).eval().cuda()
+model_sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt).eval().cuda()
+model_seem = BaseModel_Seem(opt_seem, build_model_seem(opt_seem)).from_pretrained(seem_ckpt).eval().cuda()
+
+# Compute text embeddings for the COCO panoptic class names plus "background"
+# under fp16 autocast. NOTE(review): presumably the language encoder caches
+# these for later SEEM inference - confirm in the SEEM code.
+with torch.no_grad():
+    with torch.autocast(device_type='cuda', dtype=torch.float16):
+        model_seem.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
+
+@torch.no_grad()
+def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs):
+    _image = image['background'].convert('RGB')
+    _mask = image['layers'][0].convert('L') if image['layers'] else None
+
+    if slider < 1.5:
+        model_name = 'seem'
+    elif slider > 2.5:
+        model_name = 'sam'
+    else:
+        if mode == 'Automatic':
+            model_name = 'semantic-sam'
+            if slider < 1.5 + 0.14:
+                level = [1]
+            elif slider < 1.5 + 0.28:
+                level = [2]
+            elif slider < 1.5 + 0.42:
+                level = [3]
+            elif slider < 1.5 + 0.56:
+                level = [4]
+            elif slider < 1.5 + 0.70:
+                level = [5]
+            elif slider < 1.5 + 0.84:
+                level = [6]
+            else:
+                level = [6, 1, 2, 3, 4, 5]
+        else:
+            model_name = 'sam'
+
+
+    if label_mode == 'Alphabet':
+        label_mode = 'a'
+    else:
+        label_mode = '1'
+
+    text_size, hole_scale, island_scale=640,100,100
+    text, text_part, text_thresh = '','','0.0'
+    with torch.autocast(device_type='cuda', dtype=torch.float16):
+        semantic=False
+
+        if mode == "Interactive":
+            labeled_array, num_features = label(np.asarray(_mask))
+            spatial_masks = torch.stack([torch.from_numpy(labeled_array == i+1) for i in range(num_features)])
+
+        if model_name == 'semantic-sam':
+            model = model_semsam
+            output, mask = inference_semsam_m2m_auto(model, _image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
+
+        elif model_name == 'sam':
+            model = model_sam
+            if mode == "Automatic":
+                output, mask = inference_sam_m2m_auto(model, _image, text_size, label_mode, alpha, anno_mode)
+            elif mode == "Interactive":
+                output, mask = inference_sam_m2m_interactive(model, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)
+
+        elif model_name == 'seem':
+            model = model_seem
+            if mode == "Automatic":
+                output, mask = inference_seem_pano(model, _image, text_size, label_mode, alpha, anno_mode)
+            elif mode == "Interactive":
+                output, mask = inference_seem_interactive(model, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)
+
+        return output
+
+'''
+launch app
+'''
+
+demo = gr.Blocks()
+image = gr.ImageMask(label="Input", type="pil", sources=["upload"], interactive=True, brush=gr.Brush(colors=["#FFFFFF"]))
+slider = gr.Slider(1, 3, value=2, label="Granularity", info="Choose in [1, 1.5), [1.5, 2.5), [2.5, 3] for [seem, semantic-sam (multi-level), sam]")
+mode = gr.Radio(['Automatic', 'Interactive', ], value='Automatic', label="Segmentation Mode")
+image_out = gr.Image(label="Auto generation",type="pil")
+runBtn = gr.Button("Run")
+slider_alpha = gr.Slider(0, 1, value=0.1, label="Mask Alpha", info="Choose in [0, 1]")
+label_mode = gr.Radio(['Number', 'Alphabet'], value='Number', label="Mark Mode")
+anno_mode = gr.CheckboxGroup(choices=["Mask", "Box", "Mark"], value=['Mask', 'Mark'], label="Annotation Mode")
+
+title = "Set-of-Mark (SoM) Prompting for Visual Grounding in GPT-4V"
+description = "This is a demo for SoM Prompting to unleash extraordinary visual grounding in GPT-4V. Please upload an image and them click the 'Run' button to get the image with marks. Then try it on <a href='https://chat.openai.com/'>GPT-4V<a>!"
+
+# Page layout: heading and links, then inputs/settings in the first column and
+# the output image plus Run button in the second, followed by example images.
+with demo:
+    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
+    gr.Markdown("<h3 style='text-align: center; margin-bottom: 1rem'>project: <a href='https://som-gpt4v.github.io/'>link</a>, arXiv: <a href='https://arxiv.org/abs/2310.11441'>link</a>, code: <a href='https://github.com/microsoft/SoM'>link</a></h3>")
+    gr.Markdown(f"<h3 style='margin-bottom: 1rem'>{description}</h3>")
+    with gr.Row():
+        with gr.Column():
+            image.render()
+            slider.render()
+            with gr.Row():
+                mode.render()
+                anno_mode.render()
+            with gr.Row():
+                slider_alpha.render()
+                label_mode.render()
+        with gr.Column():
+            image_out.render()
+            runBtn.render()
+    with gr.Row():
+        # Two example sets, both feeding the input image: a raw photo and an
+        # already-marked picture. `example` is rebound and not used afterwards.
+        example = gr.Examples(
+            examples=[
+                ["examples/ironing_man.jpg"],
+            ],
+            inputs=image,
+            cache_examples=False,
+        )
+        example = gr.Examples(
+            examples=[
+                ["examples/ironing_man_som.png"],
+            ],
+            inputs=image,
+            cache_examples=False,
+            label='Marked Examples',
+        )
+
+    # Run button: call `inference` with the current control values and show the result.
+    runBtn.click(inference, inputs=[image, slider, mode, slider_alpha, label_mode, anno_mode],
+              outputs = image_out)
+
+# Enable the request queue and start the server on port 6092 with sharing turned on.
+demo.queue().launch(share=True,server_port=6092)
diff --git a/deploy.py b/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..79fbf713e8aed7c9bdc071850ee8178bf845200c
--- /dev/null
+++ b/deploy.py
@@ -0,0 +1,720 @@
+"""Deploy SoM to AWS EC2 via GitHub Actions.
+
+Usage:
+
+    1. Create and populate the .env file:
+
+        cat > .env <<EOF
+AWS_ACCESS_KEY_ID=<your aws access key id (required)>
+AWS_SECRET_ACCESS_KEY=<your aws secret access key (required)>
+AWS_REGION=<your aws region (required)>
+GITHUB_OWNER=<your github owner (required)>  # e.g. microsoft
+GITHUB_REPO=<your github repo (required)>    # e.g. SoM
+GITHUB_TOKEN=<your github token (required)>
+PROJECT_NAME=<your project name (required)>  # for tagging AWS resources
+OPENAI_API_KEY=<your openai api key (optional)>
+EOF
+
+    2. Create a virtual environment for deployment:
+
+        python3.10 -m venv venv
+        source venv/bin/activate
+        pip install -r deploy_requirements.txt
+
+    3. Run the deployment script:
+
+        python deploy.py start
+
+    4. Wait for the build to succeed in GitHub Actions (see console output for URL)
+
+    5. Open the gradio interface (see console output for URL) and test it out.
+       Note that it may take a minute for the interface to become available.
+       You can also interact with the server programmatically:
+
+        python client.py "http://<server_ip>:6092"
+
+    6. Terminate the EC2 instance and stop incurring charges:
+
+        python deploy.py stop
+
+       Or, to shut it down without removing it:
+
+        python deploy.py pause
+
+       (This can later be re-started with the `start` command.)
+
+    7. (optional) List all tagged instances with their respective statuses:
+
+        python deploy.py status
+
+Troubleshooting Token Scope Error:
+
+    If you encounter an error similar to the following when pushing changes to
+    GitHub Actions workflow files:
+
+        ! [remote rejected] feat/docker -> feat/docker (refusing to allow a
+        Personal Access Token to create or update workflow
+        `.github/workflows/docker-build-ec2.yml` without `workflow` scope)
+
+    This indicates that the Personal Access Token (PAT) being used does not
+    have the necessary permissions ('workflow' scope) to create or update GitHub
+    Actions workflows. To resolve this issue, you will need to create or update
+    your PAT with the appropriate scope.
+
+    Creating or Updating a Classic PAT with 'workflow' Scope:
+
+    1. Go to GitHub and sign in to your account.
+    2. Click on your profile picture in the top right corner, and then click 'Settings'.
+    3. In the sidebar, click 'Developer settings'.
+    4. Click 'Personal access tokens', then 'Classic tokens'.
+    5. To update an existing token:
+       a. Find the token you wish to update in the list and click on it.
+       b. Scroll down to the 'Select scopes' section.
+       c. Make sure the 'workflow' scope is checked. This scope allows for
+          managing GitHub Actions workflows.
+       d. Click 'Update token' at the bottom of the page.
+    6. To create a new token:
+       a. Click 'Generate new token'.
+       b. Give your token a descriptive name under 'Note'.
+       c. Scroll down to the 'Select scopes' section.
+       d. Check the 'workflow' scope to allow managing GitHub Actions workflows.
+       e. Optionally, select any other scopes needed for your project.
+       f. Click 'Generate token' at the bottom of the page.
+    7. Copy the generated token. Make sure to save it securely, as you will not
+       be able to see it again.
+
+    After creating or updating your PAT with the 'workflow' scope, update the
+    Git remote configuration to use the new token, and try pushing your changes
+    again.
+
+    Note: Always keep your tokens secure and never share them publicly.
+
+"""
+
+import base64
+import json
+import os
+import subprocess
+import time
+
+from botocore.exceptions import ClientError
+from jinja2 import Environment, FileSystemLoader
+from loguru import logger
+from nacl import encoding, public
+from pydantic_settings import BaseSettings
+import boto3
+import fire
+import git
+import paramiko
+import requests
+
+class Config(BaseSettings):
+    """Deployment settings, with derived AWS/GitHub resource names.
+
+    Fields without a default are required. The nested ``Config`` class points
+    the settings loader at the ``.env`` file in the working directory.
+    """
+
+    # Credentials and identifiers (see the module docstring for the .env layout).
+    AWS_ACCESS_KEY_ID: str
+    AWS_SECRET_ACCESS_KEY: str
+    AWS_REGION: str
+    GITHUB_OWNER: str
+    GITHUB_REPO: str
+    GITHUB_TOKEN: str
+    OPENAI_API_KEY: str | None = None  # optional
+    PROJECT_NAME: str  # used to tag and name the AWS resources below
+
+    # EC2 instance defaults.
+    AWS_EC2_AMI: str = "ami-0f9c346cdcac09fb5"  # Deep Learning AMI GPU PyTorch 2.0.1 (Ubuntu 20.04) 20230827
+    AWS_EC2_DISK_SIZE: int = 100  # GB
+    #AWS_EC2_INSTANCE_TYPE: str = "p3.2xlarge"  # (V100 16GB $3.06/hr x86_64)
+    AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge"  # (T4 16GB $0.526/hr x86_64)
+    AWS_EC2_USER: str = "ubuntu"
+
+    # Settings-loader options: read values from a UTF-8 ".env" file.
+    class Config:
+        env_file = ".env"
+        env_file_encoding = 'utf-8'
+
+    @property
+    def AWS_EC2_KEY_NAME(self) -> str:
+        """Name of the EC2 key pair, derived from the project name."""
+        return f"{self.PROJECT_NAME}-key"
+
+    @property
+    def AWS_EC2_KEY_PATH(self) -> str:
+        """Relative path of the local .pem file holding the private key."""
+        return f"./{self.AWS_EC2_KEY_NAME}.pem"
+
+    @property
+    def AWS_EC2_SECURITY_GROUP(self) -> str:
+        """Name of the EC2 security group, derived from the project name."""
+        return f"{self.PROJECT_NAME}-SecurityGroup"
+
+    @property
+    def AWS_SSM_ROLE_NAME(self) -> str:
+        """Name of the SSM role, derived from the project name."""
+        return f"{self.PROJECT_NAME}-SSMRole"
+
+    @property
+    def AWS_SSM_PROFILE_NAME(self) -> str:
+        """Name of the SSM instance profile, derived from the project name."""
+        return f"{self.PROJECT_NAME}-SSMInstanceProfile"
+
+    @property
+    def GITHUB_PATH(self) -> str:
+        """Repository path in "owner/repo" form."""
+        return f"{self.GITHUB_OWNER}/{self.GITHUB_REPO}"
+
+# Module-level settings instance, created at import time; several function
+# signatures below read their default values from it.
+config = Config()
+
+def encrypt(public_key: str, secret_value: str) -> str:
+    """
+    Encrypts a Unicode string using the provided public key.
+
+    Args:
+        public_key (str): The public key for encryption, encoded in Base64.
+        secret_value (str): The Unicode string to be encrypted.
+
+    Returns:
+        str: The encrypted value, encoded in Base64.
+    """
+    public_key = public.PublicKey(public_key.encode("utf-8"), encoding.Base64Encoder())
+    sealed_box = public.SealedBox(public_key)
+    encrypted = sealed_box.encrypt(secret_value.encode("utf-8"))
+    return base64.b64encode(encrypted).decode("utf-8")
+
+def set_github_secret(token: str, repo: str, secret_name: str, secret_value: str) -> None:
+    """
+    Sets a secret in the specified GitHub repository.
+
+    Args:
+        token (str): GitHub token with permissions to set secrets.
+        repo (str): Repository path in the format "owner/repo".
+        secret_name (str): The name of the secret to set.
+        secret_value (str): The value of the secret.
+
+    Returns:
+        None
+    """
+    secret_value = secret_value or ""
+    headers = {
+        "Authorization": f"token {token}",
+        "Accept": "application/vnd.github.v3+json"
+    }
+    response = requests.get(f"https://api.github.com/repos/{repo}/actions/secrets/public-key", headers=headers)
+    response.raise_for_status()
+    key = response.json()['key']
+    key_id = response.json()['key_id']
+    encrypted_value = encrypt(key, secret_value)
+    secret_url = f"https://api.github.com/repos/{repo}/actions/secrets/{secret_name}"
+    data = {"encrypted_value": encrypted_value, "key_id": key_id}
+    response = requests.put(secret_url, headers=headers, json=data)
+    response.raise_for_status()
+    logger.info(f"set {secret_name=}")
+
+def set_github_secrets() -> None:
+    """
+    Sets required AWS credentials and SSH private key as GitHub Secrets.
+
+    Returns:
+        None
+    """
+    # Set AWS secrets
+    set_github_secret(config.GITHUB_TOKEN, config.GITHUB_PATH, 'AWS_ACCESS_KEY_ID', config.AWS_ACCESS_KEY_ID)
+    set_github_secret(config.GITHUB_TOKEN, config.GITHUB_PATH, 'AWS_SECRET_ACCESS_KEY', config.AWS_SECRET_ACCESS_KEY)
+    set_github_secret(config.GITHUB_TOKEN, config.GITHUB_PATH, 'OPENAI_API_KEY', config.OPENAI_API_KEY)
+
+    # Read the SSH private key from the file
+    try:
+        with open(config.AWS_EC2_KEY_PATH, 'r') as key_file:
+            ssh_private_key = key_file.read()
+        set_github_secret(config.GITHUB_TOKEN, config.GITHUB_PATH, 'SSH_PRIVATE_KEY', ssh_private_key)
+    except IOError as e:
+        logger.error(f"Error reading SSH private key file: {e}")
+
+def create_key_pair(key_name: str = config.AWS_EC2_KEY_NAME, key_path: str = config.AWS_EC2_KEY_PATH) -> str | None:
+    """
+    Creates a new EC2 key pair and saves it to a file.
+
+    Args:
+        key_name (str): The name of the key pair to create. Defaults to config.AWS_EC2_KEY_NAME.
+        key_path (str): The path where the key file should be saved. Defaults to config.AWS_EC2_KEY_PATH.
+
+    Returns:
+        str | None: The name of the created key pair or None if an error occurred.
+    """
+    ec2_client = boto3.client('ec2', region_name=config.AWS_REGION)
+    try:
+        key_pair = ec2_client.create_key_pair(KeyName=key_name)
+        private_key = key_pair['KeyMaterial']
+
+        # Save the private key to a file
+        with open(key_path, "w") as key_file:
+            key_file.write(private_key)
+        os.chmod(key_path, 0o400)  # Set read-only permissions
+
+        logger.info(f"Key pair {key_name} created and saved to {key_path}")
+        return key_name
+    except ClientError as e:
+        logger.error(f"Error creating key pair: {e}")
+        return None
+
+def get_or_create_security_group_id(ports: list[int] = [22, 6092]) -> str | None:
+    """
+    Retrieves or creates a security group with the specified ports opened.
+
+    Args:
+        ports (list[int]): A list of ports to open in the security group. Defaults to [22, 6092].
+
+    Returns:
+        str | None: The ID of the security group, or None if an error occurred.
+    """
+    ec2 = boto3.client('ec2', region_name=config.AWS_REGION)
+
+    # Construct ip_permissions list
+    ip_permissions = [{
+        'IpProtocol': 'tcp',
+        'FromPort': port,
+        'ToPort': port,
+        'IpRanges': [{'CidrIp': '0.0.0.0/0'}]
+    } for port in ports]
+
+    try:
+        response = ec2.describe_security_groups(GroupNames=[config.AWS_EC2_SECURITY_GROUP])
+        security_group_id = response['SecurityGroups'][0]['GroupId']
+        logger.info(f"Security group '{config.AWS_EC2_SECURITY_GROUP}' already exists: {security_group_id}")
+
+        for ip_permission in ip_permissions:
+            try:
+                ec2.authorize_security_group_ingress(
+                    GroupId=security_group_id,
+                    IpPermissions=[ip_permission]
+                )
+                logger.info(f"Added inbound rule to allow TCP traffic on port {ip_permission['FromPort']} from any IP")
+            except ClientError as e:
+                if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
+                    logger.info(f"Rule for port {ip_permission['FromPort']} already exists")
+                else:
+                    logger.error(f"Error adding rule for port {ip_permission['FromPort']}: {e}")
+
+        return security_group_id
+    except ClientError as e:
+        if e.response['Error']['Code'] == 'InvalidGroup.NotFound':
+            try:
+                # Create the security group
+                response = ec2.create_security_group(
+                    GroupName=config.AWS_EC2_SECURITY_GROUP,
+                    Description='Security group for specified port access',
+                    TagSpecifications=[
+                        {
+                            'ResourceType': 'security-group',
+                            'Tags': [{'Key': 'Name', 'Value': config.PROJECT_NAME}]
+                        }
+                    ]
+                )
+                security_group_id = response['GroupId']
+                logger.info(f"Created security group '{config.AWS_EC2_SECURITY_GROUP}' with ID: {security_group_id}")
+
+                # Add rules for the given ports
+                ec2.authorize_security_group_ingress(GroupId=security_group_id, IpPermissions=ip_permissions)
+                logger.info(f"Added inbound rules to allow access on {ports=}")
+
+                return security_group_id
+            except ClientError as e:
+                logger.error(f"Error creating security group: {e}")
+                return None
+        else:
+            logger.error(f"Error describing security groups: {e}")
+            return None
+
+def deploy_ec2_instance(
+    ami: str = config.AWS_EC2_AMI,
+    instance_type: str = config.AWS_EC2_INSTANCE_TYPE,
+    project_name: str = config.PROJECT_NAME,
+    key_name: str = config.AWS_EC2_KEY_NAME,
+    disk_size: int = config.AWS_EC2_DISK_SIZE,
+) -> tuple[str | None, str | None]:
+    """
+    Deploys an EC2 instance with the specified parameters.
+
+    Args:
+        ami (str): The Amazon Machine Image ID to use for the instance. Defaults to config.AWS_EC2_AMI.
+        instance_type (str): The type of instance to deploy. Defaults to config.AWS_EC2_INSTANCE_TYPE.
+        project_name (str): The project name, used for tagging the instance. Defaults to config.PROJECT_NAME.
+        key_name (str): The name of the key pair to use for the instance. Defaults to config.AWS_EC2_KEY_NAME.
+        disk_size (int): The size of the disk in GB. Defaults to config.AWS_EC2_DISK_SIZE.
+
+    Returns:
+        tuple[str | None, str | None]: A tuple containing the instance ID and IP address, or None, None if deployment fails.
+    """
+    ec2 = boto3.resource('ec2')
+    ec2_client = boto3.client('ec2')
+
+    # Check if key pair exists, if not create one
+    try:
+        ec2_client.describe_key_pairs(KeyNames=[key_name])
+    except ClientError as e:
+        create_key_pair(key_name)
+
+    # Fetch the security group ID
+    security_group_id = get_or_create_security_group_id()
+    if not security_group_id:
+        logger.error("Unable to retrieve security group ID. Instance deployment aborted.")
+        return None, None
+
+    # Check for existing instances
+    instances = ec2.instances.filter(
+        Filters=[
+            {'Name': 'tag:Name', 'Values': [config.PROJECT_NAME]},
+            {'Name': 'instance-state-name', 'Values': ['running', 'pending', 'stopped']}
+        ]
+    )
+
+    for instance in instances:
+        if instance.state['Name'] == 'running':
+            logger.info(f"Instance already running: ID - {instance.id}, IP - {instance.public_ip_address}")
+            return instance.id, instance.public_ip_address
+        elif instance.state['Name'] == 'stopped':
+            logger.info(f"Starting existing stopped instance: ID - {instance.id}")
+            ec2_client.start_instances(InstanceIds=[instance.id])
+            instance.wait_until_running()
+            instance.reload()
+            logger.info(f"Instance started: ID - {instance.id}, IP - {instance.public_ip_address}")
+            return instance.id, instance.public_ip_address
+        elif state == 'pending':
+            logger.info(f"Instance is pending: ID - {instance.id}. Waiting for 'running' state.")
+            try:
+                instance.wait_until_running()  # Wait for the instance to be in 'running' state
+                instance.reload()  # Reload the instance attributes
+                logger.info(f"Instance is now running: ID - {instance.id}, IP - {instance.public_ip_address}")
+                return instance.id, instance.public_ip_address
+            except botocore.exceptions.WaiterError as e:
+                logger.error(f"Error waiting for instance to run: {e}")
+                return None, None
+    # Define EBS volume configuration
+    ebs_config = {
+        'DeviceName': '/dev/sda1',  # You may need to change this depending on the instance type and AMI
+        'Ebs': {
+            'VolumeSize': disk_size,
+            'VolumeType': 'gp3',  # Or other volume types like gp2, io1, etc.
+            'DeleteOnTermination': True  # Set to False if you want to keep the volume after instance termination
+        },
+    }
+
+    # Create a new instance if none exist
+    new_instance = ec2.create_instances(
+        ImageId=ami,
+        MinCount=1,
+        MaxCount=1,
+        InstanceType=instance_type,
+        KeyName=key_name,
+        SecurityGroupIds=[security_group_id],
+        BlockDeviceMappings=[ebs_config],
+        TagSpecifications=[
+            {
+                'ResourceType': 'instance',
+                'Tags': [{'Key': 'Name', 'Value': project_name}]
+            },
+        ]
+    )[0]
+
+    new_instance.wait_until_running()
+    new_instance.reload()
+    logger.info(f"New instance created: ID - {new_instance.id}, IP - {new_instance.public_ip_address}")
+    return new_instance.id, new_instance.public_ip_address
+
+def configure_ec2_instance(
+    instance_id: str | None = None,
+    instance_ip: str | None = None,
+    max_ssh_retries: int = 10,
+    ssh_retry_delay: int = 10,
+    max_cmd_retries: int = 10,
+    cmd_retry_delay: int = 30,
+) -> tuple[str | None, str | None]:
+    """
+    Configures the specified EC2 instance for Docker builds.
+
+    Args:
+        instance_id (str | None): The ID of the instance to configure. If None, a new instance will be deployed. Defaults to None.
+        instance_ip (str | None): The IP address of the instance. Must be provided if instance_id is manually passed. Defaults to None.
+        max_ssh_retries (int): Maximum number of SSH connection retries. Defaults to 10.
+        ssh_retry_delay (int): Delay between SSH connection retries in seconds. Defaults to 10.
+        max_cmd_retries (int): Maximum number of command execution retries. Defaults to 10.
+        cmd_retry_delay (int): Delay between command execution retries in seconds. Defaults to 30.
+
+    Returns:
+        tuple[str | None, str | None]: A tuple containing the instance ID and IP address, or None, None if configuration fails.
+    """
+    if not instance_id:
+        ec2_instance_id, ec2_instance_ip = deploy_ec2_instance()
+    else:
+        ec2_instance_id = instance_id
+        ec2_instance_ip = instance_ip  # Ensure instance IP is provided if instance_id is manually passed
+
+    key = paramiko.RSAKey.from_private_key_file(config.AWS_EC2_KEY_PATH)
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+
+    ssh_retries = 0
+    while ssh_retries < max_ssh_retries:
+        try:
+            ssh_client.connect(hostname=ec2_instance_ip, username='ubuntu', pkey=key)
+            break  # Successful SSH connection, break out of the loop
+        except Exception as e:
+            ssh_retries += 1
+            logger.error(f"SSH connection attempt {ssh_retries} failed: {e}")
+            if ssh_retries < max_ssh_retries:
+                logger.info(f"Retrying SSH connection in {ssh_retry_delay} seconds...")
+                time.sleep(ssh_retry_delay)
+            else:
+                logger.error("Maximum SSH connection attempts reached. Aborting.")
+                return
+
+    # Commands to set up the EC2 instance for Docker builds
+    commands = [
+        "sudo apt-get update",
+        "sudo apt-get install -y docker.io",
+        "sudo systemctl start docker",
+        "sudo systemctl enable docker",
+        "sudo usermod -a -G docker ${USER}",
+        "sudo curl -L \"https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)\" -o /usr/local/bin/docker-compose",
+        "sudo chmod +x /usr/local/bin/docker-compose",
+        "sudo ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose",
+    ]
+
+    for command in commands:
+        logger.info(f"Executing command: {command}")
+        cmd_retries = 0
+        while cmd_retries < max_cmd_retries:
+            stdin, stdout, stderr = ssh_client.exec_command(command)
+            exit_status = stdout.channel.recv_exit_status()  # Blocking call
+
+            if exit_status == 0:
+                logger.info(f"Command executed successfully")
+                break
+            else:
+                error_message = stderr.read()
+                if "Could not get lock" in str(error_message):
+                    cmd_retries += 1
+                    logger.warning(f"dpkg is locked, retrying command in {cmd_retry_delay} seconds... Attempt {cmd_retries}/{max_cmd_retries}")
+                    time.sleep(cmd_retry_delay)
+                else:
+                    logger.error(f"Error in command: {command}, Exit Status: {exit_status}, Error: {error_message}")
+                    break  # Non-dpkg lock error, break out of the loop
+
+    ssh_client.close()
+    return ec2_instance_id, ec2_instance_ip
+
+def generate_github_actions_workflow() -> None:
+    """
+    Generates and writes the GitHub Actions workflow file for Docker build on EC2.
+
+    Returns:
+        None
+    """
+    current_branch = get_current_git_branch()
+
+    _, host = deploy_ec2_instance()
+
+    # Set up Jinja2 environment
+    env = Environment(loader=FileSystemLoader('.'))
+    template = env.get_template('docker-build-ec2.yml.j2')
+
+    # Render the template with the current branch
+    rendered_workflow = template.render(
+        branch_name=current_branch,
+        host=host,
+        username=config.AWS_EC2_USER,
+        project_name=config.PROJECT_NAME,
+        github_path=config.GITHUB_PATH,
+        github_repo=config.GITHUB_REPO,
+    )
+
+    # Write the rendered workflow to a file
+    workflows_dir = '.github/workflows'
+    os.makedirs(workflows_dir, exist_ok=True)
+    with open(os.path.join(workflows_dir, 'docker-build-ec2.yml'), 'w') as file:
+        file.write("# Autogenerated via deploy.py, do not edit!\n\n")
+        file.write(rendered_workflow)
+    logger.info("GitHub Actions EC2 workflow file generated successfully.")
+
+def get_current_git_branch() -> str:
+    """
+    Retrieves the current active git branch name.
+
+    Returns:
+        str: The name of the current git branch.
+    """
+    repo = git.Repo(search_parent_directories=True)
+    branch = repo.active_branch.name
+    return branch
+
+def get_github_actions_url() -> str:
+    """
+    Get the GitHub Actions URL for the user's repository.
+
+    Returns:
+        str: The Github Actions URL
+    """
+    url = f"https://github.com/{config.GITHUB_OWNER}/{config.GITHUB_REPO}/actions"
+    return url
+
+def get_gradio_server_url(ip_address: str) -> str:
+    """
+    Get the Gradio server URL using the provided IP address.
+
+    Args:
+        ip_address (str): The IP address of the EC2 instance running the Gradio server.
+
+    Returns:
+        str: The Gradio server URL
+    """
+    url = f"http://{ip_address}:6092"  # TODO: make port configurable
+    return url
+
+def git_push_set_upstream(branch_name: str):
+    """
+    Pushes the current branch to the remote 'origin' and sets it to track the upstream branch.
+
+    Args:
+        branch_name (str): The name of the current branch to push.
+    """
+    try:
+        # Push the current branch and set the remote 'origin' as upstream
+        subprocess.run(["git", "push", "--set-upstream", "origin", branch_name], check=True)
+        logger.info(f"Branch '{branch_name}' pushed and set up to track 'origin/{branch_name}'.")
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to push branch '{branch_name}' to 'origin': {e}")
+
+def update_git_remote_with_pat(github_owner: str, repo_name: str, pat: str):
+    """
+    Updates the git remote 'origin' to include the Personal Access Token in the URL.
+
+    Args:
+        github_owner (str): GitHub repository owner.
+        repo_name (str): GitHub repository name.
+        pat (str): Personal Access Token with the necessary scopes.
+
+    """
+    new_origin_url = f"https://{github_owner}:{pat}@github.com/{github_owner}/{repo_name}.git"
+    try:
+        # Remove the existing 'origin' remote
+        subprocess.run(["git", "remote", "remove", "origin"], check=True)
+        # Add the new 'origin' with the PAT in the URL
+        subprocess.run(["git", "remote", "add", "origin", new_origin_url], check=True)
+        logger.info("Git remote 'origin' updated successfully.")
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to update git remote 'origin': {e}")
+
+class Deploy:
+    """Deployment commands implemented as static methods: start, pause, stop and status."""
+
+    @staticmethod
+    def start() -> None:
+        """
+        Main method to execute the deployment process.
+
+        Calls set_github_secrets(), configure_ec2_instance() and
+        generate_github_actions_workflow(), points 'origin' at a URL containing
+        the PAT, then commits the workflow file and pushes the current branch.
+        A CalledProcessError from `git add`/`git commit` is logged, not raised,
+        so the closing log lines are emitted either way.
+        """
+        set_github_secrets()
+        _instance_id, instance_ip = configure_ec2_instance()
+        assert instance_ip, f"invalid {instance_ip=}"
+        generate_github_actions_workflow()
+
+        # Update the Git remote configuration to include the PAT
+        update_git_remote_with_pat(config.GITHUB_OWNER, config.GITHUB_REPO, config.GITHUB_TOKEN)
+
+        # Add, commit, and push the workflow file changes, setting the upstream branch
+        try:
+            subprocess.run(["git", "add", ".github/workflows/docker-build-ec2.yml"], check=True)
+            # No shell parses these arguments, so the message must not carry its own
+            # quote characters: they would end up literally in the commit subject.
+            subprocess.run(["git", "commit", "-m", "add workflow file"], check=True)
+            current_branch = get_current_git_branch()
+            git_push_set_upstream(current_branch)
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Failed to commit or push changes: {e}")
+
+        github_actions_url = get_github_actions_url()
+        gradio_server_url = get_gradio_server_url(instance_ip)
+        logger.info("Deployment process completed.")
+        logger.info(f"Check the GitHub Actions at {github_actions_url}.")
+        logger.info("Once the action is complete, run:")
+        logger.info(f"    python client.py {gradio_server_url}")
+
+    @staticmethod
+    def pause(project_name: str = config.PROJECT_NAME) -> None:
+        """
+        Stops every running EC2 instance whose 'Name' tag equals the project name.
+
+        Only instances in the 'running' state are selected; each one is stopped
+        (not terminated) after its ID has been logged.
+
+        Args:
+            project_name (str): The project name used to tag the instance. Defaults to config.PROJECT_NAME.
+        """
+        ec2 = boto3.resource('ec2')
+
+        instances = ec2.instances.filter(
+            Filters=[
+                {'Name': 'tag:Name', 'Values': [project_name]},
+                {'Name': 'instance-state-name', 'Values': ['running']}
+            ]
+        )
+
+        for instance in instances:
+            logger.info(f"Shutting down instance: ID - {instance.id}")
+            instance.stop()
+
+    @staticmethod
+    def stop(
+        project_name: str = config.PROJECT_NAME,
+        security_group_name: str = config.AWS_EC2_SECURITY_GROUP,
+    ) -> None:
+        """
+        Terminates the EC2 instance and deletes the associated security group.
+
+        Instances are terminated one at a time, waiting for each to finish. A
+        ClientError while deleting the security group is logged, not raised.
+
+        Args:
+            project_name (str): The project name used to tag the instance. Defaults to config.PROJECT_NAME.
+            security_group_name (str): The name of the security group to delete. Defaults to config.AWS_EC2_SECURITY_GROUP.
+        """
+        ec2_resource = boto3.resource('ec2')
+        ec2_client = boto3.client('ec2')
+
+        # Terminate EC2 instances
+        instances = ec2_resource.instances.filter(
+            Filters=[
+                {'Name': 'tag:Name', 'Values': [project_name]},
+                {'Name': 'instance-state-name', 'Values': ['pending', 'running', 'shutting-down', 'stopped', 'stopping']}
+            ]
+        )
+
+        for instance in instances:
+            logger.info(f"Terminating instance: ID - {instance.id}")
+            instance.terminate()
+            instance.wait_until_terminated()
+            logger.info(f"Instance {instance.id} terminated successfully.")
+
+        # Delete security group
+        try:
+            ec2_client.delete_security_group(GroupName=security_group_name)
+            logger.info(f"Deleted security group: {security_group_name}")
+        except ClientError as e:
+            if e.response['Error']['Code'] == 'InvalidGroup.NotFound':
+                logger.info(f"Security group {security_group_name} does not exist or already deleted.")
+            else:
+                logger.error(f"Error deleting security group: {e}")
+
+    @staticmethod
+    def status() -> None:
+        """
+        Lists all EC2 instances tagged with the project name.
+
+        Logs the ID and state of every instance whose 'Name' tag equals
+        config.PROJECT_NAME, regardless of state.
+        """
+        ec2 = boto3.resource('ec2')
+
+        instances = ec2.instances.filter(
+            Filters=[{'Name': 'tag:Name', 'Values': [config.PROJECT_NAME]}]
+        )
+
+        for instance in instances:
+            logger.info(f"Instance ID: {instance.id}, State: {instance.state['Name']}")
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    fire.Fire(Deploy)  # hand the Deploy class to python-fire, which builds the command line from it
diff --git a/deploy_requirements.txt b/deploy_requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1eca646406c62a471135cbe5ec88e0805cbde22d
--- /dev/null
+++ b/deploy_requirements.txt
@@ -0,0 +1,9 @@
+boto3==1.34.18
+fire==0.5.0
+gitpython==3.1.41
+jinja2==3.1.3
+loguru==0.7.2
+paramiko==3.4.0
+pydantic_settings==2.1.0
+pynacl==1.5.0
+requests==2.31.0
diff --git a/docker-build-ec2.yml.j2 b/docker-build-ec2.yml.j2
new file mode 100644
index 0000000000000000000000000000000000000000..0a02602d0f831960db32ee07a165f3c8264edbc0
--- /dev/null
+++ b/docker-build-ec2.yml.j2
@@ -0,0 +1,44 @@
+name: Docker Build on EC2 Instance
+{# Jinja2 template for the docker-build-ec2.yml workflow. GitHub's own dollar-brace expressions sit inside raw blocks so that Jinja leaves them for GitHub to evaluate. #}
+on:
+  push:
+    branches:
+      - {{ branch_name }}
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: SSH and Execute Build on EC2
+        uses: appleboy/ssh-action@master
+        with:
+          command_timeout: "60m"
+          host: {{ host }}
+          username: {{ username }}  # Usually 'ubuntu' or 'ec2-user'
+          {% raw %}
+          key: ${{ secrets.SSH_PRIVATE_KEY }}
+          {% endraw %}
+          script: |
+            source activate pytorch
+            nvidia-smi
+
+            rm -rf {{ github_repo }} || true
+            git clone https://github.com/{{ github_path }}
+            cd {{ github_repo }}
+            git checkout {{ branch_name }}
+            git pull
+
+            # Stop and remove existing container if it's running
+            sudo docker stop {{ project_name }}-container || true
+            sudo docker rm {{ project_name }}-container || true
+
+            # Build the image
+            sudo nvidia-docker build -t {{ project_name }} . || exit 1
+
+            # Run the image (the key is single-quoted so the remote shell passes it through verbatim)
+            sudo docker run -d -p 6092:6092 --gpus all --name {{ project_name }}-container \
+              -e OPENAI_API_KEY='{% raw %}${{ secrets.OPENAI_API_KEY }}{% endraw %}' \
+              {{ project_name }}
diff --git a/download_ckpt.sh b/download_ckpt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..146fcea2eaf305a5dc91aa258a5e627b3336cbf1
--- /dev/null
+++ b/download_ckpt.sh
@@ -0,0 +1,3 @@
+wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth  # checkpoint from the Semantic-SAM releases
+wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt  # checkpoint from the xdecoder/SEEM repository
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth  # checkpoint from the segment_anything downloads
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c42a4e55ed576eb73b6db42aa864f9cc872733e2
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Launch the GPT-4V demo when OPENAI_API_KEY is set and non-empty;
+# otherwise launch the plain SoM demo.
+demo=./demo_som.py
+if [ -n "$OPENAI_API_KEY" ]; then
+    demo=./demo_gpt4v_som.py
+fi
+
+python "$demo"
diff --git a/examples/gpt-4v-som-example.jpg b/examples/gpt-4v-som-example.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..01c16c4a354257de526902313648b461138ca71c
Binary files /dev/null and b/examples/gpt-4v-som-example.jpg differ
diff --git a/examples/ironing_man.jpg b/examples/ironing_man.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f3bb120c6a7b60646c4b5c46da881eb22e879798
Binary files /dev/null and b/examples/ironing_man.jpg differ
diff --git a/examples/ironing_man_som.png b/examples/ironing_man_som.png
new file mode 100644
index 0000000000000000000000000000000000000000..9351a504e29af1ab1e6845394a5ec6e089f2f967
Binary files /dev/null and b/examples/ironing_man_som.png differ
diff --git a/examples/som_logo.png b/examples/som_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..8deb5af7087a39ceb39541cd98aa16e7681a8188
Binary files /dev/null and b/examples/som_logo.png differ
diff --git a/gpt4v.py b/gpt4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..1da28c395849fe3147491fdbe1d0b620e9e93b2b
--- /dev/null
+++ b/gpt4v.py
@@ -0,0 +1,69 @@
+import os
+import base64
+import requests
+from io import BytesIO
+
+# Read the OpenAI API key from the environment at import time; os.environ[...] raises KeyError if it is missing
+api_key = os.environ["OPENAI_API_KEY"]
+headers = {  # HTTP headers that request_gpt4v sends with each request
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {api_key}"
+}
+# Text that prepare_inputs places in the system message
+metaprompt = '''
+- For any marks mentioned in your answer, please highlight them with [].
+'''    
+
+def encode_image_from_file(image_path):
+    """Return the contents of the file at `image_path` as a base64-encoded string."""
+    with open(image_path, "rb") as fh:
+        return str(base64.b64encode(fh.read()), 'utf-8')
+
+def encode_image_from_pil(image):  # returns `image` encoded as JPEG, as base64 text
+    buffered = BytesIO()  # Pillow cannot write modes such as RGBA or P as JPEG; those are converted to RGB
+    (image if image.mode in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr") else image.convert("RGB")).save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+def prepare_inputs(message, image, model="gpt-4-vision-preview", max_tokens=800):
+    """
+    Build the chat-completions request payload for one text + image query.
+
+    Args:
+        message: Text placed in the user message.
+        image: Image handed to encode_image_from_pil and embedded as a JPEG data URL.
+        model: Model name written to the payload; defaults to the value that
+            was previously hard-coded.
+        max_tokens: Value of the payload's "max_tokens" field; defaults to the
+            previously hard-coded 800.
+
+    Returns:
+        dict: Payload whose system message is `metaprompt`.
+    """
+    base64_image = encode_image_from_pil(image)
+    # The image is always sent as JPEG, hence the fixed image/jpeg MIME type.
+    user_content = [
+        {"type": "text", "text": message},
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{base64_image}",
+            },
+        },
+    ]
+
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": [metaprompt]},
+            {"role": "user", "content": user_content},
+        ],
+        "max_tokens": max_tokens,
+    }
+
+    return payload
+
+def request_gpt4v(message, image, timeout=120):
+    """POST the query to the chat-completions endpoint and return the reply text; raises requests.HTTPError on a 4xx/5xx response."""
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=prepare_inputs(message, image), timeout=timeout)
+    response.raise_for_status()  # surface API errors directly instead of a KeyError on 'choices' below
+    return response.json()['choices'][0]['message']['content']
diff --git a/ops/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb b/ops/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
new file mode 100644
index 0000000000000000000000000000000000000000..ad19a31c5db9edc62429d7bf7f4f3b27a566bce8
--- /dev/null
+++ b/ops/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94c5ab8de3d38ecca4608fdcdf07ef05d1459cf1a8871d65d4ae92621afef9e4
+size 3181876424
diff --git a/ops/functions/__init__.py b/ops/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b06b5ac538b63bdb9a6c82e4635b95bb5491d5b
--- /dev/null
+++ b/ops/functions/__init__.py
@@ -0,0 +1,13 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from .ms_deform_attn_func import MSDeformAttnFunction
+
diff --git a/ops/functions/ms_deform_attn_func.py b/ops/functions/ms_deform_attn_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..94a36ab85b7c5f9ecee342db91a5d5731740740f
--- /dev/null
+++ b/ops/functions/ms_deform_attn_func.py
@@ -0,0 +1,72 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+try:
+    import MultiScaleDeformableAttention as MSDA
+except ModuleNotFoundError as e:  # the extension has not been built: say how to build it in this repo
+    info_string = (
+        "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
+        "\t`cd ops`\n"
+        "\t`bash make.sh`\n"
+    )
+    raise ModuleNotFoundError(info_string) from e
+
+
+class MSDeformAttnFunction(Function):
+    """Autograd Function that delegates forward and backward to the MSDA extension module."""
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        tensors = (value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        ctx.im2col_step = im2col_step
+        result = MSDA.ms_deform_attn_forward(*tensors, im2col_step)
+        ctx.save_for_backward(*tensors)
+        return result
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        # Gradients are returned only for value, sampling_locations and
+        # attention_weights; the other three forward inputs get None.
+        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(
+            *ctx.saved_tensors, grad_output, ctx.im2col_step)
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    """
+    Pure-PyTorch multi-scale deformable attention built on F.grid_sample.
+    For debug and test only; the CUDA version should be used instead.
+    """
+    N_, _, _, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    per_level_values = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    grids = 2 * sampling_locations - 1  # rescale so that 0 -> -1 and 1 -> +1, as grid_sample expects
+    sampled = []
+    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        level_value = per_level_values[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        level_grid = grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampled.append(F.grid_sample(level_value, level_grid, mode='bilinear', padding_mode='zeros', align_corners=False))
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
+    weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
+    weighted = torch.stack(sampled, dim=-2).flatten(-2) * weights
+    return weighted.sum(-1).view(N_, M_*D_, Lq_).transpose(1, 2).contiguous()
diff --git a/ops/make.sh b/ops/make.sh
new file mode 100644
index 0000000000000000000000000000000000000000..977811896de2756e7f5aacbc18c6f3882b69120b
--- /dev/null
+++ b/ops/make.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+# Modified by Richard Abrich from https://github.com/OpenAdaptAI/OpenAdapt
+
+# from https://github.com/pytorch/extension-cpp/issues/71#issuecomment-1778326052
+# Pick TORCH_CUDA_ARCH_LIST (the GPU architectures to compile for) from the
+# release number that nvcc reports. Patterns are tried top to bottom, so the
+# 11.0 entry must stay ahead of the generic 11.* one.
+CUDA_VERSION=$(/usr/local/cuda/bin/nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
+case "${CUDA_VERSION}" in
+    9.0*)  export TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;7.0+PTX" ;;
+    9.2*)  export TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0+PTX" ;;
+    10.*)  export TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5+PTX" ;;
+    11.0*) export TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0+PTX" ;;
+    11.*)  export TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" ;;
+    12.*)  export TORCH_CUDA_ARCH_LIST="5.0;5.2;5.3;6.0;6.1;6.2;7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX" ;;
+    *)
+        echo "unsupported cuda version."
+        exit 1
+        ;;
+esac
+
+# detectron2 is installed first, then this extension is built and installed.
+python -m pip install git+https://github.com/facebookresearch/detectron2.git
+
+python setup.py build install
diff --git a/ops/modules/__init__.py b/ops/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fdbf03359958f3d67ab00f879bf6b61a6c8f06a
--- /dev/null
+++ b/ops/modules/__init__.py
@@ -0,0 +1,12 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from .ms_deform_attn import MSDeformAttn
diff --git a/ops/modules/ms_deform_attn.py b/ops/modules/ms_deform_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7b4c42ea504a0859ccadd72646919c941e72f73
--- /dev/null
+++ b/ops/modules/ms_deform_attn.py
@@ -0,0 +1,125 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+import math
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+
+from ..functions import MSDeformAttnFunction
+from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
+
+
+def _is_power_of_2(n):
+    if not (isinstance(n, int) and n >= 0):  # reject non-integers and negative values
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return n != 0 and n & (n - 1) == 0  # a power of two has exactly one bit set
+
+
+class MSDeformAttn(nn.Module):
+    """Multi-scale deformable attention layer; see __init__ and forward for the parameters and tensor shapes."""
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+
+        self.im2col_step = 128  # forwarded unchanged to MSDeformAttnFunction.apply in forward()
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)  # an (x, y) offset per head/level/point
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)  # one weight per head/level/point
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)  # zero weights: initial offsets come from the bias alone
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)  # one angle per head
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1  # the i-th point is placed (i + 1) steps along the head's direction
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        r"""
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        try:
+            output = MSDeformAttnFunction.apply(
+                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        except Exception:
+            # The extension op failed (the original comment here read "CPU"): use the pure-PyTorch path.
+            # `Exception` rather than a bare `except` so KeyboardInterrupt/SystemExit propagate.
+            output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
+        output = self.output_proj(output)
+        return output
diff --git a/ops/setup.py b/ops/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b57ad313ac8f9b6586892142da8ba943e516cec
--- /dev/null
+++ b/ops/setup.py
@@ -0,0 +1,78 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+import os
+import glob
+
+import torch
+
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+
+from setuptools import find_packages
+from setuptools import setup
+
+requirements = ["torch", "torchvision"]  # NOTE: not passed to the setup() call in this file
+
+def get_extensions():
+    """
+    Describe the MultiScaleDeformableAttention extension for setuptools.
+
+    Gathers the C++ sources in src/ and src/cpu/ and the CUDA sources in
+    src/cuda/, and returns a one-element list holding a CUDAExtension.
+    Raises NotImplementedError if CUDA_HOME is None, or if neither FORCE_CUDA
+    is set nor torch.cuda.is_available() is true.
+    """
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
+
+    # A CUDA toolchain is mandatory: bail out early when it is missing.
+    if CUDA_HOME is None:
+        raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
+    # Force cuda since torch ask for a device, not if cuda is in fact available.
+    if not (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()):
+        raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
+
+    # Extra preprocessor defines passed to nvcc.
+    nvcc_flags = [
+        "-DCUDA_HAS_FP16=1",
+        "-D__CUDA_NO_HALF_OPERATORS__",
+        "-D__CUDA_NO_HALF_CONVERSIONS__",
+        "-D__CUDA_NO_HALF2_OPERATORS__",
+    ]
+    # this_dir is absolute, so these joins leave the globbed paths unchanged.
+    sources = [os.path.join(extensions_dir, s) for s in main_file + source_cpu + source_cuda]
+
+    return [
+        CUDAExtension(
+            "MultiScaleDeformableAttention",
+            sources,
+            include_dirs=[extensions_dir],
+            define_macros=[("WITH_CUDA", None)],
+            extra_compile_args={"cxx": [], "nvcc": nvcc_flags},
+        )
+    ]
+
+setup(
+    name="MultiScaleDeformableAttention",  # same name as the extension module declared in get_extensions()
+    version="1.0",
+    author="Weijie Su",
+    url="https://github.com/fundamentalvision/Deformable-DETR",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),  # get_extensions() raises NotImplementedError when CUDA is not usable
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/ops/src/cpu/ms_deform_attn_cpu.cpp b/ops/src/cpu/ms_deform_attn_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48757e2b0156b2c1513b615d2a17e5aee5172ae7
--- /dev/null
+++ b/ops/src/cpu/ms_deform_attn_cpu.cpp
@@ -0,0 +1,46 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+at::Tensor
+ms_deform_attn_cpu_forward(  // CPU entry point for the forward pass; it is a stub
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");  // unconditionally raises; none of the arguments is read
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(  // CPU entry point for the backward pass; it is a stub
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");  // unconditionally raises; none of the arguments is read
+}
+
diff --git a/ops/src/cpu/ms_deform_attn_cpu.h b/ops/src/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..51bb27e9ee828f967e8aa854c2d55574040c6d7e
--- /dev/null
+++ b/ops/src/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,38 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor
+ms_deform_attn_cpu_forward(  // CPU forward entry point; its definition in ms_deform_attn_cpu.cpp always raises an error.
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);  // Parameter list mirrors ms_deform_attn_cuda_forward.
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(  // CPU backward entry point; its definition in ms_deform_attn_cpu.cpp always raises an error.
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);  // Parameter list mirrors ms_deform_attn_cuda_backward.
+
+
diff --git a/ops/src/cuda/ms_deform_attn_cuda.cu b/ops/src/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0c465dab3d636dfd6a44523c63f148b6e15084d9
--- /dev/null
+++ b/ops/src/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,158 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,              // (batch, spatial_size, num_heads, channels)
+    const at::Tensor &spatial_shapes,     // int64, one (height, width) row per level
+    const at::Tensor &level_start_index,  // int64, first row of each level inside spatial_size
+    const at::Tensor &sampling_loc,       // (batch, num_query, num_heads, num_levels, num_point, 2)
+    const at::Tensor &attn_weight,        // (batch, num_query, num_heads, num_levels, num_point)
+    const int im2col_step)                // upper bound on batch items per kernel launch
+{
+    // Forward pass: returns a (batch, num_query, num_heads * channels) tensor. The kernel is
+    // handed raw pointers, so every input has to be a contiguous CUDA tensor.
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    // Clamp the step to the batch size, then reject a zero or negative step (empty batch or
+    // non-positive im2col_step) before it is used as a divisor.
+    const int im2col_step_ = std::min(batch, im2col_step);
+    TORCH_CHECK(im2col_step_ > 0, "batch(", batch, ") and im2col_step(", im2col_step, ") must both be positive");
+    TORCH_CHECK(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+    // Element counts of one batch item, used to advance the raw pointers chunk by chunk.
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data_ptr<int64_t>(),
+                level_start_index.data_ptr<int64_t>(),
+                sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data_ptr<scalar_t>());
+        }));
+    }
+
+    return output.view({batch, num_query, num_heads*channels});
+}
+
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,              // (batch, spatial_size, num_heads, channels)
+    const at::Tensor &spatial_shapes,     // int64, one (height, width) row per level
+    const at::Tensor &level_start_index,  // int64, first row of each level inside spatial_size
+    const at::Tensor &sampling_loc,       // (batch, num_query, num_heads, num_levels, num_point, 2)
+    const at::Tensor &attn_weight,        // (batch, num_query, num_heads, num_levels, num_point)
+    const at::Tensor &grad_output,        // contiguous, batch * num_query * num_heads * channels elements
+    const int im2col_step)                // upper bound on batch items per kernel launch
+{
+    // Backward pass: returns {grad_value, grad_sampling_loc, grad_attn_weight}, each shaped like
+    // the matching forward input. The kernel is handed raw pointers, so every input has to be a
+    // contiguous CUDA tensor.
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+    TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    // Clamp the step to the batch size, then reject a zero or negative step (empty batch or
+    // non-positive im2col_step) before it is used as a divisor.
+    const int im2col_step_ = std::min(batch, im2col_step);
+    TORCH_CHECK(im2col_step_ > 0, "batch(", batch, ") and im2col_step(", im2col_step, ") must both be positive");
+    TORCH_CHECK(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    // Element counts of one batch item, used to advance the raw pointers chunk by chunk.
+    const int batch_n = im2col_step_;
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data_ptr<scalar_t>(),
+                                    value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data_ptr<int64_t>(),
+                                    level_start_index.data_ptr<int64_t>(),
+                                    sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data_ptr<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+        }));
+    }
+
+    return {grad_value, grad_sampling_loc, grad_attn_weight};
+}
\ No newline at end of file
diff --git a/ops/src/cuda/ms_deform_attn_cuda.h b/ops/src/cuda/ms_deform_attn_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f0658e8668a11f0e7d71deff9adac71884f2e87
--- /dev/null
+++ b/ops/src/cuda/ms_deform_attn_cuda.h
@@ -0,0 +1,35 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor ms_deform_attn_cuda_forward(  // CUDA forward pass; the definition returns a (batch, num_query, num_heads * channels) tensor.
+    const at::Tensor &value,   // read as (batch, spatial_size, num_heads, channels)
+    const at::Tensor &spatial_shapes,  // int64; size(0) is the number of levels
+    const at::Tensor &level_start_index,  // int64; read at one index per level
+    const at::Tensor &sampling_loc,  // size(1) is num_query, size(4) is num_point
+    const at::Tensor &attn_weight,  // accessed with the same element type as value
+    const int im2col_step);  // clamped to the batch size; the batch must be a multiple of the result
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(  // CUDA backward pass; the definition returns {grad_value, grad_sampling_loc, grad_attn_weight}.
+    const at::Tensor &value,   // read as (batch, spatial_size, num_heads, channels)
+    const at::Tensor &spatial_shapes,  // int64; size(0) is the number of levels
+    const at::Tensor &level_start_index,  // int64; read at one index per level
+    const at::Tensor &sampling_loc,  // size(1) is num_query, size(4) is num_point
+    const at::Tensor &attn_weight,  // accessed with the same element type as value
+    const at::Tensor &grad_output,  // viewed as (batch / step, step, num_query, num_heads, channels)
+    const int im2col_step);  // clamped to the batch size; the batch must be a multiple of the result
+
diff --git a/ops/src/cuda/ms_deform_im2col_cuda.cuh b/ops/src/cuda/ms_deform_im2col_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c04e0d4ab97d25c1756fcd8d08dd1e5a6d280b7c
--- /dev/null
+++ b/ops/src/cuda/ms_deform_im2col_cuda.cuh
@@ -0,0 +1,1332 @@
+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THCAtomics.cuh>
+
+#define CUDA_KERNEL_LOOP(i, n) /* grid-stride loop: i walks this thread's share of [0, n) */ \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   /* start at the global thread id */ \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)  /* step by the number of threads in the grid; i is a plain int */
+
+const int CUDA_NUM_THREADS = 1024;
+// Number of num_threads-sized blocks needed to cover N work items (ceiling division for N >= 0).
+inline int GET_BLOCKS(const int N, const int num_threads) {
+  return (num_threads + N - 1) / num_threads;
+}
+
+
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  // Bilinearly interpolates the value of head m, channel c at the fractional pixel position
+  // (h, w) of a height x width map whose elements are indexed as
+  // (row * width + col) * nheads * channels + m * channels + c.
+  // Corners lying outside the map contribute zero.
+  const int row_lo = floor(h);
+  const int col_lo = floor(w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // Distance of (h, w) from the low corner (frac_*) and from the high corner (rest_*).
+  const scalar_t frac_h = h - row_lo;
+  const scalar_t frac_w = w - col_lo;
+  const scalar_t rest_h = 1 - frac_h;
+  const scalar_t rest_w = 1 - frac_w;
+
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int row_lo_off = row_lo * row_stride;
+  const int row_hi_off = row_lo_off + row_stride;
+  const int col_lo_off = col_lo * col_stride;
+  const int col_hi_off = col_lo_off + col_stride;
+  const int lane_off = m * channels + c;
+
+  // Only the lower bound is tested for the low corner and only the upper bound for the high one.
+  const bool row_lo_ok = row_lo >= 0;
+  const bool col_lo_ok = col_lo >= 0;
+  const bool row_hi_ok = row_hi <= height - 1;
+  const bool col_hi_ok = col_hi <= width - 1;
+
+  scalar_t v_ll = 0, v_lh = 0, v_hl = 0, v_hh = 0;
+  if (row_lo_ok && col_lo_ok)
+    v_ll = bottom_data[row_lo_off + col_lo_off + lane_off];
+  if (row_lo_ok && col_hi_ok)
+    v_lh = bottom_data[row_lo_off + col_hi_off + lane_off];
+  if (row_hi_ok && col_lo_ok)
+    v_hl = bottom_data[row_hi_off + col_lo_off + lane_off];
+  if (row_hi_ok && col_hi_ok)
+    v_hh = bottom_data[row_hi_off + col_hi_off + lane_off];
+
+  // Each corner is weighted by its closeness to (h, w) along both axes.
+  const scalar_t wt_ll = rest_h * rest_w;
+  const scalar_t wt_lh = rest_h * frac_w;
+  const scalar_t wt_hl = frac_h * rest_w;
+  const scalar_t wt_hh = frac_h * frac_w;
+
+  return (wt_ll * v_ll + wt_lh * v_lh + wt_hl * v_hl + wt_hh * v_hh);
+}
+
+
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,      // incoming gradient for the output element this sample feeds
+                                                   const scalar_t &attn_weight,   // attention weight of this sample
+                                                   scalar_t* &grad_value,         // gradient buffer indexed like bottom_data; updated with atomicAdd
+                                                   scalar_t* grad_sampling_loc,   // out: [0] gradient along w, [1] gradient along h (plain stores)
+                                                   scalar_t* grad_attn_weight)    // out: gradient of the attention weight (plain store)
+{  // Backward of one bilinear sample at (h, w) for head m, channel c; outputs are overwritten, not accumulated.
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;  // fractional offsets from the low corner
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;  // elements per pixel
+  const int h_stride = width * w_stride;   // elements per row
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of the four corners
+  const scalar_t top_grad_value = top_grad * attn_weight;  // gradient reaching the interpolated value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // d(interpolated value)/dh and /dw, built up corner by corner
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)  // corner (h_low, w_low)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);  // this corner's share of the value gradient
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)  // corner (h_low, w_high)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)  // corner (h_high, w_low)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)  // corner (h_high, w_high)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);  // interpolated value, before attention weighting
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;  // width/height factors: the calling kernels form w and h as loc * size - 0.5
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+
+
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,      // incoming gradient for the output element this sample feeds
+                                                   const scalar_t &attn_weight,   // attention weight of this sample
+                                                   scalar_t* &grad_value,         // gradient buffer indexed like bottom_data; updated with atomicAdd
+                                                   scalar_t* grad_sampling_loc,   // in/out: [0] gradient along w, [1] gradient along h (atomicAdd)
+                                                   scalar_t* grad_attn_weight)    // in/out: gradient of the attention weight (atomicAdd)
+{  // Backward of one bilinear sample at (h, w) for head m, channel c; every output is accumulated with atomicAdd.
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;  // fractional offsets from the low corner
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;  // elements per pixel
+  const int h_stride = width * w_stride;   // elements per row
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of the four corners
+  const scalar_t top_grad_value = top_grad * attn_weight;  // gradient reaching the interpolated value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // d(interpolated value)/dh and /dw, built up corner by corner
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)  // corner (h_low, w_low)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);  // this corner's share of the value gradient
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)  // corner (h_low, w_high)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)  // corner (h_high, w_low)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)  // corner (h_high, w_high)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);  // interpolated value, before attention weighting
+  atomicAdd(grad_attn_weight, top_grad * val); 
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);  // width/height factors: the calling kernels form w and h as loc * size - 0.5
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
+
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value, 
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  // Forward kernel: each loop index produces one output element data_col[index], the
+  // attention-weighted sum of bilinear samples over all levels and points.
+  // batch_size is accepted but not read here.
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // index == ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = index % channels;
+    const int sampling_index = index / channels;
+    const int m_col = sampling_index % num_heads;
+    const int b_col = sampling_index / num_heads / num_query;
+
+    // Cursors into data_attn_weight and data_sampling_loc (two coordinates per weight).
+    int weight_idx = sampling_index * num_levels * num_point;
+    int loc_idx = weight_idx << 1;
+    const int qid_stride = num_heads * channels;
+    const int batch_value_offset = b_col * spatial_size * qid_stride;
+    scalar_t acc = 0;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col)
+    {
+      // Start row and (height, width) of level l_col.
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h = data_spatial_shapes[l_col << 1];
+      const int spatial_w = data_spatial_shapes[(l_col << 1) + 1];
+      const scalar_t *level_value = data_value + (batch_value_offset + level_start_id * qid_stride);
+
+      for (int p_col = 0; p_col < num_point; ++p_col, ++weight_idx, loc_idx += 2)
+      {
+        const scalar_t loc_w = data_sampling_loc[loc_idx];
+        const scalar_t loc_h = data_sampling_loc[loc_idx + 1];
+        const scalar_t weight = data_attn_weight[weight_idx];
+
+        // Scale the location by the level size and shift it by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        // Positions outside the open range (-1, size) on either axis add nothing.
+        const bool in_range = h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w;
+        if (in_range)
+        {
+          acc += ms_deform_attn_im2col_bilinear(level_value, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+      }
+    }
+    data_col[index] = acc;
+  }
+}
+
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,  // incoming gradient, one element per loop index
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,  // not read in this kernel
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // out: accumulated with atomicAdd by the bilinear helper
+                                                scalar_t *grad_sampling_loc,  // out: stored by thread 0 of each block
+                                                scalar_t *grad_attn_weight)  // out: stored by thread 0 of each block
+{  // Backward kernel: per-thread gradients are staged in shared memory and summed serially by thread 0.
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // two slots (w, h) per thread
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // index == ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not used below
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter itself, so the offset would compound if a
+    grad_attn_weight += grad_sampling_ptr;  // thread ran a second loop iteration; presumably each thread gets one index - confirm at the launcher.
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so a skipped sample contributes zero
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread of the block must reach this barrier; presumably n is a multiple of blockDim.x - TODO confirm at the launcher.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockSize; ++tid)  // this tid shadows the outer one; sums all blockSize slots
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          // Plain stores: presumably the whole block shares one sampling_index
+          // (blockDim.x == blockSize == channels) - TODO confirm at the launcher.
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // keep the slots intact until thread 0 has finished reading them
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,  // incoming gradient, one element per loop index
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,  // not read in this kernel
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // out: accumulated with atomicAdd by the bilinear helper
+                                                scalar_t *grad_sampling_loc,  // out: stored by thread 0 of each block
+                                                scalar_t *grad_attn_weight)  // out: stored by thread 0 of each block
+{  // Backward kernel: per-thread gradients are staged in shared memory and combined by a pairwise halving reduction.
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // two slots (w, h) per thread
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // index == ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not used below
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter itself, so the offset would compound if a
+    grad_attn_weight += grad_sampling_ptr;  // thread ran a second loop iteration; presumably each thread gets one index - confirm at the launcher.
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so a skipped sample contributes zero
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread of the block must reach the barriers below; presumably n is a multiple of blockDim.x - TODO confirm at the launcher.
+        __syncthreads();
+        // Halving reduction: slot tid absorbs slot tid + s. Every slot is summed only when blockSize is a power of two.
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();  // finish this round before the next one reads the partial sums
+        }
+        // Plain stores: presumably the whole block shares one sampling_index (blockDim.x == blockSize == channels) - TODO confirm at the launcher.
+        if (tid == 0)
+        { 
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();  // keep slot 0 intact until thread 0 has stored it
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+// Backward kernel of multi-scale deformable attention; per-thread gradient
+// partials are combined through dynamic shared memory, with thread 0 summing
+// them serially.
+//
+// Every loop index addresses one (batch, query, head, channel) element of
+// grad_col. For each (level, point) sample the thread owns two shared-memory
+// slots for the sampling-location gradient and one for the attention-weight
+// gradient; thread 0 then adds up all blockDim.x partials in index order and
+// stores the totals to grad_sampling_loc / grad_attn_weight.
+//
+// Launch requirement: 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic
+// shared memory.
+// NOTE(review): the plain (non-atomic) stores assume that all threads of a
+// block belong to the same (batch, query, head) and that no other block
+// writes the same sample -- confirm against the launcher.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Shared-memory layout: [2 * blockDim.x location partials | blockDim.x weight partials].
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    const unsigned int tid = threadIdx.x;
+
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are derived from the unmodified kernel arguments on every
+    // pass of the loop. Advancing the arguments themselves would make a second
+    // pass by the same thread start from an already-shifted base address.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        // Scale the stored location by the level's size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        // Clear this thread's slots so samples outside the map contribute zero.
+        cache_grad_sampling_loc[tid << 1] = 0;
+        cache_grad_sampling_loc[(tid << 1) + 1] = 0;
+        cache_grad_attn_weight[tid] = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          // Helper is defined elsewhere in this file; it is handed this
+          // thread's cache slots (presumably to fill in the partials).
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+
+        // All partials must be visible before thread 0 reads them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          for (unsigned int t = 1; t < blockDim.x; ++t)
+          {
+            _grad_w += cache_grad_sampling_loc[t << 1];
+            _grad_h += cache_grad_sampling_loc[(t << 1) + 1];
+            _grad_a += cache_grad_attn_weight[t];
+          }
+
+          grad_loc_out[0] = _grad_w;
+          grad_loc_out[1] = _grad_h;
+          *grad_weight_out = _grad_a;
+        }
+        // Keep the cache intact until thread 0 has finished reading it.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += 1;  // one weight gradient per sample
+        grad_loc_out += 2;     // two location gradients per sample
+      }
+    }
+  }
+}
+
+// Backward kernel of multi-scale deformable attention; per-thread gradient
+// partials are combined with a shared-memory tree reduction that also handles
+// block sizes that are not a power of two.
+//
+// Every loop index addresses one (batch, query, head, channel) element of
+// grad_col. For each (level, point) sample the thread owns two shared-memory
+// slots for the sampling-location gradient and one for the attention-weight
+// gradient; after the reduction thread 0 stores the totals to
+// grad_sampling_loc / grad_attn_weight.
+//
+// Launch requirement: 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic
+// shared memory.
+// NOTE(review): the plain (non-atomic) stores assume that all threads of a
+// block belong to the same (batch, query, head) and that no other block
+// writes the same sample -- confirm against the launcher.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Shared-memory layout: [2 * blockDim.x location partials | blockDim.x weight partials].
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    const unsigned int tid = threadIdx.x;
+
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are derived from the unmodified kernel arguments on every
+    // pass of the loop. Advancing the arguments themselves would make a second
+    // pass by the same thread start from an already-shifted base address.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        // Scale the stored location by the level's size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        // Clear this thread's slots so samples outside the map contribute zero.
+        cache_grad_sampling_loc[tid << 1] = 0;
+        cache_grad_sampling_loc[(tid << 1) + 1] = 0;
+        cache_grad_attn_weight[tid] = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          // Helper is defined elsewhere in this file; it is handed this
+          // thread's cache slots (presumably to fill in the partials).
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+
+        // All partials must be visible before the reduction starts.
+        __syncthreads();
+
+        // Tree reduction. `spre` is the number of live entries before a step
+        // and `s` is half of it (rounded down). Threads [0, s) fold entry
+        // tid + s into entry tid. When `spre` is odd, entry 2 * s has no
+        // partner; the extra test is then true only for tid == 0, which folds
+        // that leftover entry into slot 0.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          grad_loc_out[0] = cache_grad_sampling_loc[0];
+          grad_loc_out[1] = cache_grad_sampling_loc[1];
+          *grad_weight_out = cache_grad_attn_weight[0];
+        }
+        // Keep the cache intact until thread 0 has finished reading it.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += 1;  // one weight gradient per sample
+        grad_loc_out += 2;     // two location gradients per sample
+      }
+    }
+  }
+}
+
+// Backward kernel of multi-scale deformable attention for the case where
+// several blocks contribute to the same sampling location.
+//
+// Identical to the shared-memory tree-reduction variant except that thread 0
+// publishes the block's totals with atomicAdd instead of plain stores, so the
+// contributions of different blocks accumulate.
+//
+// Launch requirement: 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic
+// shared memory.
+// NOTE(review): because results are accumulated, grad_sampling_loc and
+// grad_attn_weight presumably must be zero-initialised by the caller --
+// confirm.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Shared-memory layout: [2 * blockDim.x location partials | blockDim.x weight partials].
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    const unsigned int tid = threadIdx.x;
+
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are derived from the unmodified kernel arguments on every
+    // pass of the loop. Advancing the arguments themselves would make a second
+    // pass by the same thread start from an already-shifted base address.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        // Scale the stored location by the level's size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        // Clear this thread's slots so samples outside the map contribute zero.
+        cache_grad_sampling_loc[tid << 1] = 0;
+        cache_grad_sampling_loc[(tid << 1) + 1] = 0;
+        cache_grad_attn_weight[tid] = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          // Helper is defined elsewhere in this file; it is handed this
+          // thread's cache slots (presumably to fill in the partials).
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
+        }
+
+        // All partials must be visible before the reduction starts.
+        __syncthreads();
+
+        // Tree reduction. `spre` is the number of live entries before a step
+        // and `s` is half of it (rounded down). Threads [0, s) fold entry
+        // tid + s into entry tid. When `spre` is odd, entry 2 * s has no
+        // partner; the extra test is then true only for tid == 0, which folds
+        // that leftover entry into slot 0.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          // Accumulate rather than overwrite: other blocks may add to the same sample.
+          atomicAdd(grad_loc_out, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_loc_out + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_weight_out, cache_grad_attn_weight[0]);
+        }
+        // Keep the cache intact until thread 0 has finished reading it.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += 1;  // one weight gradient per sample
+        grad_loc_out += 2;     // two location gradients per sample
+      }
+    }
+  }
+}
+
+
+// Backward kernel of multi-scale deformable attention that uses no shared
+// memory: each thread hands pointers into the global gradient buffers
+// straight to ms_deform_attn_col2im_bilinear_gm.
+//
+// Every loop index addresses one (batch, query, head, channel) element of
+// grad_col; the thread walks all (level, point) samples of its
+// (batch, query, head) and calls the helper for every sample that falls
+// inside the feature map.
+// NOTE(review): several threads address the same output sample, so the helper
+// (defined elsewhere in this file) presumably accumulates atomically into
+// zero-initialised buffers -- confirm.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decode index as ((b * num_query + q) * num_heads + m) * channels + c.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are derived from the unmodified kernel arguments on every
+    // pass of the loop. Advancing the arguments themselves would make a second
+    // pass by the same thread start from an already-shifted base address.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        // Scale the stored location by the level's size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            grad_loc_out, grad_weight_out);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += 1;  // one weight gradient per sample
+        grad_loc_out += 2;     // two location gradients per sample
+      }
+    }
+  }
+}
+
+
+// Host-side launcher for the forward kernel ms_deformable_im2col_gpu_kernel.
+//
+// The kernel is enqueued on `stream` with CUDA_NUM_THREADS threads per block
+// and no dynamic shared memory; its element count is
+// batch_size * num_query * num_heads * channels. A launch error is only
+// reported on stdout -- it is not propagated to the caller.
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes,
+                              const int64_t* data_level_start_index,
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  // The same count sizes the grid and is handed to the kernel as `n`.
+  const int total_elements = batch_size * num_query * num_heads * channels;
+  const int threads_per_block = CUDA_NUM_THREADS;
+
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(total_elements, threads_per_block), threads_per_block, 0, stream>>>(
+          total_elements,
+          data_value,
+          data_spatial_shapes,
+          data_level_start_index,
+          data_sampling_loc,
+          data_attn_weight,
+          batch_size,
+          spatial_size,
+          num_heads,
+          channels,
+          num_levels,
+          num_query,
+          num_point,
+          data_col);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess)
+  {
+    return;
+  }
+  printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
+}
+
+// Host-side launcher for the backward pass: picks one col2im kernel variant
+// from `channels` and enqueues it on `stream`.
+//
+// Selection:
+//   channels > 1024, multiple of 1024 -> shm_reduce_v2_multi_blocks (dynamic shared memory)
+//   channels > 1024, otherwise        -> gm (no shared memory)
+//   channels in {1, 2, 4, 8, 16, 32}  -> shm_blocksize_aware_reduce_v1<scalar_t, channels>
+//   channels in {64, ..., 1024} (powers of two)
+//                                     -> shm_blocksize_aware_reduce_v2<scalar_t, channels>
+//   any other channels < 64           -> shm_reduce_v1 (dynamic shared memory)
+//   any other channels >= 64          -> shm_reduce_v2 (dynamic shared memory)
+//
+// The block size is min(channels, CUDA_NUM_THREADS). A launch error is only
+// reported on stdout -- it is not propagated to the caller.
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels < CUDA_NUM_THREADS) ? channels : CUDA_NUM_THREADS;
+  // The same count sizes the grid and is handed to the kernel as `n`.
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  // Three scalars of scratch per thread for the kernels that reduce through
+  // dynamic shared memory.
+  const size_t reduce_shm_bytes = num_threads*3*sizeof(scalar_t);
+
+  // Every variant takes the same argument list and launch geometry; only the
+  // kernel symbol and the dynamic shared-memory size differ. The kernel is
+  // passed variadically because its template-argument list contains commas.
+#define MS_DEFORM_COL2IM_LAUNCH(shm_bytes, ...)                                            \
+  __VA_ARGS__<<<GET_BLOCKS(num_kernels, num_threads), num_threads, (shm_bytes), stream>>>( \
+      num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,      \
+      data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,  \
+      num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight)
+
+  if (channels > 1024)
+  {
+    if (channels % 1024 == 0)
+    {
+      MS_DEFORM_COL2IM_LAUNCH(reduce_shm_bytes,
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>);
+    }
+    else
+    {
+      MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_gm<scalar_t>);
+    }
+  }
+  else
+  {
+    switch (channels)
+    {
+      case 1:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>);
+        break;
+      case 2:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>);
+        break;
+      case 4:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>);
+        break;
+      case 8:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>);
+        break;
+      case 16:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>);
+        break;
+      case 32:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>);
+        break;
+      case 64:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>);
+        break;
+      case 128:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>);
+        break;
+      case 256:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>);
+        break;
+      case 512:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>);
+        break;
+      case 1024:
+        MS_DEFORM_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>);
+        break;
+      default:
+        // Channel counts without a compile-time-sized variant.
+        if (channels < 64)
+        {
+          MS_DEFORM_COL2IM_LAUNCH(reduce_shm_bytes,
+              ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>);
+        }
+        else
+        {
+          MS_DEFORM_COL2IM_LAUNCH(reduce_shm_bytes,
+              ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>);
+        }
+    }
+  }
+
+#undef MS_DEFORM_COL2IM_LAUNCH
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(launch_status));
+  }
+
+}
\ No newline at end of file
diff --git a/ops/src/ms_deform_attn.h b/ops/src/ms_deform_attn.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f80a1b294c55b37d13bb3558ff7aeadba3b37de
--- /dev/null
+++ b/ops/src/ms_deform_attn.h
@@ -0,0 +1,67 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+
+#include "cpu/ms_deform_attn_cpu.h"
+
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+
+
+/*!
+ * Dispatch the multi-scale deformable attention forward pass by device.
+ *
+ * When `value` is a CUDA tensor and the extension was built WITH_CUDA, all
+ * arguments are forwarded unchanged to ms_deform_attn_cuda_forward and its
+ * result is returned. Every other case raises a c10::Error: a CUDA tensor in
+ * a build without GPU support, or a non-CUDA tensor (no CPU path is wired up
+ * in this dispatcher).
+ *
+ * Declared `inline` because the definition lives in a header; this keeps the
+ * program well-formed if the header is ever included from a second
+ * translation unit.
+ */
+inline at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() replaces the deprecated value.type().is_cuda().
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        // TORCH_CHECK(false, ...) is the documented replacement for AT_ERROR.
+        TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+    }
+    TORCH_CHECK(false, "Not implemented on the CPU");
+}
+
+/*!
+ * Dispatch the multi-scale deformable attention backward pass by device.
+ *
+ * When `value` is a CUDA tensor and the extension was built WITH_CUDA, all
+ * arguments are forwarded unchanged to ms_deform_attn_cuda_backward and its
+ * vector of tensors is returned. Every other case raises a c10::Error: a CUDA
+ * tensor in a build without GPU support, or a non-CUDA tensor (no CPU path is
+ * wired up in this dispatcher).
+ *
+ * Declared `inline` because the definition lives in a header; this keeps the
+ * program well-formed if the header is ever included from a second
+ * translation unit.
+ */
+inline std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() replaces the deprecated value.type().is_cuda().
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        // TORCH_CHECK(false, ...) is the documented replacement for AT_ERROR.
+        TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+    }
+    TORCH_CHECK(false, "Not implemented on the CPU");
+}
+
diff --git a/ops/src/vision.cpp b/ops/src/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a08821e0121a77556aa7a263ec8ebfa928b13b6
--- /dev/null
+++ b/ops/src/vision.cpp
@@ -0,0 +1,21 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include "ms_deform_attn.h"
+
+// Python bindings for the multi-scale deformable attention dispatchers declared
+// in ms_deform_attn.h. The module name is taken from TORCH_EXTENSION_NAME
+// (presumably defined by the extension build -- TODO confirm).
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // Arguments of each m.def: Python-visible name, C++ function, docstring.
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
+}
diff --git a/ops/test.py b/ops/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e1b545459f6fd3235767e721eb5a1090ae14bef
--- /dev/null
+++ b/ops/test.py
@@ -0,0 +1,92 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+
+
+N, M, D = 1, 2, 2
+Lq, L, P = 2, 2, 2
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
+S = sum([(H*W).item() for H, W in shapes])
+
+
+torch.manual_seed(3)
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+
+
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
+        check_gradient_numerical(channels, True, True, True)
+
+
+
diff --git a/sam_vit_h_4b8939.pth b/sam_vit_h_4b8939.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8523acce9ddab1cf7e355628a08b1aab8ce08a72
--- /dev/null
+++ b/sam_vit_h_4b8939.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e
+size 2564550879
diff --git a/seem_focall_v1.pt b/seem_focall_v1.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ce3d4be5786d218fed017ed9649ac6be97219386
--- /dev/null
+++ b/seem_focall_v1.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06cad58bde442ce4f2b3ce00e3a218791dfb060a2d7f2d709ff509669c28a705
+size 1365136278
diff --git a/swinl_only_sam_many2many.pth b/swinl_only_sam_many2many.pth
new file mode 100644
index 0000000000000000000000000000000000000000..60e9c626b88eacda190252b58b0712d22d7549fd
--- /dev/null
+++ b/swinl_only_sam_many2many.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f85bee1faa64154831a75598048d133e345874dbd7ef87d00b6446c1ae772956
+size 895495739
diff --git a/task_adapter/sam/__init__.py b/task_adapter/sam/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/task_adapter/sam/tasks/__Init__.py b/task_adapter/sam/tasks/__Init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce45369da0f1796bc51b34aac55c60700e2c7983
--- /dev/null
+++ b/task_adapter/sam/tasks/__Init__.py
@@ -0,0 +1,2 @@
+from .inference_sam_m2m_auto import *
+from .inference_sam_m2m_interactive import *
\ No newline at end of file
diff --git a/task_adapter/sam/tasks/inference_sam_m2m_auto.py b/task_adapter/sam/tasks/inference_sam_m2m_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..d51cf75892ecb6d0ea54c9391e092a57cc8faf92
--- /dev/null
+++ b/task_adapter/sam/tasks/inference_sam_m2m_auto.py
@@ -0,0 +1,103 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from segment_anything import SamAutomaticMaskGenerator
+
+# Dataset metadata handed to the Visualizer created in inference_sam_m2m_auto.
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+
+def inference_sam_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    t = []
+    t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
+    transform1 = transforms.Compose(t)
+    image_ori = transform1(image)
+    image_ori = np.asarray(image_ori)
+
+    mask_generator = SamAutomaticMaskGenerator(model)
+    outputs = mask_generator.generate(image_ori)
+
+    from task_adapter.utils.visualizer import Visualizer
+    visual = Visualizer(image_ori, metadata=metadata)
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    label = 1
+    # for ann in sorted_anns:
+    #     mask = ann['segmentation']
+    #     color_mask = np.random.random((1, 3)).tolist()[0]
+    #     # color_mask = [int(c*255) for c in color_mask]
+    #     demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+    #     label += 1
+    # im = demo.get_image()
+
+    mask_map = np.zeros(image_ori.shape, dtype=np.uint8)    
+    for i, ann in enumerate(sorted_anns):
+        mask = ann['segmentation']
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        # color_mask = [int(c*255) for c in color_mask]
+        demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+        # assign the mask to the mask_map
+        mask_map[mask == 1] = label
+        label += 1
+    im = demo.get_image()    
+    # fig=plt.figure(figsize=(10, 10))
+    # plt.imshow(image_ori)
+    # show_anns(outputs)
+    # fig.canvas.draw()
+    # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+    return im, sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Removes small disconnected regions and holes in a mask. Returns the
+    mask and an indicator of if the mask has been modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    correct_holes = mode == "holes"
+    working_mask = (correct_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+    if len(small_regions) == 0:
+        return mask, False
+    fill_labels = [0] + small_regions
+    if not correct_holes:
+        fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+        # If every region is below threshold, keep largest
+        if len(fill_labels) == 0:
+            fill_labels = [int(np.argmax(sizes)) + 1]
+    mask = np.isin(regions, fill_labels)
+    return mask, True
+
+def show_anns(anns):
+    if len(anns) == 0:
+        return
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    polygons = []
+    color = []
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        img = np.ones((m.shape[0], m.shape[1], 3))
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            img[:,:,i] = color_mask[i]
+        ax.imshow(np.dstack((img, m*0.35)))
\ No newline at end of file
diff --git a/task_adapter/sam/tasks/inference_sam_m2m_interactive.py b/task_adapter/sam/tasks/inference_sam_m2m_interactive.py
new file mode 100644
index 0000000000000000000000000000000000000000..5752138eeafc2f675f2a227754938e477886ba40
--- /dev/null
+++ b/task_adapter/sam/tasks/inference_sam_m2m_interactive.py
@@ -0,0 +1,221 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+from kornia.contrib import distance_transform
+import matplotlib.pyplot as plt
+import cv2
+import io
+# Dataset metadata handed to the Visualizer created in inference_sam_m2m_interactive.
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+from segment_anything import SamAutomaticMaskGenerator
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+def sam_interactive_mask(mask_generator, points, in_points, in_labels, mask_input):
+    masks, iou_preds, _ = mask_generator.predictor.predict_torch(
+            in_points,
+            in_labels,
+            mask_input=mask_input,
+            multimask_output=True,
+            return_logits=True,
+    )
+    nm,_,h,w = masks.shape
+
+    # Serialize predictions and store in MaskData
+    data = MaskData(
+            masks=masks.flatten(0, 1),
+            iou_preds=iou_preds.flatten(0, 1),
+            points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
+    )
+    del masks
+
+    # Calculate stability score
+    data["stability_score"] = calculate_stability_score(
+            data["masks"], mask_generator.predictor.model.mask_threshold, mask_generator.stability_score_offset
+    )
+
+    masks = data["masks"].reshape(nm, -1, h, w)
+    scores = (data['iou_preds'] + data['stability_score']).reshape(nm, -1)
+
+    index = torch.stack([torch.arange(nm).cuda(), scores.argmax(dim=1)]).tolist()
+    return masks[index]
+
+def inference_sam_m2m_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    t = []
+    t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
+    transform1 = transforms.Compose(t)
+    image_ori = transform1(image)
+
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+
+    orig_size = images.shape[-2:]
+    orig_h, orig_w = orig_size
+    crop_box = [0,0,orig_w,orig_h]
+
+    spatial_masks = spatial_masks[:, None].float().cuda()
+    spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
+
+    # generate single center point
+    # n,_,h,w = spatial_masks.shape
+    # mask_dt = (distance_transform((~F.pad(spatial_masks, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(n,-1)
+    # max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
+    # next_mask = torch.zeros(spatial_masks.shape, device=torch.cuda.current_device()).bool()
+    # next_mask = next_mask.view(n,-1)
+    # next_mask[max_xy_idx] = True
+    # next_mask = next_mask.reshape((n,1,h,w))
+    # points = next_mask.nonzero()[:,2:].flip(dims=[1]).cpu().numpy()
+
+    # stack sampled points
+    acc_points = []
+    for i in range(len(spatial_masks)):
+        points = spatial_masks[i:i+1].nonzero()[:,2:].flip(dims=[1]).cpu().numpy()
+        rand_ids = np.random.choice(points.shape[0], size=40, replace=True)
+        points = points[rand_ids]
+        acc_points.append(points)
+    _np = len(acc_points)
+    points = np.concatenate(acc_points)
+
+    mask_generator = SamAutomaticMaskGenerator(model)
+    mask_generator.predictor.set_image(image_ori)
+    im_size = image_ori.shape[:-1]
+
+    transformed_points = mask_generator.predictor.transform.apply_coords(points, im_size)
+    in_points = torch.as_tensor(transformed_points, device=mask_generator.predictor.device).reshape(_np,-1,2).transpose(0,1)
+    in_labels = torch.ones((in_points.shape[0], _np), dtype=torch.int, device=mask_generator.predictor.device)
+
+    masks = sam_interactive_mask(mask_generator, points, in_points.transpose(0,1), in_labels.transpose(0,1), None)
+
+    masks = masks > 0.0
+    iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
+    points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
+
+    mask_data = MaskData(
+        masks=masks,
+        iou_preds=iou_preds,
+        points=points,
+    )
+
+    mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
+    del masks
+
+    mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
+    mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
+
+    # Compress to RLE
+    mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
+    mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
+    del mask_data["masks"]
+    mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+
+    # Write mask records
+    outputs = []
+    for idx in range(len(mask_data["segmentations"])):
+        ann = {
+            "segmentation": mask_data["segmentations"][idx],
+            "area": area_from_rle(mask_data["rles"][idx]),
+            "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+            "predicted_iou": mask_data["iou_preds"][idx].item(),
+            "point_coords": [mask_data["points"][idx].tolist()],
+            "stability_score": mask_data["stability_score"][idx].item(),
+            "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+        }
+        outputs.append(ann)
+
+    from task_adapter.utils.visualizer import Visualizer
+    visual = Visualizer(image_ori, metadata=metadata)
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    label = 1
+    # for ann in sorted_anns:
+    #     mask = ann['segmentation']
+    #     demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+    #     label += 1
+    # im = demo.get_image()
+
+    mask_map = np.zeros(image_ori.shape, dtype=np.uint8)    
+    for i, ann in enumerate(sorted_anns):
+        mask = ann['segmentation']
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        # color_mask = [int(c*255) for c in color_mask]
+        demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+        # assign the mask to the mask_map
+        mask_map[mask == 1] = label
+        label += 1
+    im = demo.get_image()    
+    # fig=plt.figure(figsize=(10, 10))
+    # plt.imshow(image_ori)
+    # show_anns(outputs)
+    # fig.canvas.draw()
+    # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+    return im, sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Removes small disconnected regions and holes in a mask. Returns the
+    mask and an indicator of if the mask has been modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    correct_holes = mode == "holes"
+    working_mask = (correct_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+    if len(small_regions) == 0:
+        return mask, False
+    fill_labels = [0] + small_regions
+    if not correct_holes:
+        fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+        # If every region is below threshold, keep largest
+        if len(fill_labels) == 0:
+            fill_labels = [int(np.argmax(sizes)) + 1]
+    mask = np.isin(regions, fill_labels)
+    return mask, True
+
+def show_anns(anns):
+    if len(anns) == 0:
+        return
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    polygons = []
+    color = []
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        img = np.ones((m.shape[0], m.shape[1], 3))
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            img[:,:,i] = color_mask[i]
+        ax.imshow(np.dstack((img, m*0.35)))
\ No newline at end of file
diff --git a/task_adapter/seem/__init__.py b/task_adapter/seem/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/task_adapter/seem/tasks/__init__.py b/task_adapter/seem/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bff951407c3dd24407853902afabb732615b2b37
--- /dev/null
+++ b/task_adapter/seem/tasks/__init__.py
@@ -0,0 +1,3 @@
+from .interactive_seem_m2m_auto import *
+from .inference_seem_pano import *
+from .inference_seem_interactive import *
\ No newline at end of file
diff --git a/task_adapter/seem/tasks/automatic_mask_generator.py b/task_adapter/seem/tasks/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..66e2317939a7c45fda854e4305f5f89232ef4aed
--- /dev/null
+++ b/task_adapter/seem/tasks/automatic_mask_generator.py
@@ -0,0 +1,382 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torchvision.ops.boxes import batched_nms, box_area  # type: ignore
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from segment_anything.modeling import Sam
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+class SeemAutomaticMaskGenerator:
+    def __init__(
+        self,
+        model: Sam,
+        points_per_side: Optional[int] = 32,
+        points_per_batch: int = 64,
+        pred_iou_thresh: float = 0.9,
+        stability_score_thresh: float = 0.5,
+        stability_score_offset: float = 1.0,
+        box_nms_thresh: float = 0.7,
+        crop_n_layers: int = 0,
+        crop_nms_thresh: float = 0.7,
+        crop_overlap_ratio: float = 512 / 1500,
+        crop_n_points_downscale_factor: int = 1,
+        point_grids: Optional[List[np.ndarray]] = None,
+        min_mask_region_area: int = 0,
+        output_mode: str = "binary_mask",
+    ) -> None:
+        """
+        Using a SAM model, generates masks for the entire image.
+        Generates a grid of point prompts over the image, then filters
+        low quality and duplicate masks. The default settings are chosen
+        for SAM with a ViT-H backbone.
+
+        Arguments:
+          model (Sam): The SAM model to use for mask prediction.
+          points_per_side (int or None): The number of points to be sampled
+            along one side of the image. The total number of points is
+            points_per_side**2. If None, 'point_grids' must provide explicit
+            point sampling.
+          points_per_batch (int): Sets the number of points run simultaneously
+            by the model. Higher numbers may be faster but use more GPU memory.
+          pred_iou_thresh (float): A filtering threshold in [0,1], using the
+            model's predicted mask quality.
+          stability_score_thresh (float): A filtering threshold in [0,1], using
+            the stability of the mask under changes to the cutoff used to binarize
+            the model's mask predictions.
+          stability_score_offset (float): The amount to shift the cutoff when
+            calculated the stability score.
+          box_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks.
+          crop_n_layers (int): If >0, mask prediction will be run again on
+            crops of the image. Sets the number of layers to run, where each
+            layer has 2**i_layer number of image crops.
+          crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks between different crops.
+          crop_overlap_ratio (float): Sets the degree to which crops overlap.
+            In the first crop layer, crops will overlap by this fraction of
+            the image length. Later layers with more crops scale down this overlap.
+          crop_n_points_downscale_factor (int): The number of points-per-side
+            sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+          point_grids (list(np.ndarray) or None): A list over explicit grids
+            of points used for sampling, normalized to [0,1]. The nth grid in the
+            list is used in the nth crop layer. Exclusive with points_per_side.
+          min_mask_region_area (int): If >0, postprocessing will be applied
+            to remove disconnected regions and holes in masks with area smaller
+            than min_mask_region_area. Requires opencv.
+          output_mode (str): The form masks are returned in. Can be 'binary_mask',
+            'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+            For large resolutions, 'binary_mask' may consume large amounts of
+            memory.
+        """
+
+        assert (points_per_side is None) != (
+            point_grids is None
+        ), "Exactly one of points_per_side or point_grid must be provided."
+        if points_per_side is not None:
+            self.point_grids = build_all_layer_point_grids(
+                points_per_side,
+                crop_n_layers,
+                crop_n_points_downscale_factor,
+            )
+        elif point_grids is not None:
+            self.point_grids = point_grids
+        else:
+            raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+        assert output_mode in [
+            "binary_mask",
+            "uncompressed_rle",
+            "coco_rle",
+        ], f"Unknown output_mode {output_mode}."
+        if output_mode == "coco_rle":
+            from pycocotools import mask as mask_utils  # type: ignore # noqa: F401
+
+        if min_mask_region_area > 0:
+            import cv2  # type: ignore # noqa: F401
+
+        self.predictor = model
+        self.points_per_batch = points_per_batch
+        self.pred_iou_thresh = pred_iou_thresh
+        self.stability_score_thresh = stability_score_thresh
+        self.stability_score_offset = stability_score_offset
+        self.box_nms_thresh = box_nms_thresh
+        self.crop_n_layers = crop_n_layers
+        self.crop_nms_thresh = crop_nms_thresh
+        self.crop_overlap_ratio = crop_overlap_ratio
+        self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+        self.min_mask_region_area = min_mask_region_area
+        self.output_mode = output_mode
+
+        # dilate conv
+        self.dilation = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=7, stride=1, padding=3, bias=False)
+        self.dilation.weight.data.fill_(1.0)
+        self.dilation.cuda()
+
+    @torch.no_grad()
+    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """
+        Run the full mask-generation pipeline on one image.
+
+        Masks come from ``_generate_masks``; they are optionally cleaned of
+        small regions, encoded according to ``self.output_mode`` and returned
+        as one record per mask.
+
+        Arguments:
+          image: The image to segment, forwarded untouched to
+            ``_generate_masks`` (which reads ``image.shape[-2:]`` as the
+            image size).
+
+        Returns:
+           list(dict(str, any)): One record per mask, with the keys:
+               segmentation: The mask. ``rle_to_mask(rle)`` when
+                 output_mode='binary_mask', ``coco_encode_rle(rle)`` when
+                 output_mode='coco_rle', otherwise the stored RLE itself.
+               bbox (list(float)): The box around the mask, in XYWH format.
+               area (int): The area of the mask, from ``area_from_rle``.
+               predicted_iou (float): The model's own prediction of the
+                 mask's quality.
+               point_coords (list(list(float))): The stored point prompt.
+               stability_score (float): The stability score of the mask.
+               crop_box (list(float)): The crop box stored with the mask,
+                 converted to XYWH format.
+        """
+        mask_data = self._generate_masks(image)
+
+        # Optionally drop small islands / fill small holes, then re-run NMS
+        if self.min_mask_region_area > 0:
+            nms_thresh = max(self.box_nms_thresh, self.crop_nms_thresh)
+            mask_data = self.postprocess_small_regions(
+                mask_data, self.min_mask_region_area, nms_thresh
+            )
+
+        # Pick the encoding of the "segmentation" field
+        rles = mask_data["rles"]
+        if self.output_mode == "coco_rle":
+            mask_data["segmentations"] = [coco_encode_rle(r) for r in rles]
+        elif self.output_mode == "binary_mask":
+            mask_data["segmentations"] = [rle_to_mask(r) for r in rles]
+        else:
+            mask_data["segmentations"] = rles
+
+        # Assemble one annotation record per mask
+        return [
+            {
+                "segmentation": segmentation,
+                "area": area_from_rle(mask_data["rles"][i]),
+                "bbox": box_xyxy_to_xywh(mask_data["boxes"][i]).tolist(),
+                "predicted_iou": mask_data["iou_preds"][i].item(),
+                "point_coords": [mask_data["points"][i].tolist()],
+                "stability_score": mask_data["stability_score"][i].item(),
+                "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][i]).tolist(),
+            }
+            for i, segmentation in enumerate(mask_data["segmentations"])
+        ]
+
+    def _generate_masks(self, image: np.ndarray) -> MaskData:
+        """Collect the masks of every crop box and de-duplicate them across crops."""
+        orig_size = image.shape[-2:]
+        crop_boxes, layer_idxs = generate_crop_boxes(
+            orig_size, self.crop_n_layers, self.crop_overlap_ratio
+        )
+
+        # Accumulate the per-crop results into one MaskData
+        data = MaskData()
+        for box, layer in zip(crop_boxes, layer_idxs):
+            data.cat(self._process_crop(image, box, layer, orig_size))
+
+        # With a single crop there is nothing to de-duplicate between crops
+        if len(crop_boxes) > 1:
+            # Score = inverse crop area, so NMS prefers masks from smaller crops
+            crop_scores = (1 / box_area(data["crop_boxes"])).to(data["boxes"].device)
+            categories = torch.zeros_like(data["boxes"][:, 0])
+            survivors = batched_nms(
+                data["boxes"].float(),
+                crop_scores,
+                categories,
+                iou_threshold=self.crop_nms_thresh,
+            )
+            data.filter(survivors)
+
+        data.to_numpy()
+        return data
+
+    def _process_crop(
+        self,
+        image: np.ndarray,
+        crop_box: List[int],
+        crop_layer_idx: int,
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Generate de-duplicated masks for one crop box.
+
+        The image is not cropped here: the whole `image` goes to
+        `_process_batch` for every batch of points, and the point grid of
+        layer `crop_layer_idx` is used without rescaling.
+
+        Returns the MaskData that survives box NMS, with `boxes` passed
+        through `uncrop_boxes_xyxy` and `crop_box` recorded once per mask.
+        """
+        cropped_im_size = image.shape[-2:]
+        points_for_image = self.point_grids[crop_layer_idx]
+
+        # Clear the cached encoder features so the first batch recomputes them
+        self.enc_features = None
+
+        data = MaskData()
+        for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+            data.cat(self._process_batch(image, points, cropped_im_size, crop_box, orig_size))
+
+        # Remove duplicates within this crop. The category tensor is derived
+        # from the boxes so it always lives on the same device as they do.
+        keep_by_nms = batched_nms(
+            data["boxes"].float(),
+            data["iou_preds"],
+            torch.zeros_like(data["boxes"][:, 0]),  # categories
+            iou_threshold=self.box_nms_thresh,
+        )
+        data.filter(keep_by_nms)
+
+        # Return to the original image frame
+        data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+        data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+        return data
+
+    def _process_batch(
+        self,
+        images,
+        points: np.ndarray,
+        im_size: Tuple[int, ...],
+        crop_box: List[int],
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Predict masks for one batch of point prompts and filter them.
+
+        Each point becomes a boolean seed mask (one pixel run through
+        `self.dilation`, thresholded at > 0) used as the `rand_shape` spatial
+        query. Features from the first call are cached on `self.enc_features`
+        and fed back on later calls. `im_size` is accepted but unused.
+        """
+        orig_h, orig_w = orig_size
+        points = torch.tensor(points, dtype=torch.float).to(images.device)
+
+        # prepare interactive mask for seem
+        scale = torch.tensor(orig_size)[None, :].to(points.device)
+        seed_idx = (points * scale).long()
+        seeds = torch.zeros((len(points), orig_h, orig_w), dtype=torch.bool).to(device=points.device)
+        batch_idx = torch.arange(0, seed_idx.size(0))[:, None]
+        seeds[batch_idx, seed_idx[:, 0:1], seed_idx[:, 1:2]] = True
+        seeds = self.dilation(seeds[:, None].float())[:, 0] > 0
+        query = {"image": images, "height": orig_h, "width": orig_w, "spatial_query": {"rand_shape": seeds[:, None]}}
+        model = self.predictor.model
+        if self.enc_features is not None:
+            masks, iou_preds = model.evaluate_demo([query], self.enc_features[0], self.enc_features[1], self.enc_features[2])
+        else:
+            masks, iou_preds, mask_feats, encoder_feats, multi_scale_feats = model.evaluate_demo([query], None, None, return_features=True)
+            self.enc_features = (mask_feats, encoder_feats, multi_scale_feats)
+
+        data = MaskData(masks=masks, iou_preds=iou_preds, points=points)
+        del masks
+
+        # Filter by predicted IoU
+        if self.pred_iou_thresh > 0.0:
+            data.filter(data["iou_preds"] > self.pred_iou_thresh)
+
+        # Calculate stability score
+        data["stability_score"] = calculate_stability_score(data["masks"], 0.0, self.stability_score_offset)
+        if self.stability_score_thresh > 0.0:
+            data.filter(data["stability_score"] >= self.stability_score_thresh)
+
+        # Threshold masks and calculate boxes
+        data["masks"] = data["masks"] > 0.0
+        data["boxes"] = batched_mask_to_box(data["masks"])
+
+        # Filter boxes that touch crop boundaries
+        inside = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+        if not torch.all(inside):
+            data.filter(inside)
+
+        # Compress to RLE
+        data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+        data["rles"] = mask_to_rle_pytorch(data["masks"])
+        del data["masks"]
+
+        return data
+
+    @staticmethod
+    def postprocess_small_regions(
+        mask_data: MaskData, min_area: int, nms_thresh: float
+    ) -> MaskData:
+        """
+        Clean every mask of small holes and small islands, then rerun box NMS
+        because the cleaned masks may now duplicate one another.
+
+        Arguments:
+          mask_data: Must provide the "rles" and "boxes" entries.
+          min_area: Area threshold handed to remove_small_regions.
+          nms_thresh: IoU threshold of the NMS pass run on the new boxes.
+
+        mask_data is edited in place and is also the return value.
+
+        Requires open-cv as a dependency.
+        """
+        if len(mask_data["rles"]) == 0:
+            return mask_data
+
+        # Clean each mask and remember whether it had to be modified
+        cleaned = []
+        scores = []
+        for rle in mask_data["rles"]:
+            mask = rle_to_mask(rle)
+            mask, holes_changed = remove_small_regions(mask, min_area, mode="holes")
+            mask, islands_changed = remove_small_regions(mask, min_area, mode="islands")
+            cleaned.append(torch.as_tensor(mask).unsqueeze(0))
+            # score 1.0 = untouched, 0.0 = modified, so NMS prefers masks that
+            # did not need postprocessing
+            scores.append(float(not holes_changed and not islands_changed))
+
+        # Recalculate boxes and remove any new duplicates
+        masks = torch.cat(cleaned, dim=0)
+        boxes = batched_mask_to_box(masks)
+        categories = torch.zeros_like(boxes[:, 0])
+        keep_by_nms = batched_nms(
+            boxes.float(), torch.as_tensor(scores), categories, iou_threshold=nms_thresh
+        )
+
+        # Only masks that were modified need a fresh RLE and box
+        for i_mask in keep_by_nms:
+            if scores[i_mask] != 0.0:
+                continue
+            mask_torch = masks[i_mask].unsqueeze(0)
+            mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+            mask_data["boxes"][i_mask] = boxes[i_mask]  # update res directly
+        mask_data.filter(keep_by_nms)
+
+        return mask_data
\ No newline at end of file
diff --git a/task_adapter/seem/tasks/inference_seem_interactive.py b/task_adapter/seem/tasks/inference_seem_interactive.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4b3ce9aea560159a8cddb53ed706fdd1a1ec43d
--- /dev/null
+++ b/task_adapter/seem/tasks/inference_seem_interactive.py
@@ -0,0 +1,169 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SeemAutomaticMaskGenerator
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+def inference_seem_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=None):
+    """
+    Segment the regions selected by `spatial_masks` and draw them numbered.
+
+    Arguments:
+      model: Wrapper whose `model.model.evaluate_demo` produces the masks.
+      image: Input image; resized with torchvision `Resize(int(text_size))`
+        before inference, so all outputs are at the resized resolution.
+      spatial_masks: Prompt masks; a channel axis is added at dim 1 and they
+        are interpolated to the resized image's height and width.
+      text_size: Target size passed to `Resize`.
+      label_mode, alpha, anno_mode: Forwarded to
+        `Visualizer.draw_binary_mask_with_number`. `anno_mode` defaults to
+        a fresh ['Mask'] list on every call.
+
+    Returns:
+      (im, sorted_anns): the annotated image and the mask records sorted by
+      descending area. When no mask is predicted, `im` is the resized input
+      image without annotations and `sorted_anns` is empty.
+    """
+    if anno_mode is None:
+        anno_mode = ['Mask']
+
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = np.asarray(resize(image))
+    images = torch.from_numpy(image_ori.copy()).permute(2, 0, 1).cuda()
+
+    orig_h, orig_w = images.shape[-2:]
+    crop_box = [0, 0, orig_w, orig_h]
+
+    # Bring the prompt masks to the resolution of the resized image
+    spatial_masks = spatial_masks[:, None].float().cuda()
+    spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
+    data = {
+        "image": images,
+        "height": orig_h,
+        "width": orig_w,
+        "spatial_query": {'rand_shape': spatial_masks},
+    }
+
+    model.model.metadata = metadata
+    masks, _ = model.model.evaluate_demo([data])
+    masks = masks > 0.0
+    n_masks = masks.shape[0]
+
+    # The second output of evaluate_demo is discarded; placeholders are stored
+    mask_data = MaskData(
+        masks=masks,
+        iou_preds=torch.ones(n_masks, dtype=torch.float32),
+        points=torch.zeros((n_masks, 2), dtype=torch.float32),
+    )
+    mask_data["stability_score"] = torch.ones(n_masks, dtype=torch.float32)
+    del masks
+
+    mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
+    mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
+
+    # Compress to RLE
+    mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
+    mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
+    del mask_data["masks"]
+    mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+
+    # Write mask records
+    outputs = [
+        {
+            "segmentation": segmentation,
+            "area": area_from_rle(mask_data["rles"][idx]),
+            "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+            "predicted_iou": mask_data["iou_preds"][idx].item(),
+            "point_coords": [mask_data["points"][idx].tolist()],
+            "stability_score": mask_data["stability_score"][idx].item(),
+            "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+        }
+        for idx, segmentation in enumerate(mask_data["segmentations"])
+    ]
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    if not sorted_anns:
+        # No mask means nothing is ever drawn; return the plain resized image
+        return image_ori, sorted_anns
+
+    visual = Visualizer(image_ori, metadata=metadata)
+    for label, ann in enumerate(sorted_anns, start=1):
+        demo = visual.draw_binary_mask_with_number(ann['segmentation'], text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+    return demo.get_image(), sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Remove connected regions smaller than `area_thresh` from `mask`
+    (mode "islands") or fill holes smaller than it (mode "holes").
+    Returns the resulting mask and whether the mask was modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    fixing_holes = mode == "holes"
+    # XOR with True inverts the mask, so in "holes" mode the holes are labelled
+    to_label = (fixing_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(to_label, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small = [idx + 1 for idx, size in enumerate(sizes) if size < area_thresh]
+    if len(small) == 0:
+        return mask, False
+    if fixing_holes:
+        keep = [0] + small
+    else:
+        # Keep the non-small regions; if every region is below threshold, keep largest
+        keep = [lbl for lbl in range(1, n_labels) if lbl not in small] or [int(np.argmax(sizes)) + 1]
+    return np.isin(regions, keep), True
+
+def show_anns(anns):
+    """Draw every annotation's mask on the current matplotlib axes, largest first."""
+    if len(anns) == 0:
+        return
+    by_area = sorted(anns, key=lambda ann: ann['area'], reverse=True)
+    axes = plt.gca()
+    axes.set_autoscale_on(False)
+    for ann in by_area:
+        seg = ann['segmentation']
+        # Constant-colour RGB image, one random colour per annotation
+        overlay = np.ones((seg.shape[0], seg.shape[1], 3))
+        rgb = np.random.random((1, 3)).tolist()[0]
+        overlay[:, :] = rgb
+        # seg * 0.35 is stacked behind the RGB planes as a fourth channel
+        axes.imshow(np.dstack((overlay, seg * 0.35)))
\ No newline at end of file
diff --git a/task_adapter/seem/tasks/inference_seem_pano.py b/task_adapter/seem/tasks/inference_seem_pano.py
new file mode 100644
index 0000000000000000000000000000000000000000..d75af481a76a84b3ff377175734c430808a24100
--- /dev/null
+++ b/task_adapter/seem/tasks/inference_seem_pano.py
@@ -0,0 +1,164 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SeemAutomaticMaskGenerator
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+def inference_seem_pano(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=None):
+    """
+    Run panoptic segmentation and draw every segment with a number.
+
+    Arguments:
+      model: Wrapper whose `model.model.evaluate` returns the panoptic result.
+      image: Input image; resized with torchvision `Resize(int(text_size))`
+        before inference, so all outputs are at the resized resolution.
+      text_size: Target size passed to `Resize`.
+      label_mode, alpha, anno_mode: Forwarded to
+        `Visualizer.draw_binary_mask_with_number`. `anno_mode` defaults to
+        a fresh ['Mask'] list on every call.
+
+    Returns:
+      (im, sorted_anns): the annotated image and the mask records sorted by
+      descending area. When the model reports no segment, `im` is the
+      resized input image without annotations and `sorted_anns` is empty.
+    """
+    if anno_mode is None:
+        anno_mode = ['Mask']
+
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = np.asarray(resize(image))
+    images = torch.from_numpy(image_ori.copy()).permute(2, 0, 1).cuda()
+
+    orig_h, orig_w = images.shape[-2:]
+    crop_box = [0, 0, orig_w, orig_h]
+
+    data = {"image": images, "height": orig_h, "width": orig_w}
+
+    model.model.metadata = metadata
+    predictions = model.model.evaluate([data])
+    pano_mask = predictions[0]['panoptic_seg'][0]
+    pano_info = predictions[0]['panoptic_seg'][1]
+
+    # One boolean mask per reported segment id
+    masks = [pano_mask == seg_info['id'] for seg_info in pano_info]
+    if not masks:
+        # torch.stack rejects an empty list and nothing would be drawn
+        return image_ori, []
+    masks = torch.stack(masks, dim=0)
+    n_masks = masks.shape[0]
+
+    # Placeholder scores and points: all-ones IoU / stability, all-zero points
+    mask_data = MaskData(
+        masks=masks,
+        iou_preds=torch.ones(n_masks, dtype=torch.float32),
+        points=torch.zeros((n_masks, 2), dtype=torch.float32),
+    )
+    mask_data["stability_score"] = torch.ones(n_masks, dtype=torch.float32)
+    del masks
+
+    mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
+    mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
+
+    # Compress to RLE
+    mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
+    mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
+    del mask_data["masks"]
+    mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+
+    # Write mask records
+    outputs = [
+        {
+            "segmentation": segmentation,
+            "area": area_from_rle(mask_data["rles"][idx]),
+            "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+            "predicted_iou": mask_data["iou_preds"][idx].item(),
+            "point_coords": [mask_data["points"][idx].tolist()],
+            "stability_score": mask_data["stability_score"][idx].item(),
+            "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+        }
+        for idx, segmentation in enumerate(mask_data["segmentations"])
+    ]
+
+    # Draw from the largest mask to the smallest, numbering from 1
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    visual = Visualizer(image_ori, metadata=metadata)
+    for label, ann in enumerate(sorted_anns, start=1):
+        demo = visual.draw_binary_mask_with_number(ann['segmentation'], text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+    return demo.get_image(), sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Remove connected regions smaller than `area_thresh` from `mask`
+    (mode "islands") or fill holes smaller than it (mode "holes").
+    Returns the resulting mask and whether the mask was modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    fixing_holes = mode == "holes"
+    # XOR with True inverts the mask, so in "holes" mode the holes are labelled
+    to_label = (fixing_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(to_label, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small = [idx + 1 for idx, size in enumerate(sizes) if size < area_thresh]
+    if len(small) == 0:
+        return mask, False
+    if fixing_holes:
+        keep = [0] + small
+    else:
+        # Keep the non-small regions; if every region is below threshold, keep largest
+        keep = [lbl for lbl in range(1, n_labels) if lbl not in small] or [int(np.argmax(sizes)) + 1]
+    return np.isin(regions, keep), True
+
+def show_anns(anns):
+    """Draw every annotation's mask on the current matplotlib axes, largest first."""
+    if len(anns) == 0:
+        return
+    by_area = sorted(anns, key=lambda ann: ann['area'], reverse=True)
+    axes = plt.gca()
+    axes.set_autoscale_on(False)
+    for ann in by_area:
+        seg = ann['segmentation']
+        # Constant-colour RGB image, one random colour per annotation
+        overlay = np.ones((seg.shape[0], seg.shape[1], 3))
+        rgb = np.random.random((1, 3)).tolist()[0]
+        overlay[:, :] = rgb
+        # seg * 0.35 is stacked behind the RGB planes as a fourth channel
+        axes.imshow(np.dstack((overlay, seg * 0.35)))
\ No newline at end of file
diff --git a/task_adapter/seem/tasks/interactive_seem_m2m_auto.py b/task_adapter/seem/tasks/interactive_seem_m2m_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..f35a6a45de029f823cf3fee8671addbc728b2e09
--- /dev/null
+++ b/task_adapter/seem/tasks/interactive_seem_m2m_auto.py
@@ -0,0 +1,93 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SeemAutomaticMaskGenerator
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+def interactive_seem_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=None):
+    """
+    Generate masks with SeemAutomaticMaskGenerator and draw them numbered.
+
+    `image` is resized with torchvision `Resize(int(text_size))` first;
+    `label_mode`, `alpha` and `anno_mode` (default: a fresh ['Mask'] list)
+    are forwarded to `Visualizer.draw_binary_mask_with_number`.
+
+    Returns the annotated image, or the resized input image unchanged when
+    the generator returns no mask.
+    """
+    if anno_mode is None:
+        anno_mode = ['Mask']
+
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = np.asarray(resize(image))
+    images = torch.from_numpy(image_ori.copy()).permute(2, 0, 1).cuda()
+
+    mask_generator = SeemAutomaticMaskGenerator(model)
+    outputs = mask_generator.generate(images)
+    if not outputs:
+        # No mask means nothing is ever drawn; return the plain resized image
+        return image_ori
+
+    visual = Visualizer(image_ori, metadata=metadata)
+    # Masks are drawn in order of descending area, numbered from 1
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    for label, ann in enumerate(sorted_anns, start=1):
+        demo = visual.draw_binary_mask_with_number(ann['segmentation'], text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+    return demo.get_image()
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Remove connected regions smaller than `area_thresh` from `mask`
+    (mode "islands") or fill holes smaller than it (mode "holes").
+    Returns the resulting mask and whether the mask was modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    fixing_holes = mode == "holes"
+    # XOR with True inverts the mask, so in "holes" mode the holes are labelled
+    to_label = (fixing_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(to_label, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small = [idx + 1 for idx, size in enumerate(sizes) if size < area_thresh]
+    if len(small) == 0:
+        return mask, False
+    if fixing_holes:
+        keep = [0] + small
+    else:
+        # Keep the non-small regions; if every region is below threshold, keep largest
+        keep = [lbl for lbl in range(1, n_labels) if lbl not in small] or [int(np.argmax(sizes)) + 1]
+    return np.isin(regions, keep), True
+
+def show_anns(anns):
+    """Draw every annotation's mask on the current matplotlib axes, largest first."""
+    if len(anns) == 0:
+        return
+    by_area = sorted(anns, key=lambda ann: ann['area'], reverse=True)
+    axes = plt.gca()
+    axes.set_autoscale_on(False)
+    for ann in by_area:
+        seg = ann['segmentation']
+        # Constant-colour RGB image, one random colour per annotation
+        overlay = np.ones((seg.shape[0], seg.shape[1], 3))
+        rgb = np.random.random((1, 3)).tolist()[0]
+        overlay[:, :] = rgb
+        # seg * 0.35 is stacked behind the RGB planes as a fourth channel
+        axes.imshow(np.dstack((overlay, seg * 0.35)))
\ No newline at end of file
diff --git a/task_adapter/semantic_sam/tasks/__init__.py b/task_adapter/semantic_sam/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..08e1951beacaaaa0b64235b0b2efeaf54f1866bf
--- /dev/null
+++ b/task_adapter/semantic_sam/tasks/__init__.py
@@ -0,0 +1,6 @@
+from .interactive_idino_m2m import interactive_infer_image as interactive_infer_image_idino_m2m
+from .interactive_idino_m2m import interactive_infer_image_semantic, interactive_infer_image_3l
+from .inference_semsam_m2m_auto import inference_semsam_m2m_auto
+from .interactive_idino_1o1_box import interactive_infer_image_box as interactive_infer_image_idino_m2m_box
+from .automatic_mask_generator import prompt_switch
+from .interactive_predictor import SemanticSAMPredictor
\ No newline at end of file
diff --git a/task_adapter/semantic_sam/tasks/automatic_mask_generator.py b/task_adapter/semantic_sam/tasks/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe28899c35974dc1321be0b36330762f49778877
--- /dev/null
+++ b/task_adapter/semantic_sam/tasks/automatic_mask_generator.py
@@ -0,0 +1,393 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area  # type: ignore
+
+from typing import Any, Dict, List, Optional, Tuple
+# from
+# from .modeling import Sam
+# from .predictor import SamPredictor
+from semantic_sam.utils.sam_utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+def prompt_switch(p):
+    """
+    Map a level number (1-6) to the index that the mask generator hands to
+    the model as ``level``.
+
+    Arguments:
+      p: The level to translate; anything accepted by ``int()``.
+    Returns:
+      int: The index paired with ``p``.
+    Raises:
+      NotImplementedError: If the level is not one of 1-6.
+    """
+    index = {1: 3, 2: 2, 3: 0, 4: 4, 5: 1, 6: 5}.get(int(p))
+    if index is None:
+        raise NotImplementedError
+    return index
+
+
+class SemanticSamAutomaticMaskGenerator:
+    def __init__(
+        self,
+        model,
+        points_per_side: Optional[int] = 32,
+        points_per_batch: int = 200,
+        pred_iou_thresh: float = 0.88,
+        stability_score_thresh: float = 0.92,
+        stability_score_offset: float = 1.0,
+        box_nms_thresh: float = 0.7,
+        crop_n_layers: int = 0,
+        crop_nms_thresh: float = 0.7,
+        crop_overlap_ratio: float = 512 / 1500,
+        crop_n_points_downscale_factor: int = 1,
+        point_grids: Optional[List[np.ndarray]] = None,
+        min_mask_region_area: int = 10,
+        output_mode: str = "binary_mask",
+        level: list = [1, 2, 3, 4, 5, 6],  # shared default; only read below, never mutated
+    ) -> None:
+        """
+        Using the given model, generates masks for the entire image.
+        Generates a grid of point prompts over the image, then filters
+        low quality and duplicate masks. NOTE(review): the defaults were said
+        to suit SAM with a ViT-H backbone; unverified for this model.
+
+        Arguments:
+          model: The predictor used for mask prediction (called as model.model.evaluate_demo).
+          points_per_side (int or None): The number of points to be sampled
+            along one side of the image. The total number of points is
+            points_per_side**2. If None, 'point_grids' must provide explicit
+            point sampling.
+          points_per_batch (int): Sets the number of points run simultaneously
+            by the model. Higher numbers may be faster but use more GPU memory.
+          pred_iou_thresh (float): A filtering threshold in [0,1], using the
+            model's predicted mask quality.
+          stability_score_thresh (float): A filtering threshold in [0,1], using
+            the stability of the mask under changes to the cutoff used to binarize
+            the model's mask predictions.
+          stability_score_offset (float): The amount to shift the cutoff when
+            calculating the stability score.
+          box_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks.
+          crop_n_layers (int): If >0, mask prediction will be run again on
+            crops of the image. Sets the number of layers to run, where each
+            layer has 2**i_layer number of image crops.
+          crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks between different crops.
+          crop_overlap_ratio (float): Sets the degree to which crops overlap.
+            In the first crop layer, crops will overlap by this fraction of
+            the image length. Later layers with more crops scale down this overlap.
+          crop_n_points_downscale_factor (int): The number of points-per-side
+            sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+          point_grids (list(np.ndarray) or None): A list over explicit grids
+            of points used for sampling, normalized to [0,1]. The nth grid in the
+            list is used in the nth crop layer. Exclusive with points_per_side.
+          min_mask_region_area (int): If >0, postprocessing will be applied
+            to remove disconnected regions and holes in masks with area smaller
+            than min_mask_region_area. Requires opencv.
+          output_mode (str): The form masks are returned in. Can be 'binary_mask',
+            'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+            For large resolutions, 'binary_mask' may consume large amounts of memory.
+          level (list): Level numbers (1-6); each is translated by prompt_switch and passed to the model.
+        """
+        self.level = [prompt_switch(l) for l in level]  # level numbers -> indices handed to the model
+        assert (points_per_side is None) != (
+            point_grids is None
+        ), "Exactly one of points_per_side or point_grid must be provided."
+        if points_per_side is not None:
+            self.point_grids = build_all_layer_point_grids(
+                points_per_side,
+                crop_n_layers,
+                crop_n_points_downscale_factor,
+            )
+        elif point_grids is not None:
+            self.point_grids = point_grids
+        else:  # only reachable when the assert above is stripped (python -O)
+            raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+        assert output_mode in [
+            "binary_mask",
+            "uncompressed_rle",
+            "coco_rle",
+        ], f"Unknown output_mode {output_mode}."
+        if output_mode == "coco_rle":
+            from pycocotools import mask as mask_utils  # type: ignore # noqa: F401
+
+        if min_mask_region_area > 0:
+            import cv2  # type: ignore # noqa: F401
+
+        self.predictor = model
+        self.points_per_batch = points_per_batch
+        self.pred_iou_thresh = pred_iou_thresh
+        self.stability_score_thresh = stability_score_thresh
+        self.stability_score_offset = stability_score_offset
+        self.box_nms_thresh = box_nms_thresh
+        self.crop_n_layers = crop_n_layers
+        self.crop_nms_thresh = crop_nms_thresh
+        self.crop_overlap_ratio = crop_overlap_ratio
+        self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+        self.min_mask_region_area = min_mask_region_area
+        self.output_mode = output_mode
+
+    @torch.no_grad()
+    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """
+        Produce mask annotations for a single image.
+
+        Runs the point-grid prediction, optionally cleans up small regions,
+        encodes the masks and packs everything into plain dicts.
+
+        Arguments:
+          image: The image to segment. The pipeline reads image.shape[-2:] as
+            (height, width) and calls image.device, so despite the np.ndarray
+            annotation it has to be a torch tensor with H and W last.
+
+        Returns:
+           list(dict(str, any)): One record per mask, with the keys:
+               segmentation (dict(str, any) or np.ndarray): The mask. An HW
+                 array if output_mode='binary_mask', else a dict with the RLE.
+               bbox (list(float)): The box around the mask, in XYWH format.
+               area (int): The area in pixels of the mask.
+               predicted_iou (float): The model's own estimate of the mask's
+                 quality; already filtered by pred_iou_thresh.
+               point_coords (list(list(float))): The prompt fed to the model
+                 for this mask (4 values: the point plus two 0.005 entries).
+               stability_score (float): Mask stability measure; already
+                 filtered by stability_score_thresh.
+               crop_box (list(float)): The crop box the mask came from, in
+                 XYWH format.
+        """
+        records = self._generate_masks(image)
+
+        # Optionally clean up tiny islands/holes, which re-runs box NMS
+        if self.min_mask_region_area > 0:
+            nms_thresh = max(self.box_nms_thresh, self.crop_nms_thresh)
+            records = self.postprocess_small_regions(
+                records, self.min_mask_region_area, nms_thresh
+            )
+
+        # Encode the masks in the requested output format
+        rles = records["rles"]
+        if self.output_mode == "coco_rle":
+            records["segmentations"] = [coco_encode_rle(r) for r in rles]
+        elif self.output_mode == "binary_mask":
+            records["segmentations"] = [rle_to_mask(r) for r in rles]
+        else:
+            records["segmentations"] = rles
+
+        # Assemble one annotation dict per mask
+        return [
+            {
+                "segmentation": segmentation,
+                "area": area_from_rle(records["rles"][i]),
+                "bbox": box_xyxy_to_xywh(records["boxes"][i]).tolist(),
+                "predicted_iou": records["iou_preds"][i].item(),
+                "point_coords": [records["points"][i].tolist()],
+                "stability_score": records["stability_score"][i].item(),
+                "crop_box": box_xyxy_to_xywh(records["crop_boxes"][i]).tolist(),
+            }
+            for i, segmentation in enumerate(records["segmentations"])
+        ]
+
+    def _generate_masks(self, image: np.ndarray) -> MaskData:
+        """Predict masks for ``image`` and return them as MaskData after to_numpy()."""
+        orig_size = image.shape[-2:]  # the last two dims are taken as the image size
+        crop_boxes, layer_idxs = generate_crop_boxes(
+            orig_size, self.crop_n_layers, self.crop_overlap_ratio
+        )
+
+        # Iterate over image crops (exactly one is supported, see the assert)
+        assert len(crop_boxes)==1
+        data = MaskData()
+        for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+            crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+
+            data.cat(crop_data)
+        # Remove duplicate masks between crops
+        # NOTE(review): this branch cannot run while the assert above is active.
+        if len(crop_boxes) > 1:
+            # Prefer masks from smaller crops
+            scores = 1 / box_area(data["crop_boxes"])
+            scores = scores.to(data["boxes"].device)
+            keep_by_nms = batched_nms(
+                data["boxes"].float(),
+                scores,
+                torch.zeros(len(data["boxes"])),  # categories
+                iou_threshold=self.crop_nms_thresh,
+            )
+            data.filter(keep_by_nms)
+
+        data.to_numpy()
+        return data
+
+    def _process_crop(
+        self,
+        image: np.ndarray,
+        crop_box: List[int],
+        crop_layer_idx: int,
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Predict masks for one crop box, batch by batch, then drop duplicates.
+
+        No cropping is applied: the whole ``image`` goes to the model and the
+        point grid of ``crop_layer_idx`` is used as is (not rescaled).
+
+        Returns:
+          MaskData: The masks kept by box NMS, with 'boxes' passed through
+            uncrop_boxes_xyxy and one 'crop_boxes' entry per remaining RLE.
+        """
+        im_size = image.shape[-2:]
+        points_for_image = self.point_grids[crop_layer_idx]
+        # Reset the feature cache; _process_batch refills it on the first batch
+        self.enc_features = None
+        data = MaskData()
+        for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+            batch_data = self._process_batch(image, points, im_size, crop_box, orig_size)
+            data.cat(batch_data)
+            del batch_data
+
+        # Remove duplicate masks within this crop. zeros_like keeps the category
+        # tensor on the same device as the boxes.
+        keep_by_nms = batched_nms(
+            data["boxes"].float(),
+            data["iou_preds"],
+            torch.zeros_like(data["boxes"][:, 0]),  # categories
+            iou_threshold=self.box_nms_thresh,
+        )
+        data.filter(keep_by_nms)
+        # Return to the original image frame
+        data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+        data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+        return data
+
+    def _process_batch(
+        self,
+        images,
+        points: np.ndarray,
+        im_size: Tuple[int, ...],
+        crop_box: List[int],
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """Run the model on one batch of points; return the masks that pass the
+        IoU, stability and crop-edge filters as RLEs (``im_size`` is unused)."""
+        orig_h, orig_w = orig_size
+        data = {"image": images, "height": orig_h, "width": orig_w}
+        points=torch.tensor(points,dtype=torch.float).to(images.device)
+        points = torch.cat([points, points.new_tensor([[0.005, 0.005]]).repeat(len(points), 1)], dim=-1)  # each prompt becomes 4 values
+        data['targets'] = [dict()]
+        data['targets'][0]['points']=points
+        data['targets'][0]['pb']=points.new_tensor([0.]*len(points))  # one 0. entry per prompt
+        batch_inputs = [data]
+        if self.enc_features is None:  # first batch: also fetch the features and cache them
+            masks, iou_preds,mask_features,multi_scale_features= self.predictor.model.evaluate_demo(batch_inputs,None,None,return_features=True, level=self.level)
+            self.enc_features=(mask_features,multi_scale_features)
+        else:  # later batches: hand the cached features back to the model
+            masks, iou_preds= self.predictor.model.evaluate_demo(batch_inputs,None,None,self.enc_features[0],self.enc_features[1], level=self.level)
+        # Each prompt is repeated once per level, presumably to line up with the flattened iou_preds
+        data = MaskData(
+            masks=masks,
+            iou_preds=iou_preds.flatten(),
+            points=torch.as_tensor(points[:,None].repeat(1,len(self.level), 1).view(-1,4)),
+        )
+        del masks
+        # Filter by predicted IoU
+        keep_mask = data["iou_preds"] > self.pred_iou_thresh
+        data.filter(keep_mask)
+
+        # Calculate stability score
+        data["stability_score"] = calculate_stability_score(
+            data["masks"], 0.0, self.stability_score_offset
+        )
+        keep_mask = data["stability_score"] >= self.stability_score_thresh
+        data.filter(keep_mask)
+
+        # Threshold masks and calculate boxes
+        data["masks"] = data["masks"] > 0.0
+        data["boxes"] = batched_mask_to_box(data["masks"])
+
+        # Filter boxes that touch crop boundaries
+        keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+        if not torch.all(keep_mask):
+            data.filter(keep_mask)
+
+        # Compress to RLE
+        data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+        data["rles"] = mask_to_rle_pytorch(data["masks"])
+        del data["masks"]
+
+        return data
+
+    @staticmethod
+    def postprocess_small_regions(
+        mask_data: MaskData, min_area: int, nms_thresh: float
+    ) -> MaskData:
+        """
+        Fills small holes and drops small islands in every mask, then runs box
+        NMS again because the cleanup may have produced duplicates.
+
+        mask_data is modified in place and also returned.
+
+        Requires open-cv as a dependency.
+
+        Arguments:
+          mask_data (MaskData): Must hold 'rles' and 'boxes'.
+          min_area (int): Area threshold handed to remove_small_regions.
+          nms_thresh (float): Box IoU threshold for the repeated NMS.
+        """
+        if len(mask_data["rles"]) == 0:
+            return mask_data
+
+        cleaned = []
+        scores = []
+        for rle in mask_data["rles"]:
+            mask = rle_to_mask(rle)
+            mask, holes_filled = remove_small_regions(mask, min_area, mode="holes")
+            mask, islands_dropped = remove_small_regions(mask, min_area, mode="islands")
+            cleaned.append(torch.as_tensor(mask))
+            # Untouched masks score 1 and edited ones 0, so NMS favours masks
+            # that needed no postprocessing
+            scores.append(0.0 if holes_filled or islands_dropped else 1.0)
+
+        # Recompute the boxes from the cleaned masks and de-duplicate
+        masks = torch.stack(cleaned, dim=0)
+        boxes = batched_mask_to_box(masks)
+        keep_by_nms = batched_nms(
+            boxes.float(),
+            torch.as_tensor(scores),
+            torch.zeros(len(boxes)),  # categories
+            iou_threshold=nms_thresh,
+        )
+
+        # Masks that were not edited keep their existing RLE and box
+        for kept in keep_by_nms:
+            if scores[kept] != 0.0:
+                continue
+            mask_data["rles"][kept] = mask_to_rle_pytorch(masks[kept].unsqueeze(0))[0]
+            mask_data["boxes"][kept] = boxes[kept]  # update res directly
+        mask_data.filter(keep_by_nms)
+
+        return mask_data
\ No newline at end of file
diff --git a/task_adapter/semantic_sam/tasks/inference_semsam_m2m_auto.py b/task_adapter/semantic_sam/tasks/inference_semsam_m2m_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..a939a3c878938d415b337391c741b352dfe96f9a
--- /dev/null
+++ b/task_adapter/semantic_sam/tasks/inference_semsam_m2m_auto.py
@@ -0,0 +1,108 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SemanticSamAutomaticMaskGenerator
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+def inference_semsam_m2m_auto(model, image, level, all_classes, all_parts, thresh, text_size, hole_scale, island_scale, semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    """
+    Segment ``image`` automatically and draw every mask with a numeric label.
+
+    Arguments:
+      model: Handed to SemanticSamAutomaticMaskGenerator as its model.
+      image: Input for torchvision's Resize (presumably a PIL image --
+        TODO confirm against the callers).
+      level: Level numbers forwarded to the mask generator.
+      text_size: Size passed to Resize before inference (after ``int()``).
+      label_mode, alpha, anno_mode: Forwarded unchanged to
+        Visualizer.draw_binary_mask_with_number.
+      All remaining parameters are accepted but not used by this function.
+
+    Returns:
+      tuple: ``(im, sorted_anns)`` -- the visualizer's rendered image and the
+        annotations sorted by decreasing area; masks are numbered from 1 in
+        that order. When no mask was produced, ``im`` is the resized input
+        image without any overlay.
+    """
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = np.asarray(resize(image))
+    # The generator reads H and W from the last two dims; inference is pinned to CUDA here
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+
+    mask_generator = SemanticSamAutomaticMaskGenerator(model,points_per_side=32,
+            pred_iou_thresh=0.88,
+            stability_score_thresh=0.92,
+            min_mask_region_area=10,
+            level=level,
+        )
+    outputs = mask_generator.generate(images)
+
+    visual = Visualizer(image_ori, metadata=metadata)
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+
+    # `demo` stays None when there is nothing to draw, so the function no
+    # longer fails on an undefined name if the generator returns no mask.
+    demo = None
+    for label, ann in enumerate(sorted_anns, start=1):
+        demo = visual.draw_binary_mask_with_number(ann['segmentation'], text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+
+    # Fall back to the plain resized image when no mask was drawn
+    im = image_ori if demo is None else demo.get_image()
+    return im, sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Fills small holes (mode="holes") or drops small connected components
+    (mode="islands") of a binary mask. Returns the cleaned mask and a flag
+    telling whether the mask was modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    fill_holes = mode == "holes"
+    working = (fill_holes ^ mask).astype(np.uint8)  # "holes" mode works on the inverted mask
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working, 8)
+    areas = stats[:, -1][1:]  # Row 0 is background label
+    too_small = [idx + 1 for idx, area in enumerate(areas) if area < area_thresh]
+    if not too_small:
+        return mask, False
+    if fill_holes:
+        keep = [0] + too_small
+    else:
+        keep = [lbl for lbl in range(1, n_labels) if lbl not in too_small]
+        # If every region is below threshold, keep largest
+        keep = keep or [int(np.argmax(areas)) + 1]
+    return np.isin(regions, keep), True
+
+def show_anns(anns):
+    """
+    Overlay every mask in ``anns`` on the current matplotlib axes, largest
+    area first, each filled with one random color and drawn at 0.35 alpha.
+    """
+    if len(anns) == 0:
+        return
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    for ann in sorted(anns, key=(lambda x: x['area']), reverse=True):
+        m = ann['segmentation']
+        img = np.ones((m.shape[0], m.shape[1], 3))
+        img[:, :] = np.random.random((1, 3)).tolist()[0]  # one color per mask
+        # The mask scaled by 0.35 is appended as the fourth (alpha) channel.
+        ax.imshow(np.dstack((img, m*0.35)))
\ No newline at end of file
diff --git a/task_adapter/semantic_sam/tasks/interactive_idino_1o1_box.py b/task_adapter/semantic_sam/tasks/interactive_idino_1o1_box.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccfe774bf82c05e8345c92374f5f87e07264ee92
--- /dev/null
+++ b/task_adapter/semantic_sam/tasks/interactive_idino_1o1_box.py
@@ -0,0 +1,144 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BitMasks
+from semantic_sam.utils import box_ops
+
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+def interactive_infer_image_box(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
+    """
+    Segment the region selected by the box around the user's drawn mask.
+
+    Arguments:
+      model: Wrapper whose ``model.evaluate_demo`` is run with task='demo_box'.
+      image (dict): The picture under 'image' and the user's drawing under
+        'mask'; both go through the same torchvision Resize.
+      all_classes, all_parts (str): ':'-separated names handed to the model.
+      thresh: Minimum predicted IoU for a mask to be kept; a value that
+        cannot be converted to float counts as 0.0.
+      text_size: Size passed to Resize (after ``int()``).
+      hole_scale, island_scale: Area thresholds for remove_small_regions.
+      All remaining parameters are accepted but not used by this function.
+
+    Returns:
+      tuple: Two lists holding the same rendered PIL images, one image per
+        kept mask: the first in order of decreasing predicted IoU, the
+        second in order of increasing mask area.
+    """
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = resize(image['image'])
+    mask_ori = resize(image['mask'])
+    width = image_ori.size[0]
+    height = image_ori.size[1]
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+    all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
+
+    data = {"image": images, "height": height, "width": width}
+
+    # Only the first channel of the drawing is used as the prompt mask
+    mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
+    mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
+    points=mask_ori.nonzero().float().to(images.device)
+    box_xyxy = None
+    if len(points)==0:
+        # Nothing was drawn: use a fixed prompt (presumably a centered box in
+        # normalized cxcywh) and remember that there is no box to draw.
+        point=points.new_tensor([[0.5,0.5,0.5,0.5]])
+    else:
+        box_xyxy = BitMasks(mask_ori.unsqueeze(0) > 0).get_bounding_boxes().tensor
+        h = mask_ori.shape[0]
+        w = mask_ori.shape[1]
+        # Box of the drawing as cxcywh, divided by the mask width/height
+        point = (box_ops.box_xyxy_to_cxcywh(box_xyxy) / torch.as_tensor([w, h, w, h])).cuda()
+
+    data['targets'] = [dict()]
+    data['targets'][0]['points']=point
+    data['targets'][0]['pb']=point.new_tensor([1.])
+
+    # The model is called with a batch holding this single image
+    masks,ious = model.model.evaluate_demo([data],all_classes,all_parts, task='demo_box')
+
+    ious=ious[0,0]
+    ids=torch.argsort(ious,descending=True)
+
+    try:
+        thresh=float(thresh)
+    except (TypeError, ValueError, OverflowError):
+        thresh=0.0
+
+    reses=[]
+    mask_ls=[]
+    areas=[]
+    ranked_masks = masks[ids]
+    for i,(pred_masks_pos,iou) in enumerate(zip(ranked_masks,ious[ids])):
+        iou=round(float(iou),2)
+        mask=(pred_masks_pos>0.0).cpu().numpy()
+        # Skip masks scored below the threshold and near-duplicates
+        # (mask IoU > 0.95) of masks that were already kept ...
+        conti = iou<thresh or any(
+            np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95 for m in mask_ls
+        )
+        # ... except that the last candidate is always kept if nothing else was.
+        if i == len(ranked_masks)-1 and not mask_ls:
+            conti=False
+        if conti:
+            continue
+
+        mask_ls.append(mask)
+        areas.append(mask.sum())  # area before small-region cleanup
+        mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
+        mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
+        # np.float was removed in NumPy 1.24; the builtin float is the same dtype
+        mask=mask.astype(float)
+        # Draw the mask labelled with its score, plus the prompt box if there is one
+        visual = Visualizer(image_ori, metadata=metadata)
+        demo = visual.draw_binary_mask(mask, color=[0.,0.,1.0], text=f'{iou}')
+        if box_xyxy is not None:
+            demo = visual.draw_box(box_xyxy[0])
+        reses.append(Image.fromarray(demo.get_image()))
+
+    # Second ordering of the same images: smallest mask first
+    ids=[int(i) for i in torch.argsort(torch.tensor(areas),descending=False)]
+
+    torch.cuda.empty_cache()
+
+    return reses,[reses[i] for i in ids]
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Fills small holes (mode="holes") or drops small connected components
+    (mode="islands") of a binary mask. Returns the cleaned mask and a flag
+    telling whether the mask was modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    fill_holes = mode == "holes"
+    working = (fill_holes ^ mask).astype(np.uint8)  # "holes" mode works on the inverted mask
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working, 8)
+    areas = stats[:, -1][1:]  # Row 0 is background label
+    too_small = [idx + 1 for idx, area in enumerate(areas) if area < area_thresh]
+    if not too_small:
+        return mask, False
+    if fill_holes:
+        keep = [0] + too_small
+    else:
+        keep = [lbl for lbl in range(1, n_labels) if lbl not in too_small]
+        # If every region is below threshold, keep largest
+        keep = keep or [int(np.argmax(areas)) + 1]
+    return np.isin(regions, keep), True
\ No newline at end of file
diff --git a/task_adapter/semantic_sam/tasks/interactive_idino_m2m.py b/task_adapter/semantic_sam/tasks/interactive_idino_m2m.py
new file mode 100644
index 0000000000000000000000000000000000000000..93775c38611b1da9825003e70356e6c5a0523bae
--- /dev/null
+++ b/task_adapter/semantic_sam/tasks/interactive_idino_m2m.py
@@ -0,0 +1,322 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+def interactive_infer_image(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    """
+    Segment the object under the user's scribble and draw each kept mask with a number.
+
+    Args:
+        model: wrapper whose ``model.model.evaluate_demo`` runs the segmentation.
+        image: dict holding PIL images under "image" and "mask" (the scribble).
+        all_classes, all_parts: ':'-separated vocabularies, optionally wrapped in
+            quotes/brackets, forwarded to ``evaluate_demo`` as lists.
+        thresh: minimum predicted IoU; values that cannot be parsed become 0.0.
+        text_size: size handed to ``transforms.Resize`` for both images.
+        hole_scale, island_scale: area thresholds for ``remove_small_regions``.
+        semantic, refimg, reftxt, audio_pth, video_pth: accepted but unused here.
+        label_mode, alpha, anno_mode: forwarded to ``draw_binary_mask_with_number``.
+            The list default of ``anno_mode`` is shared between calls; it is only read here.
+
+    Returns:
+        ``(images, images_by_area)``: one PIL image per kept mask in descending
+        predicted-IoU order, and the same images sorted by ascending mask area.
+    """
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = resize(image['image'])
+    mask_ori = resize(image['mask'])
+    width, height = image_ori.size
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+    all_classes = all_classes.strip().strip("\"[]").split(':')
+    all_parts = all_parts.strip().strip("\"[]").split(':')
+
+    # The prompt is the centroid of the non-zero pixels in the scribble's first channel.
+    mask_ori = torch.from_numpy(np.asarray(mask_ori)[:,:,0].copy())
+    points = mask_ori.nonzero().float().to(images.device)
+    if len(points) == 0:
+        # Nothing was drawn: prompt with the image center instead.
+        point = points.new_tensor([[0.5,0.5,0.006,0.006]])
+    else:
+        # nonzero() yields (row, col); divide by (H, W) and flip to normalized (x, y).
+        extent = points.new_tensor([mask_ori.shape[0], mask_ori.shape[1]])
+        xy = (points.mean(0) / extent).flip(0)[None]
+        point = torch.cat([xy, points.new_tensor([[0.005,0.005]])], dim=-1)
+
+    data = {"image": images, "height": height, "width": width}
+    data['targets'] = [{'points': point, 'pb': point.new_tensor([0.])}]
+    masks,ious = model.model.evaluate_demo([data],all_classes,all_parts)
+
+    # Best predicted IoU first.
+    ious = ious[0,0]
+    order = torch.argsort(ious,descending=True)
+    sorted_masks = masks[order]
+    last = len(sorted_masks) - 1
+
+    try:
+        thresh = float(thresh)
+    except (TypeError, ValueError):
+        thresh = 0.0
+
+    reses = []
+    mask_ls = []
+    areas = []
+    for i,(pred_masks_pos,iou) in enumerate(zip(sorted_masks,ious[order])):
+        iou = round(float(iou),2)
+        mask = (pred_masks_pos>0.0).cpu().numpy()
+        # Skip low-score candidates and near-duplicates (IoU > 0.95) of a kept mask.
+        skip = iou<thresh or any(
+            np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95 for m in mask_ls
+        )
+        # The last candidate is kept regardless when nothing else survived, so the
+        # result is never empty while the model returned at least one mask.
+        if skip and not (i == last and not mask_ls):
+            continue
+        mask_ls.append(mask)
+        areas.append(mask.sum())
+
+        mask,_ = remove_small_regions(mask,int(hole_scale),mode="holes")
+        mask,_ = remove_small_regions(mask,int(island_scale),mode="islands")
+        # ``np.float`` was removed in NumPy 1.24; builtin ``float`` is the same dtype.
+        mask = mask.astype(float)
+
+        # ``label`` used to be undefined here (NameError on the first kept mask).
+        # Number the kept masks 1, 2, 3, ... in the order they are drawn.
+        label = len(mask_ls)
+        visual = Visualizer(image_ori, metadata=metadata)
+        demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+        reses.append(Image.fromarray(demo.get_image()))
+
+    # Indices of the kept masks from smallest to largest area.
+    by_area = [int(i) for i in torch.argsort(torch.tensor(areas),descending=False)]
+
+    torch.cuda.empty_cache()
+
+    return reses,[reses[i] for i in by_area]
+
+def interactive_infer_image_3l(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
+    """
+    Segment the object under the user's scribble with ``level=[0,1,2]`` and draw each
+    kept mask captioned "<iou>_<class>_<class score>", plus a red prompt marker.
+
+    Args:
+        model: wrapper whose ``model.model.evaluate_demo`` runs the segmentation.
+        image: dict holding PIL images under "image" and "mask" (the scribble).
+        all_classes, all_parts: ':'-separated vocabularies, optionally wrapped in
+            quotes/brackets, forwarded to ``evaluate_demo`` as lists.
+        thresh: minimum predicted IoU; values that cannot be parsed become 0.0.
+        text_size: size handed to ``transforms.Resize`` for both images.
+        hole_scale, island_scale: area thresholds for ``remove_small_regions``.
+        semantic, refimg, reftxt, audio_pth, video_pth: accepted but unused here.
+
+    Returns:
+        ``(images, images_by_area)``: one PIL image per kept mask in descending
+        predicted-IoU order, and the same images sorted by ascending mask area.
+    """
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = resize(image['image'])
+    mask_ori = resize(image['mask'])
+    width, height = image_ori.size
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+    all_classes = all_classes.strip().strip("\"[]").split(':')
+    all_parts = all_parts.strip().strip("\"[]").split(':')
+
+    # The prompt is the centroid of the non-zero pixels in the scribble's first channel.
+    mask_ori = torch.from_numpy(np.asarray(mask_ori)[:,:,0].copy())
+    points = mask_ori.nonzero().float().to(images.device)
+    extent = points.new_tensor([mask_ori.shape[0], mask_ori.shape[1]])  # (H, W)
+    if len(points) == 0:
+        # Nothing drawn: prompt and marker both use the center (marker used to hit (0, 0)).
+        centroid = extent / 2
+        point = points.new_tensor([[0.5,0.5,0.006,0.006]])
+    else:
+        centroid = points.mean(0)
+        # nonzero() yields (row, col); divide by (H, W) and flip to normalized (x, y).
+        xy = (centroid / extent).flip(0)[None]
+        point = torch.cat([xy, points.new_tensor([[0.005,0.005]])], dim=-1)
+
+    data = {"image": images, "height": height, "width": width}
+    data['targets'] = [{'points': point, 'pb': point.new_tensor([0.])}]
+    masks, ious, pred_class, pred_class_score = model.model.evaluate_demo([data],all_classes,all_parts, level=[0,1,2])
+
+    # Best predicted IoU first; class names and scores are reordered to match.
+    ious = ious[0,0]
+    order = torch.argsort(ious,descending=True)
+    sorted_masks = masks[order]
+    sorted_classes = [pred_class[i] for i in order]
+    sorted_scores = [pred_class_score[i] for i in order]
+    last = len(sorted_masks) - 1
+
+    try:
+        thresh = float(thresh)
+    except (TypeError, ValueError):
+        thresh = 0.0
+
+    # Pixel box of the red prompt marker (6x6, clipped to the image).
+    row, col = int(centroid[0]), int(centroid[1])
+    y0, y1 = max(0, row - 3), min(mask_ori.shape[0], row + 3)
+    x0, x1 = max(0, col - 3), min(mask_ori.shape[1], col + 3)
+
+    reses = []
+    mask_ls = []
+    areas = []
+    candidates = zip(sorted_masks, ious[order], sorted_classes, sorted_scores)
+    for i,(pred_masks_pos,iou,cls_name,cls_score) in enumerate(candidates):
+        iou = round(float(iou),2)
+        texts = f'{iou}_{cls_name}_{cls_score}'
+        mask = (pred_masks_pos>0.0).cpu().numpy()
+        # Skip low-score candidates and near-duplicates (IoU > 0.95) of a kept mask.
+        skip = iou<thresh or any(
+            np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95 for m in mask_ls
+        )
+        # The last candidate is kept regardless when nothing else survived.
+        if skip and not (i == last and not mask_ls):
+            continue
+        mask_ls.append(mask)
+        areas.append(mask.sum())
+        mask,_ = remove_small_regions(mask,int(hole_scale),mode="holes")
+        mask,_ = remove_small_regions(mask,int(island_scale),mode="islands")
+        # ``np.float`` was removed in NumPy 1.24; builtin ``float`` is the same dtype.
+        mask = mask.astype(float)
+        visual = Visualizer(image_ori, metadata=metadata)
+        demo = visual.draw_binary_mask(mask, color=[0.,0.,1.0], text=texts)
+        res = demo.get_image()
+        res[y0:y1,x0:x1] = (255,0,0)
+        reses.append(Image.fromarray(res))
+    # Indices of the kept masks from smallest to largest area.
+    by_area = [int(i) for i in torch.argsort(torch.tensor(areas),descending=False)]
+
+    torch.cuda.empty_cache()
+
+    return reses,[reses[i] for i in by_area]
+
+def interactive_infer_image_semantic(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
+    """
+    Segment the object under the user's scribble (target field ``pb`` set to 1.) and
+    draw each kept mask captioned with its predicted IoU, plus a red prompt marker.
+
+    Args:
+        model: wrapper whose ``model.model.evaluate_demo`` runs the segmentation.
+        image: dict holding PIL images under "image" and "mask" (the scribble).
+        all_classes, all_parts: ':'-separated vocabularies, optionally wrapped in
+            quotes/brackets, forwarded to ``evaluate_demo`` as lists.
+        thresh: minimum predicted IoU; values that cannot be parsed become 0.0.
+        text_size: size handed to ``transforms.Resize`` for both images.
+        hole_scale, island_scale: area thresholds for ``remove_small_regions``.
+        semantic, refimg, reftxt, audio_pth, video_pth: accepted but unused here.
+
+    Returns:
+        ``(images, images_by_area)``: one PIL image per kept mask in descending
+        predicted-IoU order, and the same images sorted by ascending mask area.
+    """
+    resize = transforms.Resize(int(text_size), interpolation=Image.BICUBIC)
+    image_ori = resize(image['image'])
+    mask_ori = resize(image['mask'])
+    width, height = image_ori.size
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+    all_classes = all_classes.strip().strip("\"[]").split(':')
+    all_parts = all_parts.strip().strip("\"[]").split(':')
+
+    # The prompt is the centroid of the non-zero pixels in the scribble's first channel.
+    mask_ori = torch.from_numpy(np.asarray(mask_ori)[:,:,0].copy())
+    points = mask_ori.nonzero().float().to(images.device)
+    extent = points.new_tensor([mask_ori.shape[0], mask_ori.shape[1]])  # (H, W)
+    if len(points) == 0:
+        # Nothing drawn: prompt and marker both use the center (marker used to hit (0, 0)).
+        centroid = extent / 2
+        point = points.new_tensor([[0.5,0.5,0.006,0.006]])
+    else:
+        centroid = points.mean(0)
+        # nonzero() yields (row, col); divide by (H, W) and flip to normalized (x, y).
+        xy = (centroid / extent).flip(0)[None]
+        point = torch.cat([xy, points.new_tensor([[0.005,0.005]])], dim=-1)
+
+    data = {"image": images, "height": height, "width": width}
+    # 'pb' used to be set to 0. and immediately overwritten; only 1. ever took effect.
+    data['targets'] = [{'points': point, 'pb': point.new_tensor([1.])}]
+    masks,ious = model.model.evaluate_demo([data],all_classes,all_parts)
+
+    ious = ious[0,0]
+    order = torch.argsort(ious,descending=True)  # best predicted IoU first
+    sorted_masks = masks[order]
+    last = len(sorted_masks) - 1
+
+    try:
+        thresh = float(thresh)
+    except (TypeError, ValueError):
+        thresh = 0.0
+
+    # Pixel box of the red prompt marker (6x6, clipped to the image).
+    row, col = int(centroid[0]), int(centroid[1])
+    y0, y1 = max(0, row - 3), min(mask_ori.shape[0], row + 3)
+    x0, x1 = max(0, col - 3), min(mask_ori.shape[1], col + 3)
+
+    reses = []
+    mask_ls = []
+    areas = []
+    for i,(pred_masks_pos,iou) in enumerate(zip(sorted_masks,ious[order])):
+        iou = round(float(iou),2)
+        mask = (pred_masks_pos>0.0).cpu().numpy()
+        # Skip low-score candidates and near-duplicates (IoU > 0.95) of a kept mask.
+        skip = iou<thresh or any(
+            np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95 for m in mask_ls
+        )
+        # The last candidate is kept regardless when nothing else survived.
+        if skip and not (i == last and not mask_ls):
+            continue
+        mask_ls.append(mask)
+        areas.append(mask.sum())
+        mask,_ = remove_small_regions(mask,int(hole_scale),mode="holes")
+        mask,_ = remove_small_regions(mask,int(island_scale),mode="islands")
+        # ``np.float`` was removed in NumPy 1.24; builtin ``float`` is the same dtype.
+        mask = mask.astype(float)
+        visual = Visualizer(image_ori, metadata=metadata)
+        demo = visual.draw_binary_mask(mask, color=[0.,0.,1.0], text=f'{iou}')
+        res = demo.get_image()
+        res[y0:y1,x0:x1] = (255,0,0)
+        reses.append(Image.fromarray(res))
+    by_area = [int(i) for i in torch.argsort(torch.tensor(areas),descending=False)]
+
+    torch.cuda.empty_cache()
+
+    return reses,[reses[i] for i in by_area]
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Clean a binary mask: mode="holes" fills small holes, mode="islands" drops
+    small disconnected pieces, where "small" means a connected component with
+    fewer than area_thresh pixels. Returns the mask and whether it changed.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    fill_holes = mode == "holes"
+    # XOR with True inverts the mask, so holes become the labelled components.
+    target = (fill_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(target, 8)
+    sizes = stats[:, -1][1:]  # per-label areas; row 0 is the background label
+    small = [idx for idx, size in enumerate(sizes, start=1) if size < area_thresh]
+    if not small:
+        return mask, False
+    keep = [0] + small
+    if not fill_holes:
+        keep = [lbl for lbl in range(n_labels) if lbl not in keep]
+        if not keep:  # every region is below threshold: keep the largest
+            keep = [int(np.argmax(sizes)) + 1]
+    return np.isin(regions, keep), True
\ No newline at end of file
diff --git a/task_adapter/semantic_sam/tasks/interactive_predictor.py b/task_adapter/semantic_sam/tasks/interactive_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..70252a37e516b0473205f628c1a08a9951ede1e6
--- /dev/null
+++ b/task_adapter/semantic_sam/tasks/interactive_predictor.py
@@ -0,0 +1,139 @@
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+
+class SemanticSAMPredictor:
+    """
+    Click-prompted wrapper around a Semantic-SAM model: ``predict`` queries the model
+    for one point and ``process_multi_mask`` filters and renders the returned masks.
+    """
+
+    def __init__(self, model, thresh=0.5, text_size=640, hole_scale=100, island_scale=100):
+        """
+        thresh: iou thresh to filter low confidence objects
+        text_size: resize the input image short edge for the model to process
+        hole_scale: fill in small holes as in SAM
+        island_scale: remove small regions as in SAM
+        """
+        self.model = model
+        self.thresh = thresh
+        self.text_size = text_size  # was mistakenly initialised from hole_scale
+        self.hole_scale = hole_scale
+        self.island_scale = island_scale
+        self.point = None  # pixel (x, y) of the last prompt; set by predict()
+
+    def predict(self, image_ori, image, point=None):
+        """
+        produce up to 6 prediction results for each click
+
+        image_ori: array whose first two dimensions are (height, width).
+        image: model input, stored unchanged under data["image"].
+        point: [[x, y]] prompt, presumably normalized to [0, 1] since it is scaled by
+            (width, height) below; the image center when None.
+        Returns the (masks, ious) pair produced by model.model.evaluate_demo.
+        """
+        # Rows come first, so shape[0] is the height (the two values used to be swapped).
+        height, width = image_ori.shape[0], image_ori.shape[1]
+        data = {"image": image, "height": height, "width": width}
+
+        if point is None:
+            point = torch.tensor([[0.5, 0.5, 0.006, 0.006]]).cuda()
+        else:
+            point = torch.tensor(point).cuda()
+            # Two extra values are appended after (x, y), as in the default above.
+            point = torch.cat([point, point.new_tensor([[0.005, 0.005]])], dim=-1)
+
+        self.point = point[:, :2].clone() * (torch.tensor([width, height]).to(point))
+
+        data['targets'] = [dict()]
+        data['targets'][0]['points'] = point
+        data['targets'][0]['pb'] = point.new_tensor([0.])
+
+        batch_inputs = [data]
+        masks, ious = self.model.model.evaluate_demo(batch_inputs)
+
+        return masks, ious
+
+    def process_multi_mask(self, masks, ious, image_ori):
+        """
+        Filter the candidate masks and draw each kept one over image_ori.
+
+        Candidates are visited by descending predicted IoU. One is skipped when its
+        score is below self.thresh or it overlaps an already kept mask with IoU > 0.95;
+        the last candidate is kept regardless if nothing else was.
+
+        Returns (images in score order, the same images sorted by ascending mask area).
+        """
+        ious = ious[0, 0]
+        order = torch.argsort(ious, descending=True)
+        sorted_masks = masks[order]
+        last = len(sorted_masks) - 1
+
+        reses = []
+        mask_ls = []
+        areas = []
+        for i, (pred_masks_pos, iou) in enumerate(zip(sorted_masks, ious[order])):
+            iou = round(float(iou), 2)
+            mask = (pred_masks_pos > 0.0).cpu().numpy()
+            skip = iou < self.thresh or any(
+                np.logical_and(mask, m).sum() / np.logical_or(mask, m).sum() > 0.95 for m in mask_ls
+            )
+            if skip and not (i == last and not mask_ls):
+                continue
+            mask_ls.append(mask)
+            areas.append(mask.sum())
+            mask, _ = self.remove_small_regions(mask, int(self.hole_scale), mode="holes")
+            mask, _ = self.remove_small_regions(mask, int(self.island_scale), mode="islands")
+            # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
+            mask = mask.astype(float)
+            visual = Visualizer(image_ori, metadata=metadata)
+            demo = visual.draw_binary_mask(mask, color=[0., 0., 1.0], text=f'{iou}')
+            res = demo.get_image()
+            # Mark the prompt with a small red square, clipped to the image bounds.
+            px, py = int(self.point[0, 0]), int(self.point[0, 1])
+            x0, x1 = max(0, px - 3), min(image_ori.shape[1], px + 3)
+            y0, y1 = max(0, py - 3), min(image_ori.shape[0], py + 3)
+            res[y0:y1, x0:x1] = (255, 0, 0)
+            reses.append(Image.fromarray(res))
+        by_area = [int(i) for i in torch.argsort(torch.tensor(areas), descending=False)]
+
+        torch.cuda.empty_cache()
+
+        return reses, [reses[i] for i in by_area]
+
+    def predict_masks(self, image_ori, image, point=None):
+        """Run predict() and render its output with process_multi_mask()."""
+        masks, ious = self.predict(image_ori, image, point)
+        return self.process_multi_mask(masks, ious, image_ori)
+
+    @staticmethod
+    def remove_small_regions(
+            mask: np.ndarray, area_thresh: float, mode: str
+    ) -> Tuple[np.ndarray, bool]:
+        """
+        Removes small disconnected regions and holes in a mask. Returns the
+        mask and an indicator of if the mask has been modified.
+        """
+        import cv2  # type: ignore
+
+        assert mode in ["holes", "islands"]
+        correct_holes = mode == "holes"
+        working_mask = (correct_holes ^ mask).astype(np.uint8)
+        n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+        sizes = stats[:, -1][1:]  # Row 0 is background label
+        small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+        if len(small_regions) == 0:
+            return mask, False
+        fill_labels = [0] + small_regions
+        if not correct_holes:
+            fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+            # If every region is below threshold, keep largest
+            if len(fill_labels) == 0:
+                fill_labels = [int(np.argmax(sizes)) + 1]
+        mask = np.isin(regions, fill_labels)
+        return mask, True
diff --git a/task_adapter/utils/visualizer.py b/task_adapter/utils/visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd78a98e3f862f8b74d921c4b80be3a8d2f0dd2b
--- /dev/null
+++ b/task_adapter/utils/visualizer.py
@@ -0,0 +1,1405 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import colorsys
+import logging
+import math
+import numpy as np
+from enum import Enum, unique
+import cv2
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import pycocotools.mask as mask_util
+import torch
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from PIL import Image
+
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
+from detectron2.utils.file_io import PathManager
+
+from detectron2.utils.colormap import random_color
+import random
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+
+
+_SMALL_OBJECT_AREA_THRESH = 1000  # presumably a pixel-area cutoff for "small" objects -- usage not visible here
+_LARGE_MASK_AREA_THRESH = 120000  # presumably a pixel-area cutoff for "large" masks -- usage not visible here
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)  # color tuples look like RGB fractions in [0, 1] -- TODO confirm
+_BLACK = (0, 0, 0)
+_RED = (1.0, 0, 0)
+
+_KEYPOINT_THRESHOLD = 0.05  # initial value of Visualizer.keypoint_threshold
+
+
+@unique
+class ColorMode(Enum):
+    """
+    Color schemes available when drawing instance visualizations.
+
+    The ``@unique`` decorator guarantees that no two members share a value.
+    Used as the ``instance_mode`` argument of ``Visualizer``.
+    """
+
+    # Picks a random color for every instance and overlays the segmentations
+    # with low opacity.
+    IMAGE = 0
+
+    # Lets instances of the same category have similar colors (taken from
+    # metadata.thing_colors) and overlays them with high opacity, which puts
+    # more attention on the quality of the segmentation.
+    SEGMENTATION = 1
+
+    # Same as IMAGE, but all areas without masks are converted to gray-scale.
+    # Only available for drawing per-instance mask predictions.
+    IMAGE_BW = 2
+
+
+class GenericMask:
+    """
+    One mask readable as a binary array or as polygons (converted lazily).
+
+    Attribute:
+        polygons (list[ndarray]): each polygon flattened as [x, y, x, y, ...]
+        mask (ndarray): a binary mask
+    """
+
+    def __init__(self, mask_or_polygons, height, width):
+        """Accept an RLE dict, a list of polygons, or a (height, width) ndarray."""
+        self._mask = self._polygons = self._has_holes = None
+        self.height = height
+        self.width = width
+
+        source = mask_or_polygons
+        if isinstance(source, dict):
+            # RLEs
+            assert "counts" in source and "size" in source
+            if isinstance(source["counts"], list):  # uncompressed RLEs
+                rle_h, rle_w = source["size"]
+                assert rle_h == height and rle_w == width
+                source = mask_util.frPyObjects(source, rle_h, rle_w)
+            self._mask = mask_util.decode(source)[:, :]
+        elif isinstance(source, list):  # list[ndarray]
+            self._polygons = [np.asarray(poly).reshape(-1) for poly in source]
+        elif isinstance(source, np.ndarray):  # assumed to be a binary mask
+            assert source.shape[1] != 2, source.shape
+            assert source.shape == (
+                height,
+                width,
+            ), f"mask shape: {source.shape}, target dims: {height}, {width}"
+            self._mask = source.astype("uint8")
+        else:
+            raise ValueError("GenericMask cannot handle object {} of type '{}'".format(source, type(source)))
+
+    @property
+    def mask(self):
+        """Binary mask; rasterized from the polygons on first access when not given."""
+        if self._mask is None:
+            self._mask = self.polygons_to_mask(self._polygons)
+        return self._mask
+
+    @property
+    def polygons(self):
+        """Polygons; traced from the mask on first access when not given."""
+        if self._polygons is None:
+            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._polygons
+
+    @property
+    def has_holes(self):
+        """Whether the mask has holes; False when no mask is stored (polygon input)."""
+        if self._has_holes is not None:
+            return self._has_holes
+        if self._mask is None:
+            self._has_holes = False  # if original format is polygon, does not have holes
+        else:
+            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._has_holes
+
+    def mask_to_polygons(self, mask):
+        """Trace ``mask`` with OpenCV and return ``(polygons, has_holes)``."""
+        # cv2.RETR_CCOMP arranges contours in a 2-level hierarchy: external boundaries in
+        # level 1, internal contours (holes) in level 2. cv2.CHAIN_APPROX_NONE keeps every
+        # contour vertex.
+        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
+        found = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+        contours, hierarchy = found[-2], found[-1]
+        if hierarchy is None:  # empty mask
+            return [], False
+        # Column 3 is the parent index; a non-negative value marks an inner contour.
+        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
+        flat = [contour.flatten() for contour in contours]
+        # OpenCV coordinates are integers in [0, W-1 or H-1]; adding 0.5 moves them into
+        # real-valued coordinate space.
+        return [coords + 0.5 for coords in flat if len(coords) >= 6], has_holes
+
+    def polygons_to_mask(self, polygons):
+        """Rasterize ``polygons`` into one mask using the stored height and width."""
+        merged = mask_util.merge(mask_util.frPyObjects(polygons, self.height, self.width))
+        return mask_util.decode(merged)[:, :]
+
+    def area(self):
+        """Sum of the binary mask (the number of set pixels for a 0/1 mask)."""
+        return self.mask.sum()
+
+    def bbox(self):
+        """Box around the polygons, converted from toBbox's output to [x0, y0, x1, y1]."""
+        rle = mask_util.merge(mask_util.frPyObjects(self.polygons, self.height, self.width))
+        box = mask_util.toBbox(rle)
+        box[2] += box[0]
+        box[3] += box[1]
+        return box
+
+
+class _PanopticPrediction:
+    """
+    Unify different panoptic annotation/prediction formats: keeps the segmentation
+    tensor together with a ``segment id -> info dict`` table and yields per-segment
+    boolean masks, largest segment first.
+    """
+
+    def __init__(self, panoptic_seg, segments_info, metadata=None):
+        """
+        Args:
+            panoptic_seg (Tensor): segment id of every pixel.
+            segments_info (list[dict] or None): dicts with "id", "category_id" and
+                "isthing"; when None they are derived from the ids using metadata.
+            metadata: supplies label_divisor and thing_dataset_id_to_contiguous_id.
+        """
+        if segments_info is None:
+            assert metadata is not None
+            # panoptic_seg is assumed to be an H*W int32 image of panoptic ids encoded as
+            # category_id * label_divisor + instance_id, with -1 reserved for VOID.
+            label_divisor = metadata.label_divisor
+            thing_ids = metadata.thing_dataset_id_to_contiguous_id.values()
+            segments_info = []
+            for panoptic_label in np.unique(panoptic_seg.numpy()):
+                if panoptic_label == -1:  # VOID region
+                    continue
+                pred_class = panoptic_label // label_divisor
+                info = {"id": int(panoptic_label), "category_id": int(pred_class)}
+                info["isthing"] = bool(pred_class in thing_ids)
+                segments_info.append(info)
+
+        self._seg = panoptic_seg
+        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
+        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
+        areas = areas.numpy()
+        sorted_idxs = np.argsort(-areas)  # largest area first
+        self._seg_ids = segment_ids[sorted_idxs].tolist()
+        self._seg_areas = areas[sorted_idxs]
+        for sid, area in zip(self._seg_ids, self._seg_areas):
+            if sid in self._sinfo:
+                self._sinfo[sid]["area"] = float(area)
+
+    def non_empty_mask(self):
+        """
+        Returns:
+            (H, W) array, a mask for all pixels that have a prediction
+        """
+        empty_ids = [sid for sid in self._seg_ids if sid not in self._sinfo]
+        if len(empty_ids) == 0:
+            return np.zeros(self._seg.shape, dtype=np.uint8)
+        assert (
+            len(empty_ids) == 1
+        ), ">1 ids corresponds to no labels. This is currently not supported"
+        # The ``np.bool`` alias was removed in NumPy 1.24; builtin ``bool`` is equivalent.
+        return (self._seg != empty_ids[0]).numpy().astype(bool)
+
+    def semantic_masks(self):
+        """Yield ``(bool mask, info)`` for each labelled segment whose "isthing" is falsy."""
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or sinfo["isthing"]:
+                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
+                continue
+            yield (self._seg == sid).numpy().astype(bool), sinfo
+
+    def instance_masks(self):
+        """Yield ``(bool mask, info)`` for each non-empty segment whose "isthing" is truthy."""
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or not sinfo["isthing"]:
+                continue
+            mask = (self._seg == sid).numpy().astype(bool)
+            if mask.sum() > 0:
+                yield mask, sinfo
+
+
+def _create_text_labels(classes, scores, class_names, is_crowd=None):
+    """
+    Build one display string per instance from whichever inputs are given.
+
+    Args:
+        classes (list[int] or None): ids, looked up in class_names when it is non-empty
+        scores (list[float] or None): rendered as whole percentages
+        class_names (list[str] or None):
+        is_crowd (list[bool] or None): truthy entries append "|crowd"
+
+    Returns:
+        list[str] or None
+    """
+    if classes is None:
+        labels = None
+    elif class_names is not None and len(class_names) > 0:
+        labels = [class_names[idx] for idx in classes]
+    else:
+        labels = [str(idx) for idx in classes]
+    if scores is not None:
+        percents = ["{:.0f}%".format(score * 100) for score in scores]
+        labels = percents if labels is None else ["{} {}".format(n, p) for n, p in zip(labels, percents)]
+    if labels is not None and is_crowd is not None:
+        labels = [text + "|crowd" if crowd else text for text, crowd in zip(labels, is_crowd)]
+    return labels
+
+
+class VisImage:
+    """An image shown on a frameless matplotlib figure that can be rendered back to RGB."""
+
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
+            scale (float): factor applied to the figure size relative to the image.
+        """
+        self.img = img
+        self.scale = scale
+        self.height, self.width = img.shape[0], img.shape[1]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Create the figure, its Agg canvas and an axes covering it, then show ``img``.
+
+        Stores ``self.dpi``, ``self.canvas``, ``self.fig`` and ``self.ax``; returns nothing.
+
+        Args:
+            Same as in :meth:`__init__()`.
+        """
+        figure = mplfigure.Figure(frameon=False)
+        self.dpi = figure.get_dpi()
+        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        width_in = (self.width * self.scale + 1e-2) / self.dpi
+        height_in = (self.height * self.scale + 1e-2) / self.dpi
+        figure.set_size_inches(width_in, height_in)
+        self.canvas = FigureCanvasAgg(figure)
+        axes = figure.add_axes([0.0, 0.0, 1.0, 1.0])
+        axes.axis("off")
+        self.fig = figure
+        self.ax = axes
+        self.reset_image(img)
+
+    def reset_image(self, img):
+        """
+        Show ``img`` (cast to uint8) on the axes.
+
+        Args:
+            img: same as in __init__
+        """
+        pixels = img.astype("uint8")
+        bounds = (0, self.width, self.height, 0)
+        self.ax.imshow(pixels, extent=bounds, interpolation="nearest")
+
+    def save(self, filepath):
+        """
+        Save the figure with ``Figure.savefig``.
+
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        self.fig.savefig(filepath)
+
+    def get_image(self):
+        """
+        Render the canvas to an RGBA buffer and return it without the alpha channel.
+
+        Returns:
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        # print_to_buffer() gives the raw bytes plus the canvas size in pixels.
+        raw, (width, height) = self.canvas.print_to_buffer()
+        rgba = np.frombuffer(raw, dtype="uint8").reshape(height, width, 4)
+        rgb, _alpha = np.split(rgba, [3], axis=2)
+        return rgb.astype("uint8")
+
+
+class Visualizer:
+    """
+    Visualizer that draws data about detection/segmentation on images.
+
+    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
+    that draw primitive objects to images, as well as high-level wrappers like
+    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
+    that draw composite data in some pre-defined style.
+
+    Note that the exact visualization style for the high-level wrappers are subject to change.
+    Style such as color, opacity, label contents, visibility of labels, or even the visibility
+    of objects themselves (e.g. when the object is too small) may change according
+    to different heuristics, as long as the results still look visually reasonable.
+
+    To obtain a consistent style, you can implement custom drawing functions with the
+    abovementioned primitive methods instead. If you need more customized visualization
+    styles, you can process the data yourself following their format documented in
+    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
+    intend to satisfy everyone's preference on drawing styles.
+
+    This visualizer focuses on high rendering quality rather than performance. It is not
+    designed to be used for real-time applications.
+    """
+
+    # TODO implement a fast, rasterized version using OpenCV
+
+    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255].
+            metadata (Metadata): dataset metadata (e.g. class names and colors)
+            instance_mode (ColorMode): defines one of the pre-defined style for drawing
+                instances on an image.
+        """
+        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        if metadata is None:
+            metadata = MetadataCatalog.get("__nonexist__")
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
+
+        # too small texts are useless, therefore clamp to 9
+        self._default_font_size = max(
+            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
+        )
+        self._default_font_size = 18
+        self._instance_mode = instance_mode
+        self.keypoint_threshold = _KEYPOINT_THRESHOLD
+
+        import matplotlib.colors as mcolors
+        css4_colors = mcolors.CSS4_COLORS
+        self.color_proposals = [list(mcolors.hex2color(color)) for color in css4_colors.values()]
+
+    def draw_instance_predictions(self, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+
+        keep = (scores > 0.5).cpu()
+        boxes = boxes[keep]
+        scores = scores[keep]
+        classes = np.array(classes)
+        classes = classes[np.array(keep)]
+        labels = np.array(labels)
+        labels = labels[np.array(keep)]
+
+        if predictions.has("pred_masks"):
+            masks = np.asarray(predictions.pred_masks)
+            masks = masks[np.array(keep)]
+            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
+        else:
+            masks = None
+
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+        # if self.metadata.get("thing_colors"):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
+            ]
+            alpha = 0.4
+        else:
+            colors = None
+            alpha = 0.4
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(
+                self._create_grayscale_image(
+                    (predictions.pred_masks.any(dim=0) > 0).numpy()
+                    if predictions.has("pred_masks")
+                    else None
+                )
+            )
+            alpha = 0.3
+        
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+
+    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.7):
+        """
+        Draw semantic segmentation predictions/labels.
+
+        Args:
+            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
+                Each value is the integer label of the pixel.
+            area_threshold (int): segments with less than `area_threshold` are not drawn.
+            alpha (float): the larger it is, the more opaque the segmentations are.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        if isinstance(sem_seg, torch.Tensor):
+            sem_seg = sem_seg.numpy()
+        labels, areas = np.unique(sem_seg, return_counts=True)
+        sorted_idxs = np.argsort(-areas).tolist()
+        labels = labels[sorted_idxs]
+        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
+            except (AttributeError, IndexError):
+                mask_color = None
+
+            binary_mask = (sem_seg == label).astype(np.uint8)
+            text = self.metadata.stuff_classes[label]
+            self.draw_binary_mask(
+                binary_mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+        return self.output
+
+    def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
+        """
+        Draw panoptic prediction annotations or results.
+
+        Args:
+            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
+                segment.
+            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
+                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
+                If None, category id of each pixel is computed by
+                ``pixel // metadata.label_divisor``.
+            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            text = self.metadata.stuff_classes[category_idx].replace('-other','').replace('-merged','')
+            self.draw_binary_mask(
+                mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        # draw mask for all instances second
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return self.output
+        masks, sinfo = list(zip(*all_instances))
+        category_ids = [x["category_id"] for x in sinfo]
+
+        try:
+            scores = [x["score"] for x in sinfo]
+        except KeyError:
+            scores = None
+        class_names = [name.replace('-other','').replace('-merged','') for name in self.metadata.thing_classes]
+        labels = _create_text_labels(
+            category_ids, scores, class_names, [x.get("iscrowd", 0) for x in sinfo]
+        )
+
+        try:
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
+            ]
+        except AttributeError:
+            colors = None
+        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
+
+        return self.output
+
+    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
+
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentaions in Detectron2 Dataset format.
+
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            if "segmentation" in annos[0]:
+                masks = [x["segmentation"] for x in annos]
+            else:
+                masks = None
+            if "keypoints" in annos[0]:
+                keypts = [x["keypoints"] for x in annos]
+                keypts = np.array(keypts).reshape(len(annos), -1, 3)
+            else:
+                keypts = None
+
+            boxes = [
+                BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
+                if len(x["bbox"]) == 4
+                else x["bbox"]
+                for x in annos
+            ]
+
+            colors = None
+            category_ids = [x["category_id"] for x in annos]
+            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+                colors = [
+                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                    for c in category_ids
+                ]
+            names = self.metadata.get("thing_classes", None)
+            labels = _create_text_labels(
+                category_ids,
+                scores=None,
+                class_names=names,
+                is_crowd=[x.get("iscrowd", 0) for x in annos],
+            )
+            self.overlay_instances(
+                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
+            )
+
+        sem_seg = dic.get("sem_seg", None)
+        if sem_seg is None and "sem_seg_file_name" in dic:
+            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
+                sem_seg = Image.open(f)
+                sem_seg = np.asarray(sem_seg, dtype="uint8")
+        if sem_seg is not None:
+            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.4)
+
+        pan_seg = dic.get("pan_seg", None)
+        if pan_seg is None and "pan_seg_file_name" in dic:
+            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
+                pan_seg = Image.open(f)
+                pan_seg = np.asarray(pan_seg)
+                from panopticapi.utils import rgb2id
+
+                pan_seg = rgb2id(pan_seg)
+        if pan_seg is not None:
+            segments_info = dic["segments_info"]
+            pan_seg = torch.tensor(pan_seg)
+            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.7)
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        masks=None,
+        keypoints=None,
+        assigned_colors=None,
+        alpha=0.5,
+    ):
+        """
+        Draw boxes, masks, labels and keypoints of a set of instances on the image.
+
+        Instances are drawn from largest to smallest area. When `boxes` has 5 columns
+        the call is delegated to :meth:`overlay_rotated_instances`, which only
+        receives the boxes, labels and colors.
+
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            masks (masks-like object): Supported types are:
+
+                * :class:`detectron2.structures.PolygonMasks`,
+                  :class:`detectron2.structures.BitMasks`.
+                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
+                  The first level of the list corresponds to individual instances. The second
+                  level to all the polygon that compose the instance, and the third level
+                  to the polygon coordinates. The third level should have the format of
+                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
+                * list[dict]: each dict is a COCO-style RLE.
+            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
+                where the N is the number of instances and K is the number of keypoints.
+                The last dimension corresponds to (x, y, visibility or score).
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+                If None, a random color is generated for every instance.
+            alpha (float): opacity passed to `draw_polygon` when drawing mask polygons.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        # The instance count comes from the first of boxes / masks / keypoints that is
+        # given; the remaining inputs must agree with it.
+        num_instances = 0
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if masks is not None:
+            masks = self._convert_masks(masks)
+            if num_instances:
+                assert len(masks) == num_instances
+            else:
+                num_instances = len(masks)
+        if keypoints is not None:
+            if num_instances:
+                assert len(keypoints) == num_instances
+            else:
+                num_instances = len(keypoints)
+            keypoints = self._convert_keypoints(keypoints)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        # Five columns means rotated boxes, which have their own drawing routine.
+        if boxes is not None and boxes.shape[1] == 5:
+            return self.overlay_rotated_instances(
+                boxes=boxes, labels=labels, assigned_colors=assigned_colors
+            )
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        elif masks is not None:
+            areas = np.asarray([x.area() for x in masks])
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if masks is not None:
+                for segment in masks[i].polygons:
+                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
+
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                elif masks is not None:
+                    # skip small mask without polygon
+                    if len(masks[i].polygons) == 0:
+                        continue
+
+                    x0, y0, x1, y1 = masks[i].bbox()
+
+                    # draw text in the center (defined by median) when box is not drawn
+                    # median is less sensitive to outliers.
+                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
+                    horiz_align = "center"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    # Within 5 px of the image bottom the text moves to the (x1, y0)
+                    # corner; otherwise it is placed at (x0, y1).
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+
+                # Font size follows the instance height relative to the image size:
+                # between 0.6x and 1.0x of the default font size.
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+
+        # draw keypoints
+        if keypoints is not None:
+            for keypoints_per_instance in keypoints:
+                self.draw_and_connect_keypoints(keypoints_per_instance)
+
+        return self.output
+
+    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
+        """
+        Args:
+            boxes (ndarray): an Nx5 numpy array of
+                (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image.
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = len(boxes)
+
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+
+        # Display in largest to smallest order to reduce occlusion.
+        if boxes is not None:
+            areas = boxes[:, 2] * boxes[:, 3]
+
+        sorted_idxs = np.argsort(-areas).tolist()
+        # Re-order overlapped instances in descending order.
+        boxes = boxes[sorted_idxs]
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            self.draw_rotated_box_with_label(
+                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
+            )
+
+        return self.output
+
+    def draw_and_connect_keypoints(self, keypoints):
+        """
+        Draws keypoints of an instance and follows the rules for keypoint connections
+        to draw lines between appropriate keypoints. This follows color heuristics for
+        line color.
+
+        Args:
+            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
+                and the last dimension corresponds to (x, y, probability).
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        visible = {}
+        keypoint_names = self.metadata.get("keypoint_names")
+        for idx, keypoint in enumerate(keypoints):
+
+            # draw keypoint
+            x, y, prob = keypoint
+            if prob > self.keypoint_threshold:
+                self.draw_circle((x, y), color=_RED)
+                if keypoint_names:
+                    keypoint_name = keypoint_names[idx]
+                    visible[keypoint_name] = (x, y)
+
+        if self.metadata.get("keypoint_connection_rules"):
+            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
+                if kp0 in visible and kp1 in visible:
+                    x0, y0 = visible[kp0]
+                    x1, y1 = visible[kp1]
+                    color = tuple(x / 255.0 for x in color)
+                    self.draw_line([x0, x1], [y0, y1], color=color)
+
+        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
+        # Note that this strategy is specific to person keypoints.
+        # For other keypoints, it should just do nothing
+        try:
+            ls_x, ls_y = visible["left_shoulder"]
+            rs_x, rs_y = visible["right_shoulder"]
+            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
+        except KeyError:
+            pass
+        else:
+            # draw line from nose to mid-shoulder
+            nose_x, nose_y = visible.get("nose", (None, None))
+            if nose_x is not None:
+                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
+
+            try:
+                # draw line from mid-shoulder to mid-hip
+                lh_x, lh_y = visible["left_hip"]
+                rh_x, rh_y = visible["right_hip"]
+            except KeyError:
+                pass
+            else:
+                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
+                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0,
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided, a font size
+                proportional to the image width is calculated and used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        if not font_size:
+            font_size = self._default_font_size
+
+        # since the text background is dark, we don't want the text to be dark
+        color = np.maximum(list(mplc.to_rgb(color)), 0.15)
+        color[np.argmax(color)] = max(0.8, np.max(color))
+
+        def contrasting_color(rgb):
+            """Returns 'white' or 'black' depending on which color contrasts more with the given RGB value."""
+            
+            # Decompose the RGB tuple
+            R, G, B = rgb
+
+            # Calculate the Y value
+            Y = 0.299 * R + 0.587 * G + 0.114 * B
+
+            # If Y value is greater than 128, it's closer to white so return black. Otherwise, return white.
+            return 'black' if Y > 128 else 'white'
+
+        bbox_background = contrasting_color(color*255)
+
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": bbox_background, "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=color,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the image's top left corner. x1 and y1 are the
+                coordinates of the image's bottom right corner.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+        width = x1 - x0
+        height = y1 - y0
+
+        linewidth = max(self._default_font_size / 12, 1)
+
+        self.output.ax.add_patch(
+            mpl.patches.Rectangle(
+                (x0, y0),
+                width,
+                height,
+                fill=False,
+                edgecolor=edge_color,
+                linewidth=linewidth * self.output.scale,
+                alpha=alpha,
+                linestyle=line_style,
+            )
+        )
+        return self.output
+
+    def draw_rotated_box_with_label(
+        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
+    ):
+        """
+        Draw a rotated box with label on its top-left corner.
+
+        Args:
+            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
+                where cnt_x and cnt_y are the center coordinates of the box.
+                w and h are the width and height of the box. angle represents how
+                many degrees the box is rotated CCW with regard to the 0-degree box.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+            label (string): label for rotated box. It will not be rendered when set to None.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        cnt_x, cnt_y, w, h, angle = rotated_box
+        area = w * h
+        # use thinner lines when the box is small
+        linewidth = self._default_font_size / (
+            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
+        )
+
+        theta = angle * math.pi / 180.0
+        c = math.cos(theta)
+        s = math.sin(theta)
+        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
+        # x: left->right ; y: top->down
+        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
+        for k in range(4):
+            j = (k + 1) % 4
+            self.draw_line(
+                [rotated_rect[k][0], rotated_rect[j][0]],
+                [rotated_rect[k][1], rotated_rect[j][1]],
+                color=edge_color,
+                linestyle="--" if k == 1 else line_style,
+                linewidth=linewidth,
+            )
+
+        if label is not None:
+            text_pos = rotated_rect[1]  # topleft corner
+
+            height_ratio = h / np.sqrt(self.output.height * self.output.width)
+            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
+            font_size = (
+                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
+            )
+            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
+
+        return self.output
+
+    def draw_circle(self, circle_coord, color, radius=3):
+        """
+        Args:
+            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
+                of the center of the circle.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            radius (int): radius of the circle.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x, y = circle_coord
+        self.output.ax.add_patch(
+            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
+        )
+        return self.output
+
+    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
+        """
+        Args:
+            x_data (list[int]): a list containing x values of all the points being drawn.
+                Length of list should match the length of y_data.
+            y_data (list[int]): a list containing y values of all the points being drawn.
+                Length of list should match the length of x_data.
+            color: color of the line. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
+                for a full list of formats that are accepted.
+            linewidth (float or None): width of the line. When it's None,
+                a default value will be computed and used.
+
+        Returns:
+            output (VisImage): image object with line drawn.
+        """
+        if linewidth is None:
+            linewidth = self._default_font_size / 3
+        linewidth = max(linewidth, 1)
+        self.output.ax.add_line(
+            mpl.lines.Line2D(
+                x_data,
+                y_data,
+                linewidth=linewidth * self.output.scale,
+                color=color,
+                linestyle=linestyle,
+            )
+        )
+        return self.output
+
+    def draw_binary_mask(
+        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.7, area_threshold=10
+    ):
+        """
+        Overlay a binary mask on the image and optionally label it with text.
+
+        A mask without holes is drawn as filled polygons (one per entry of the
+        mask's polygon list); a mask with holes is drawn as one RGBA raster.
+
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. It is cast to uint8 before use.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): if not None, will be drawn on the object, provided that at
+                least one part of the mask was actually drawn.
+            alpha (float): blending coefficient. Smaller values lead to more transparent masks.
+            area_threshold (float): a polygon smaller than this area will not be shown.
+                None is treated as 0. Only applies to masks without holes.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        fill_rgb = mplc.to_rgb(random_color(rgb=True, maximum=1) if color is None else color)
+
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        generic_mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        rows, cols = binary_mask.shape[0], binary_mask.shape[1]
+        min_area = area_threshold or 0
+
+        if generic_mask.has_holes:
+            # TODO: Use Path/PathPatch to draw vector graphics:
+            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
+            overlay = np.zeros((rows, cols, 4), dtype="float32")
+            overlay[:, :, :3] = fill_rgb
+            overlay[:, :, 3] = (generic_mask.mask == 1).astype("float32") * alpha
+            drew_something = True
+            self.output.ax.imshow(overlay, extent=(0, self.output.width, self.output.height, 0))
+        else:
+            # regular masks are drawn polygon by polygon
+            drew_something = False
+            for poly in generic_mask.polygons:
+                encoded = mask_util.frPyObjects([poly], rows, cols)
+                if mask_util.area(encoded) < min_area:
+                    continue
+                drew_something = True
+                self.draw_polygon(
+                    poly.reshape(-1, 2), color=fill_rgb, edge_color=edge_color, alpha=alpha
+                )
+
+        if drew_something and text is not None:
+            label_color = self._change_color_brightness(fill_rgb, brightness_factor=0.7)
+            self._draw_text_in_mask(binary_mask, text, label_color)
+        return self.output
+    
+    def draw_binary_mask_with_number(
+        self,
+        binary_mask,
+        color=None,
+        *,
+        edge_color=None,
+        text=None,
+        label_mode='1',
+        alpha=0.1,
+        anno_mode=None,
+        area_threshold=10,
+    ):
+        """
+        Draw a binary mask together with an optional bounding box and an optional
+        numeric/alphabetic mark placed inside the mask.
+
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. It is cast to uint8 before use.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, a random entry of
+                `self.color_proposals` is used.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): the mark to draw. It is drawn only when it is not None and
+                'Mark' is in `anno_mode`.
+            label_mode (str): forwarded to `_draw_number_in_mask` together with `text`.
+            alpha (float): blending coefficient. Smaller values lead to more transparent masks.
+            anno_mode: container of the annotation kinds to draw. Recognized entries are
+                'Mask' (filled mask), 'Box' (bounding box) and 'Mark' (the text mark).
+                Defaults to ['Mask'] when None.
+            area_threshold (float): a polygon smaller than this area will not be shown.
+                None is treated as 0. Only applies to masks without holes.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if anno_mode is None:
+            # avoid a shared mutable default argument
+            anno_mode = ['Mask']
+
+        if color is None:
+            color = self.color_proposals[random.randrange(len(self.color_proposals))]
+        color = mplc.to_rgb(color)
+
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
+
+        if 'Mask' in anno_mode:
+            if not mask.has_holes:
+                # draw polygons for regular masks
+                min_area = area_threshold or 0
+                for segment in mask.polygons:
+                    area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
+                    if area < min_area:
+                        continue
+                    segment = segment.reshape(-1, 2)
+                    self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
+            else:
+                # TODO: Use Path/PathPatch to draw vector graphics:
+                # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
+                rgba = np.zeros(shape2d + (4,), dtype="float32")
+                rgba[:, :, :3] = color
+                rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
+                self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
+
+        if 'Box' in anno_mode:
+            # the bounding box is only computed when it is actually drawn
+            self.draw_box(mask.bbox(), edge_color=color, alpha=0.75)
+
+        # Whether the mark is drawn depends only on `anno_mode`, not on which mask
+        # segments passed the area threshold above.
+        if text is not None and 'Mark' in anno_mode:
+            mark_color = [1, 1, 1]  # marks are always drawn in white
+            self._draw_number_in_mask(binary_mask, text, mark_color, label_mode)
+        return self.output
+
+    def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5):
+        """
+        Overlay a soft (per-pixel weighted) mask on the image.
+
+        Args:
+            soft_mask (ndarray): float array of shape (H, W), each value in [0, 1].
+                The per-pixel opacity is `soft_mask * alpha`.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            text (str): if not None, will be drawn on the object. The text position is
+                found on the mask thresholded at 0.5.
+            alpha (float): blending coefficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        rgb = mplc.to_rgb(random_color(rgb=True, maximum=1) if color is None else color)
+
+        rows, cols = soft_mask.shape[0], soft_mask.shape[1]
+        overlay = np.zeros((rows, cols, 4), dtype="float32")
+        overlay[:, :, :3] = rgb
+        overlay[:, :, 3] = soft_mask * alpha
+        self.output.ax.imshow(overlay, extent=(0, self.output.width, self.output.height, 0))
+
+        if text is None:
+            return self.output
+
+        label_color = self._change_color_brightness(rgb, brightness_factor=0.7)
+        thresholded = (soft_mask > 0.5).astype("uint8")
+        self._draw_text_in_mask(thresholded, text, label_color)
+        return self.output
+
+    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
+        """
+        Add a filled polygon patch to the output axes.
+
+        Args:
+            segment: numpy array of shape Nx2, containing all the points in the polygon.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted. If not provided, the polygon
+                color is used, darkened when `alpha` is above 0.8.
+            alpha (float): blending coefficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with polygon drawn.
+        """
+        if edge_color is None:
+            # a nearly opaque fill needs a darker outline to stay visible
+            edge_color = (
+                self._change_color_brightness(color, brightness_factor=-0.7)
+                if alpha > 0.8
+                else color
+            )
+        outline_rgba = mplc.to_rgb(edge_color) + (1,)
+        fill_rgba = mplc.to_rgb(color) + (alpha,)
+        stroke_width = max(self._default_font_size // 15 * self.output.scale, 1)
+
+        patch = mpl.patches.Polygon(
+            segment,
+            fill=True,
+            facecolor=fill_rgba,
+            edgecolor=outline_rgba,
+            linewidth=stroke_width,
+        )
+        self.output.ax.add_patch(patch)
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _jitter(self, color):
+        """
+        Return a randomly perturbed variant of the given color.
+
+        A random offset vector of length 0.5 is added to the RGB values and the
+        result is clipped to [0, 1].
+
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        base_rgb = mplc.to_rgb(color)
+        offset = np.random.rand(3)
+        # NOTE: jittering in another color space would likely give better results
+        offset = offset / np.linalg.norm(offset) * 0.5
+        return tuple(np.clip(offset + base_rgb, 0, 1))
+
+    def _create_grayscale_image(self, mask=None):
+        """
+        Build a 3-channel grayscale copy of `self.img` (float32, channel mean).
+        Where `mask` is given, the original colors are kept in the masked area.
+        """
+        channel_mean = self.img.astype("f4").mean(axis=2)
+        gray_rgb = np.stack((channel_mean, channel_mean, channel_mean), axis=2)
+        if mask is None:
+            return gray_rgb
+        gray_rgb[mask] = self.img[mask]
+        return gray_rgb
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Scale the HLS lightness of a color by `1 + brightness_factor`, keeping hue
+        and saturation, and clamp the resulting lightness to [0, 1].
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A factor of 0 means
+                no change, a factor in [-1.0, 0) gives a darker color and a factor in
+                (0, 1.0] gives a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert -1.0 <= brightness_factor <= 1.0
+        hue, lightness, saturation = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        scaled = lightness + (brightness_factor * lightness)
+        clamped = min(max(scaled, 0.0), 1.0)
+        return colorsys.hls_to_rgb(hue, clamped, saturation)
+
+    def _convert_boxes(self, boxes):
+        """
+        Convert boxes given in different formats to a numpy array.
+
+        `Boxes` / `RotatedBoxes` are unwrapped through their detached tensor; anything
+        else goes through `np.asarray`. The result is expected to be NxB with
+        B = 4 or 5 (not checked here).
+        """
+        if isinstance(boxes, (Boxes, RotatedBoxes)):
+            return boxes.tensor.detach().numpy()
+        return np.asarray(boxes)
+
+    def _convert_masks(self, masks_or_polygons):
+        """
+        Normalize masks or polygons given in different formats to GenericMask objects.
+
+        Returns:
+            list[GenericMask]: one entry per input item; items that already are
+                GenericMask instances are passed through unchanged.
+        """
+        items = masks_or_polygons
+        # unwrap the container types down to something iterable
+        if isinstance(items, PolygonMasks):
+            items = items.polygons
+        if isinstance(items, BitMasks):
+            items = items.tensor.numpy()
+        if isinstance(items, torch.Tensor):
+            items = items.numpy()
+
+        return [
+            item
+            if isinstance(item, GenericMask)
+            else GenericMask(item, self.output.height, self.output.width)
+            for item in items
+        ]
+
+    def _draw_number_in_mask(self, binary_mask, text, color, label_mode='1'):
+        """
+        Draw a mark at the point of the mask that lies farthest from the mask boundary.
+
+        Args:
+            binary_mask (ndarray): 2D mask; non-zero pixels are foreground. It is passed
+                to `cv2.distanceTransform`, so callers are expected to provide uint8.
+            text: the mark to draw. With `label_mode == 'a'` it must be convertible
+                with `int()`.
+            color: text color, forwarded to `draw_text`.
+            label_mode (str): 'a' converts the number to lowercase letters
+                (1 -> 'a', 26 -> 'z', 27 -> 'aa', ...); any other value draws `text` as is.
+
+        Returns:
+            None. Nothing is drawn when the mask has no foreground pixel.
+        """
+
+        def number_to_string(n):
+            # bijective base-26 conversion using the letters a-z
+            chars = []
+            while n:
+                n, remainder = divmod(n - 1, 26)
+                chars.append(chr(97 + remainder))
+            return ''.join(reversed(chars))
+
+        if not np.any(binary_mask):
+            # An empty mask has distance 0 everywhere, so every pixel would tie for
+            # the maximum and the mark would land at an arbitrary image position.
+            return
+
+        # Pad with background so that a mask touching the image border still has its
+        # distance measured from that border; the padding is cropped off afterwards.
+        padded = np.pad(binary_mask, ((1, 1), (1, 1)), 'constant')
+        mask_dt = cv2.distanceTransform(padded, cv2.DIST_L2, 0)
+        mask_dt = mask_dt[1:-1, 1:-1]
+        coords_y, coords_x = np.where(mask_dt == np.max(mask_dt))  # coords is [y, x]
+
+        if label_mode == 'a':
+            text = number_to_string(int(text))
+
+        # several pixels may tie for the maximum distance; use the middle one
+        pick = len(coords_x) // 2
+        self.draw_text(text, (coords_x[pick] + 2, coords_y[pick] - 6), color=color)
+    
+    def _draw_text_in_mask(self, binary_mask, text, color):
+        """
+        Draw `text` below the largest connected component of the mask, and below
+        every other component whose area exceeds `_LARGE_MASK_AREA_THRESH`.
+        """
+        # TODO sometimes drawn on wrong objects. the heuristics here can improve.
+        num_components, labels, stats, _centroids = cv2.connectedComponentsWithStats(
+            binary_mask, 8
+        )
+        areas = stats[1:, -1]  # label 0 is skipped
+        if areas.size == 0:
+            return
+        biggest = np.argmax(areas) + 1
+
+        for comp in range(1, num_components):
+            if comp != biggest and stats[comp, -1] <= _LARGE_MASK_AREA_THRESH:
+                continue
+            ys_xs = (labels == comp).nonzero()
+            # median is more stable than centroid; reversed to get (x, y)
+            anchor = np.median(ys_xs, axis=1)[::-1]
+            # place the text 2 pixels below the lowest row of the component
+            anchor[1] = ys_xs[0].max() + 2
+            self.draw_text(text, anchor, color=color)
+
+    def _convert_keypoints(self, keypoints):
+        """
+        Return the keypoints as a numpy array, unwrapping a `Keypoints` object
+        through its `tensor` attribute first.
+        """
+        raw = keypoints.tensor if isinstance(keypoints, Keypoints) else keypoints
+        return np.asarray(raw)
+
+    def get_output(self):
+        """
+        Accessor for the image being drawn on.
+
+        Returns:
+            output (VisImage): the image output containing the visualizations added
+            to the image.
+        """
+        rendered = self.output
+        return rendered
\ No newline at end of file